Implement a caching mechanism for tokenized sequences
jshuadvd committed Jul 11, 2024
1 parent 939aa76 commit 287957e
Showing 1 changed file with 2 additions and 1 deletion.
train.py: 3 changes (2 additions & 1 deletion)
@@ -131,7 +131,8 @@ def preprocess_data(data, tokenizer, max_length, overlap):
     while start < len(data):
         end = start + max_length
         chunk = data[start:end]
-        tokenized_chunk = tokenizer.encode(chunk)
+        # tokenized_chunk = tokenizer.encode(chunk)
+        tokenized_chunk = cached_tokenize(chunk, tokenizer)
 
         # Create sliding window sequences from the tokenized chunk
         chunk_sequences = create_sliding_window_chunks(
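Note: the definition of cached_tokenize lies outside the shown hunk. A minimal sketch of what such a helper might look like, assuming a module-level dict memoizing on the raw chunk text (the name _token_cache and this structure are assumptions, not the commit's actual code):

# Hypothetical sketch; the real cached_tokenize in train.py may differ.
_token_cache = {}  # assumed module-level cache: chunk text -> token IDs

def cached_tokenize(chunk, tokenizer):
    # Reuse a previously computed encoding when the same chunk text recurs.
    if chunk not in _token_cache:
        _token_cache[chunk] = tokenizer.encode(chunk)
    return _token_cache[chunk]

Caching along these lines pays off when identical chunk text is tokenized more than once (e.g., across epochs or repeated passages); functools.lru_cache on a text-only helper would be an alternative, since the tokenizer argument itself is typically not hashable.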
