Implement a caching mechanism for tokenized sequences
jshuadvd committed Jul 11, 2024
1 parent 99c846b commit 939aa76
Showing 1 changed file with 19 additions and 0 deletions.
19 changes: 19 additions & 0 deletions train.py
@@ -17,6 +17,8 @@
import wandb
import os
import logging
import hashlib
import pickle

from evaluation import evaluate_passkey_retrieval

@@ -94,6 +96,23 @@ def validate_targets(targets, vocab_size):
return True


def cached_tokenize(text, tokenizer, cache_dir="tokenizer_cache"):
    """Tokenize `text` with `tokenizer`, caching the result on disk.

    The cache key is an MD5 hash of the raw text (it does not include the
    tokenizer), so the cache directory should be cleared if the tokenizer
    changes.
    """
    os.makedirs(cache_dir, exist_ok=True)
    text_hash = hashlib.md5(text.encode()).hexdigest()
    cache_file = os.path.join(cache_dir, f"{text_hash}.pkl")

    # Cache hit: load the previously tokenized sequence from disk.
    if os.path.exists(cache_file):
        with open(cache_file, "rb") as f:
            return pickle.load(f)

    # Cache miss: tokenize and persist the result for subsequent runs.
    tokenized = tokenizer.encode(text)

    with open(cache_file, "wb") as f:
        pickle.dump(tokenized, f)

    return tokenized


def preprocess_data(data, tokenizer, max_length, overlap):
"""
Preprocess the input data by tokenizing it in chunks and creating sliding window sequences.
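The rest of preprocess_data is collapsed in this diff, so the exact call site for the new helper is not shown. As a rough sketch only, assuming preprocess_data tokenizes each text chunk and builds sliding windows of max_length tokens with the given overlap, the cache could be wired in by swapping the direct tokenizer.encode call for cached_tokenize (the loop structure and variable names below are assumptions, not part of the commit):

def preprocess_data(data, tokenizer, max_length, overlap):
    # Sketch only: assumes `data` is an iterable of text chunks.
    sequences = []
    step = max_length - overlap
    for text in data:
        # Use the cached helper instead of calling tokenizer.encode directly,
        # so repeated texts are only encoded once across runs.
        tokens = cached_tokenize(text, tokenizer)
        # Sliding windows of `max_length` tokens; trailing windows may be shorter.
        for start in range(0, len(tokens), step):
            sequences.append(tokens[start : start + max_length])
    return sequences

Because the cache key is derived from the text alone, preprocessing with a different tokenizer should start from an empty cache_dir.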
