Implement a caching mechanism for tokenized sequences
jshuadvd committed Jul 11, 2024
1 parent 99c846b commit 939aa76
Showing 1 changed file with 19 additions and 0 deletions.
19 changes: 19 additions & 0 deletions train.py
@@ -17,6 +17,8 @@
import wandb
import os
import logging
import hashlib
import pickle

from evaluation import evaluate_passkey_retrieval

@@ -94,6 +96,23 @@ def validate_targets(targets, vocab_size):
return True


def cached_tokenize(text, tokenizer, cache_dir="tokenizer_cache"):
    """Tokenize `text` with `tokenizer`, caching the result on disk.

    The cache key is an MD5 hash of the raw text (it does not include the
    tokenizer), so the cache directory should be cleared if the tokenizer
    changes.
    """
    os.makedirs(cache_dir, exist_ok=True)
    text_hash = hashlib.md5(text.encode()).hexdigest()
    cache_file = os.path.join(cache_dir, f"{text_hash}.pkl")

    # Cache hit: load the previously tokenized sequence from disk.
    if os.path.exists(cache_file):
        with open(cache_file, "rb") as f:
            return pickle.load(f)

    # Cache miss: tokenize and persist the result for subsequent runs.
    tokenized = tokenizer.encode(text)

    with open(cache_file, "wb") as f:
        pickle.dump(tokenized, f)

    return tokenized


def preprocess_data(data, tokenizer, max_length, overlap):
"""
Preprocess the input data by tokenizing it in chunks and creating sliding window sequences.
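The rest of preprocess_data is collapsed in this diff, so the exact call site for the new helper is not shown. As a rough sketch only, assuming preprocess_data tokenizes each text chunk and builds sliding windows of max_length tokens with the given overlap, the cache could be wired in by swapping the direct tokenizer.encode call for cached_tokenize (the loop structure and variable names below are assumptions, not part of the commit):

def preprocess_data(data, tokenizer, max_length, overlap):
    # Sketch only: assumes `data` is an iterable of text chunks.
    sequences = []
    step = max_length - overlap
    for text in data:
        # Use the cached helper instead of calling tokenizer.encode directly,
        # so repeated texts are only encoded once across runs.
        tokens = cached_tokenize(text, tokenizer)
        # Sliding windows of `max_length` tokens; trailing windows may be shorter.
        for start in range(0, len(tokens), step):
            sequences.append(tokens[start : start + max_length])
    return sequences

Because the cache key is derived from the text alone, preprocessing with a different tokenizer should start from an empty cache_dir.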
