From 287957ed72f33bd7cdf306097da383bab0239150 Mon Sep 17 00:00:00 2001
From: Joshua David
Date: Wed, 10 Jul 2024 23:57:57 -0700
Subject: [PATCH] Implement a caching mechanism for tokenized sequences

---
 train.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/train.py b/train.py
index b13a05f..ba3176b 100644
--- a/train.py
+++ b/train.py
@@ -131,7 +131,8 @@ def preprocess_data(data, tokenizer, max_length, overlap):
         while start < len(data):
             end = start + max_length
             chunk = data[start:end]
-            tokenized_chunk = tokenizer.encode(chunk)
+            # tokenized_chunk = tokenizer.encode(chunk)
+            tokenized_chunk = cached_tokenize(chunk, tokenizer)
 
             # Create sliding window sequences from the tokenized chunk
             chunk_sequences = create_sliding_window_chunks(
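
Note: the cached_tokenize helper referenced by the patch is not included in the diff, so its implementation is not shown here. A minimal sketch of one way it could work, assuming chunks are plain strings and a simple module-level dict cache (the name _token_cache and the structure below are illustrative, not the author's actual code):

    # Hypothetical sketch of the caching helper; not part of the patch itself.
    _token_cache = {}

    def cached_tokenize(chunk, tokenizer):
        """Return tokenizer.encode(chunk), memoized on the chunk text.

        Assumes chunks are hashable strings and that the same tokenizer is
        used for every call, so the chunk alone can serve as the cache key.
        """
        if chunk not in _token_cache:
            _token_cache[chunk] = tokenizer.encode(chunk)
        return _token_cache[chunk]

Memoization like this only pays off when identical chunks recur across the corpus; for large or mostly unique datasets a bounded cache (e.g. functools.lru_cache with a maxsize) would keep memory in check.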