From 287957ed72f33bd7cdf306097da383bab0239150 Mon Sep 17 00:00:00 2001
From: Joshua David
Date: Wed, 10 Jul 2024 23:57:57 -0700
Subject: [PATCH] Implement a caching mechanism for tokenized sequences

---
 train.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/train.py b/train.py
index b13a05f..ba3176b 100644
--- a/train.py
+++ b/train.py
@@ -131,7 +131,8 @@ def preprocess_data(data, tokenizer, max_length, overlap):
         while start < len(data):
             end = start + max_length
             chunk = data[start:end]
-            tokenized_chunk = tokenizer.encode(chunk)
+            # tokenized_chunk = tokenizer.encode(chunk)
+            tokenized_chunk = cached_tokenize(chunk, tokenizer)
 
             # Create sliding window sequences from the tokenized chunk
             chunk_sequences = create_sliding_window_chunks(
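
Note: the cached_tokenize helper referenced by the patch is not included in the diff, so its implementation is not shown here. A minimal sketch of one way it could work, assuming chunks are plain strings and a simple module-level dict cache (the name _token_cache and the structure below are illustrative, not the author's actual code):

    # Hypothetical sketch of the caching helper; not part of the patch itself.
    _token_cache = {}

    def cached_tokenize(chunk, tokenizer):
        """Return tokenizer.encode(chunk), memoized on the chunk text.

        Assumes chunks are hashable strings and that the same tokenizer is
        used for every call, so the chunk alone can serve as the cache key.
        """
        if chunk not in _token_cache:
            _token_cache[chunk] = tokenizer.encode(chunk)
        return _token_cache[chunk]

Memoization like this only pays off when identical chunks recur across the corpus; for large or mostly unique datasets a bounded cache (e.g. functools.lru_cache with a maxsize) would keep memory in check.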