Skip to content

Commit

Permalink
Update the training notebook with the latest training updates
Browse files (browse the repository at this point in the history)
  • Loading branch information
jshuadvd committed Jul 15, 2024
1 parent 9847267 commit 324b811
Showing 1 changed file with 18 additions and 1 deletion.
19 changes: 18 additions & 1 deletion notebooks/01_LongRoPE_training.ipynb
Original file line number | Diff line number | Diff line change
Expand Up @@ -29,7 +29,6 @@
"import pickle\n",
"import GPUtil\n",
"\n",
"\n",
"from evaluation import evaluate_passkey_retrieval\n",
"\n",
"reload(src.main)"
Expand Down Expand Up @@ -59,6 +58,24 @@
"metadata": {},
"outputs": [],
"source": [
"# Streaming dataset for long sequences\n",
"class StreamingDataset(IterableDataset):\n",
" def __init__(self, dataset, tokenizer, max_length, overlap):\n",
" self.dataset = dataset\n",
" self.tokenizer = tokenizer\n",
" self.max_length = max_length\n",
" self.overlap = overlap\n",
"\n",
" def __iter__(self):\n",
" for item in self.dataset:\n",
" text = item[\"text\"]\n",
" sequences = preprocess_data(\n",
" text, self.tokenizer, self.max_length, self.overlap\n",
" )\n",
" for seq in sequences:\n",
" yield seq, seq[1:] + [self.tokenizer.eos_token_id]\n",
"\n",
"\n",
"class CustomDataset(Dataset):\n",
" \"\"\"Custom dataset for handling sequences and targets.\"\"\"\n",
"\n",
Expand Down

0 comments on commit 324b811

Please sign in to comment.