diff --git a/nbs/examples/gpt2_training.ipynb b/nbs/examples/gpt2_training.ipynb index f124a92..afb5111 100644 --- a/nbs/examples/gpt2_training.ipynb +++ b/nbs/examples/gpt2_training.ipynb @@ -18,8 +18,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 1, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "import tidygrad as tg\n", @@ -34,8 +38,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 2, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "# ds = datasets.load_dataset(\"roneneldan/TinyStories\")" @@ -43,8 +51,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 3, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "n_vocab = 1024\n", @@ -56,8 +68,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 4, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "def gpt2_new(n_vocab, n_layers, n_heads, ndim):\n", @@ -113,8 +129,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 5, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "def gpt2_init(model):\n", @@ -133,8 +153,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 6, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [ { "data": { @@ -142,7 +166,7 @@ "28" ] }, - "execution_count": null, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -153,8 +177,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 7, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "import tidygrad.func as F" @@ -162,8 +190,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 8, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "def gpt2_transformer_block(model: tg.model.Model, x, n_heads, i):\n", @@ -252,8 +284,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 9, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "# res = gpt2(model, np.arange(256).reshape(2, -1), n_layers=n_layers, n_heads=n_heads)\n", @@ -262,8 +298,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 10, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "# from tidygrad.training import one_hot_encode_batch" @@ -271,8 +311,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 11, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "def one_hot_encode(batch, n_classes):\n", @@ -285,8 +329,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 12, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "def language_modeling_loss(model, input, target, n_layers, n_heads):\n", @@ -308,8 +356,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 13, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "# np.seterr(all=\"raise\")\n", @@ -321,8 +373,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 14, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "# with open(\"datasets/TinyStories/TinyStories.txt\", \"r\") as file:\n", @@ -331,8 +387,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 15, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "# Dataset:\n", @@ -372,8 +432,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 16, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "import math" @@ -381,8 +445,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 17, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "class TSDataLoader(DataLoader):\n", @@ -417,8 +485,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 18, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "from tidygrad.utils.data import DataLoaders" @@ -426,8 +498,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 19, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [ { "name": "stdout", @@ -448,14 +524,22 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 20, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "from tidygrad.training import Learner\n", @@ -466,8 +550,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 21, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "import tidygrad.tensor\n" @@ -475,8 +563,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 22, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "def loss_function(X, y):\n", @@ -501,8 +593,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 23, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "from tidygrad.training import DictLoggerCallback, ProgressBarCallback, Loss" @@ -510,8 +606,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 24, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [], "source": [ "class OneBatchCallback:\n", @@ -535,8 +635,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 25, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [ { "name": "stdout", @@ -569,38 +673,22 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "50" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" + "execution_count": 27, + "metadata": { + "vscode": { + "languageId": "python" } - ], - "source": [ - "len(dataloader)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, + }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "7bbaf7ca2a4448798396c8bf40c8d7e4", + "model_id": "7224b0db599b4af3ab9528274fcf853a", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Epoch: 0%| | 0/100 [00:00" ] @@ -650,19 +738,7 @@ "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.908, 6.953] μ=6.931 σ=0.009\n", - " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", - "post_epoch num tensors 325\n", - "layer 0\n", - "layer 1\n", - "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.859, 6.951] μ=6.925 σ=0.016\n", - " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", - "post_epoch num tensors 325\n", - "layer 0\n", - "layer 1\n", - "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.842, 6.959] μ=6.913 σ=0.029\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.907, 6.951] μ=6.931 σ=0.009\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n" ] @@ -682,212 +758,246 @@ "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.792, 6.958] μ=6.903 σ=0.036\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.857, 6.955] μ=6.923 σ=0.016\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.778, 6.972] μ=6.898 σ=0.048\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.834, 6.955] μ=6.911 σ=0.029\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.732, 7.001] μ=6.886 σ=0.058\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.784, 6.962] μ=6.901 σ=0.036\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.722, 6.983] μ=6.874 σ=0.065\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.764, 6.973] μ=6.893 σ=0.047\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.690, 6.980] μ=6.876 σ=0.072\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.735, 6.971] μ=6.885 σ=0.060\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.654, 6.980] μ=6.846 σ=0.089\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.704, 6.964] μ=6.869 σ=0.069\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.616, 7.003] μ=6.837 σ=0.088\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.670, 6.989] μ=6.873 σ=0.079\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.582, 7.011] μ=6.819 σ=0.113\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.633, 6.989] μ=6.845 σ=0.092\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.539, 7.025] μ=6.810 σ=0.116\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.598, 6.984] μ=6.832 σ=0.091\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.502, 7.034] μ=6.806 σ=0.140\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.557, 7.014] μ=6.817 σ=0.119\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.454, 7.036] μ=6.739 σ=0.152\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.515, 7.034] μ=6.804 σ=0.120\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.405, 7.077] μ=6.742 σ=0.180\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.474, 7.022] μ=6.801 σ=0.148\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.353, 7.075] μ=6.723 σ=0.188\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.432, 7.044] μ=6.734 σ=0.159\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.297, 7.135] μ=6.711 σ=0.210\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.379, 7.049] μ=6.733 σ=0.181\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.235, 7.099] μ=6.649 σ=0.229\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.327, 7.072] μ=6.711 σ=0.190\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.170, 7.114] μ=6.679 σ=0.255\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.265, 7.108] μ=6.701 σ=0.214\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.106, 7.183] μ=6.623 σ=0.291\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.203, 7.115] μ=6.640 σ=0.237\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[6.031, 7.202] μ=6.594 σ=0.319\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.140, 7.161] μ=6.672 σ=0.268\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[5.958, 7.233] μ=6.567 σ=0.339\n", + " v=array[64, 2, 1] f32 n=128 x∈[6.072, 7.183] μ=6.612 σ=0.295\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[5.881, 7.248] μ=6.526 σ=0.370\n", + " v=array[64, 2, 1] f32 n=128 x∈[5.994, 7.174] μ=6.578 σ=0.325\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[5.802, 7.284] μ=6.502 σ=0.424\n", + " v=array[64, 2, 1] f32 n=128 x∈[5.919, 7.219] μ=6.550 σ=0.345\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[5.719, 7.288] μ=6.453 σ=0.409\n", + " v=array[64, 2, 1] f32 n=128 x∈[5.842, 7.261] μ=6.522 σ=0.385\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[5.626, 7.339] μ=6.376 σ=0.449\n", + " v=array[64, 2, 1] f32 n=128 x∈[5.759, 7.284] μ=6.491 σ=0.430\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[5.531, 7.442] μ=6.382 σ=0.517\n", + " v=array[64, 2, 1] f32 n=128 x∈[5.675, 7.331] μ=6.442 σ=0.425\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[5.437, 7.460] μ=6.409 σ=0.536\n", + " v=array[64, 2, 1] f32 n=128 x∈[5.581, 7.367] μ=6.363 σ=0.465\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[5.338, 7.498] μ=6.301 σ=0.585\n", + " v=array[64, 2, 1] f32 n=128 x∈[5.485, 7.441] μ=6.362 σ=0.529\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[5.236, 7.532] μ=6.301 σ=0.591\n", + " v=array[64, 2, 1] f32 n=128 x∈[5.392, 7.457] μ=6.390 σ=0.548\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[5.130, 7.617] μ=6.231 σ=0.674\n", + " v=array[64, 2, 1] f32 n=128 x∈[5.290, 7.479] μ=6.281 σ=0.602\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[5.022, 7.613] μ=6.274 σ=0.697\n", + " v=array[64, 2, 1] f32 n=128 x∈[5.184, 7.561] μ=6.285 σ=0.605\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[4.912, 7.725] μ=6.229 σ=0.735\n", + " v=array[64, 2, 1] f32 n=128 x∈[5.081, 7.624] μ=6.216 σ=0.691\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[4.794, 7.797] μ=6.069 σ=0.782\n", + " v=array[64, 2, 1] f32 n=128 x∈[4.975, 7.671] μ=6.272 σ=0.717\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[4.675, 7.856] μ=6.163 σ=0.873\n", + " v=array[64, 2, 1] f32 n=128 x∈[4.863, 7.779] μ=6.218 σ=0.756\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n", "layer 0\n", "layer 1\n", "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", - " v=array[64, 2, 1] f32 n=128 x∈[4.557, 7.870] μ=5.978 σ=0.844\n", + " v=array[64, 2, 1] f32 n=128 x∈[4.745, 7.847] μ=6.051 σ=0.801\n", + " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", + "post_epoch num tensors 325\n", + "layer 0\n", + "layer 1\n", + "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", + " v=array[64, 2, 1] f32 n=128 x∈[4.626, 7.890] μ=6.156 σ=0.895\n", + " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", + "post_epoch num tensors 325\n", + "layer 0\n", + "layer 1\n", + "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", + " v=array[64, 2, 1] f32 n=128 x∈[4.506, 7.922] μ=5.963 σ=0.867\n", + " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", + "post_epoch num tensors 325\n", + "layer 0\n", + "layer 1\n", + "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", + " v=array[64, 2, 1] f32 n=128 x∈[4.393, 8.021] μ=6.011 σ=0.977\n", + " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", + "post_epoch num tensors 325\n", + "layer 0\n", + "layer 1\n", + "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", + " v=array[64, 2, 1] f32 n=128 x∈[4.286, 8.091] μ=5.989 σ=0.981\n", + " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", + "post_epoch num tensors 325\n", + "layer 0\n", + "layer 1\n", + "loss Tensor[64, 2, 1](name=\"\" op=Sum parents=[]):\n", + " v=array[64, 2, 1] f32 n=128 x∈[4.162, 8.158] μ=6.038 σ=1.007\n", " ∇=array[64, 2, 1] f32 n=128 \u001b[38;2;127;127;127mall_zeros\u001b[0m\n", "post_epoch num tensors 325\n" ] } ], "source": [ - "ler.fit(epochs=100)" + "ler.fit(epochs=10)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "python" + } + }, "outputs": [ { "name": "stdout", diff --git a/nbs/examples/tinystories_prepare.ipynb b/nbs/examples/tinystories_prepare.ipynb new file mode 100644 index 0000000..b1e763e --- /dev/null +++ b/nbs/examples/tinystories_prepare.ipynb @@ -0,0 +1,135 @@ +{ + "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "skip_exec: true\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# TinyStories dataset pre-processing.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download and extract the TinyStories dataset\n", + "\n", + "# !wget -c https://huggingface.co/datasets/roneneldan/TinyStories/raw/main/TinyStories_all_data.tar.gz -O datasets/TinyStories/TinyStories_all_data.tar.gz\n", + "# !cd datasets/TinyStories && tar -xvf TinyStories_all_data.tar.gz && cd ../.." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 51/51 [03:08<00:00, 3.69s/it]\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "\n", + "import json\n", + "import os\n", + "from pathlib import Path\n", + "import unidecode\n", + "\n", + "\n", + "TS_PATH = Path(\"datasets/TinyStories/\")\n", + "\n", + "\n", + "stories = []\n", + "\n", + "for file in tqdm(list(sorted(os.listdir(TS_PATH)))):\n", + " if file.endswith(\".json\"):\n", + " with open(TS_PATH / file, \"r\") as f:\n", + " data = json.load(f)\n", + " for d in data:\n", + " story = d[\"story\"]\n", + " if not all(ord(c) < 128 for c in story):\n", + " story = unidecode.unidecode(story)\n", + "\n", + " stories.append(story)\n", + "\n", + " # if d[\"source\"] == \"GPT-3.5\":\n", + " # gpt35_stories.append(story)\n", + " # elif d[\"source\"] == \"GPT-4\":\n", + " # gpt4_stories.append(story)\n", + "\n", + "# with open(\"gpt35_stories.txt\", \"w\") as f:\n", + "# f.write(\"\\n\".join(gpt35_stories))\n", + "\n", + "# with open(\"gpt4_stories.txt\", \"w\") as f:\n", + "# f.write(\"\\n\".join(gpt4_stories))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details." + ] + } + ], + "source": [ + "with open(TS_PATH / \"TinyStories.txt\", \"w\") as f:\n", + " f.write(\"\\n\".join(stories))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\t\\n !\"$%&\\'()*+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]`abcdefghijklmnopqrstuvwxyz|~'" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/nbs/examples/tinystories_tokenizer.ipynb b/nbs/examples/tinystories_tokenizer.ipynb new file mode 100644 index 0000000..9f7c6e9 --- /dev/null +++ b/nbs/examples/tinystories_tokenizer.ipynb @@ -0,0 +1,159 @@ +{ + "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "skip_exec: true\n", + "---" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tokenizers import Tokenizer\n", + "from tokenizers.models import WordPiece\n", + "from tokenizers.trainers import WordPieceTrainer\n", + "from tokenizers.pre_tokenizers import Whitespace\n", + "\n", + "tokenizer = Tokenizer(WordPiece(unk_token=\"[UNK]\"))\n", + "trainer = WordPieceTrainer(special_tokens=[\"[UNK]\", \"[SEP]\", \"[PAD]\"], vocab_size=1024)\n", + "\n", + "tokenizer.pre_tokenizer = Whitespace()\n", + "\n", + "from pathlib import Path\n", + "\n", + "TS_PATH = Path(\"datasets/TinyStories/\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "res = tokenizer.train([ str(TS_PATH / \"TinyStories.txt\") ], trainer)\n", + "tokenizer.save(str(TS_PATH / \"wordpiece_1024.json\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = Tokenizer.from_file(str(TS_PATH / \"wordpiece_1024.json\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[247, 988, 14, 90, 9, 346, 3, 42, 235, 430, 264, 33]\n", + "He ##llo , y ' all ! H ##ow are you ?\n" + ] + } + ], + "source": [ + "token_ids = tokenizer.encode(\"Hello, y'all! How are you?\").ids\n", + "print(token_ids)\n", + "\n", + "print(tokenizer.decode(token_ids))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"./datasets/TinyStories/TinyStories_1percent.txt\") as f:\n", + " text = f.read()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenized_text = tokenizer.encode(text).ids" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[227, 193, 442, 430, 324, 16, 250, 449, 191, 242]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenized_text[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenized_text_np = np.array(tokenized_text).astype(np.int16)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.save(\"./datasets/TinyStories/TinyStories_1percent_ids\", tokenized_text_np)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}