# v4 pretraining dataset with v4 tokenization

# Helper function to multiply number of tokens and repeats.
#
# Usage: calc <repeat> <tokens> <path>
#   repeat  multiplier applied to the token count
#   tokens  number of tokens in the dataset shard
#   path    dataset path echoed back after the product
#
# Prints "<repeat * tokens> <path>" on stdout, or nothing at all when the
# product is not positive. The multiplication is delegated to `python`
# (which must be on PATH and support f-strings).
# Returns non-zero when fewer than three arguments are given.
function calc() {
    if [ "$#" -lt 3 ]; then
        echo "usage: calc <repeat> <tokens> <path>" >&2
        return 1
    fi

    # Keep the working variables local so that a direct call (outside a
    # command substitution) does not overwrite same-named variables in the
    # shell that sourced this file.
    local repeat=$1
    local tokens=$2
    local data_path=$3

    # The heredoc is unquoted on purpose: the shell substitutes the three
    # values into the Python source before Python runs it.
    python - << EOF
total = ${repeat} * ${tokens}
if total > 0:
    print(f"{total} ${data_path}")
EOF
}
| 14 | + |
# Directory that holds every tokenized dataset prefix referenced below.
DATASET_ROOT="/groups/gcg51557/experiments/0212_v4-train-data/data/v20250816/tokenized"

# TRAIN_DATA_PATH is a flat array assembled group by group. Each calc call is
# left unquoted on purpose: its output "<repeat * tokens> <path>" is split
# into two array elements, and an empty output contributes nothing.
export TRAIN_DATA_PATH=()

# Code datasets (repeat 8)
TRAIN_DATA_PATH+=(
    $(calc 8 106882005818 ${DATASET_ROOT}/code_olmo-starcoder_0000_text_document)
    $(calc 8 117852374347 ${DATASET_ROOT}/code_stack_0000_text_document)
)

# English curated datasets (repeat 8)
TRAIN_DATA_PATH+=(
    $(calc 8 5139896520 ${DATASET_ROOT}/en_dolma-books_0000_text_document)
    $(calc 8 60036516798 ${DATASET_ROOT}/en_dolma-pes2o_0000_text_document)
    $(calc 8 82254295911 ${DATASET_ROOT}/en_dolma-reddit_0000_text_document)
    $(calc 8 3857521208 ${DATASET_ROOT}/en_dolma-wiki_0000_text_document)
    $(calc 8 1483419806 ${DATASET_ROOT}/en_dolmino-stackexchange_0000_text_document)
    $(calc 8 3141777 ${DATASET_ROOT}/en_gsm8k_0000_text_document)
    $(calc 8 8750035604 ${DATASET_ROOT}/en_mathpile_0000_text_document)
    $(calc 8 12977175126 ${DATASET_ROOT}/en_olmo-algebraicstack_0000_text_document)
    $(calc 8 21716303067 ${DATASET_ROOT}/en_olmo-arxiv_0000_text_document)
    $(calc 8 13171054142 ${DATASET_ROOT}/en_olmo-openwebmath_0000_text_document)
    $(calc 8 4746637139 ${DATASET_ROOT}/en_wiki_0000_text_document)
)

# English FineWeb, low-scored shards: scores 10-19 (repeat 1)
TRAIN_DATA_PATH+=(
    $(calc 1 102568520111 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0000_text_document)
    $(calc 1 102509087783 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0001_text_document)
    $(calc 1 100816401574 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0002_text_document)
    $(calc 1 100065810915 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0003_text_document)
    $(calc 1 99955083033 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0004_text_document)
    $(calc 1 100268985585 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0005_text_document)
    $(calc 1 98582941635 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0006_text_document)
    $(calc 1 102501814684 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0007_text_document)
    $(calc 1 87146714512 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0008_text_document)
    $(calc 1 102540352684 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0000_text_document)
    $(calc 1 101615714055 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0001_text_document)
    $(calc 1 100223579527 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0002_text_document)
    $(calc 1 99832954483 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0003_text_document)
    $(calc 1 100200285660 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0004_text_document)
    $(calc 1 98939237258 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0005_text_document)
    $(calc 1 102361066324 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0006_text_document)
    $(calc 1 100127948990 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0007_text_document)
    $(calc 1 51572633747 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0008_text_document)
    $(calc 1 102346538767 ${DATASET_ROOT}/en_fineweb-rescored_score_12_0000_text_document)
    $(calc 1 100658650543 ${DATASET_ROOT}/en_fineweb-rescored_score_12_0001_text_document)
    $(calc 1 99879930853 ${DATASET_ROOT}/en_fineweb-rescored_score_12_0002_text_document)
    $(calc 1 100207021051 ${DATASET_ROOT}/en_fineweb-rescored_score_12_0003_text_document)
    $(calc 1 99609557924 ${DATASET_ROOT}/en_fineweb-rescored_score_12_0004_text_document)
    $(calc 1 101835220299 ${DATASET_ROOT}/en_fineweb-rescored_score_12_0005_text_document)
    $(calc 1 99887438413 ${DATASET_ROOT}/en_fineweb-rescored_score_12_0006_text_document)
    $(calc 1 91670119172 ${DATASET_ROOT}/en_fineweb-rescored_score_12_0007_text_document)
    $(calc 1 101899332058 ${DATASET_ROOT}/en_fineweb-rescored_score_13_0000_text_document)
    $(calc 1 100107151548 ${DATASET_ROOT}/en_fineweb-rescored_score_13_0001_text_document)
    $(calc 1 100188263808 ${DATASET_ROOT}/en_fineweb-rescored_score_13_0002_text_document)
    $(calc 1 100208220130 ${DATASET_ROOT}/en_fineweb-rescored_score_13_0003_text_document)
    $(calc 1 101460435434 ${DATASET_ROOT}/en_fineweb-rescored_score_13_0004_text_document)
    $(calc 1 99804308223 ${DATASET_ROOT}/en_fineweb-rescored_score_13_0005_text_document)
    $(calc 1 99868561720 ${DATASET_ROOT}/en_fineweb-rescored_score_13_0006_text_document)
    $(calc 1 28118083173 ${DATASET_ROOT}/en_fineweb-rescored_score_13_0007_text_document)
    $(calc 1 101287815642 ${DATASET_ROOT}/en_fineweb-rescored_score_14_0000_text_document)
    $(calc 1 100193448611 ${DATASET_ROOT}/en_fineweb-rescored_score_14_0001_text_document)
    $(calc 1 100909877098 ${DATASET_ROOT}/en_fineweb-rescored_score_14_0002_text_document)
    $(calc 1 100757143238 ${DATASET_ROOT}/en_fineweb-rescored_score_14_0003_text_document)
    $(calc 1 99723340081 ${DATASET_ROOT}/en_fineweb-rescored_score_14_0004_text_document)
    $(calc 1 99265358219 ${DATASET_ROOT}/en_fineweb-rescored_score_14_0005_text_document)
    $(calc 1 15921206946 ${DATASET_ROOT}/en_fineweb-rescored_score_14_0006_text_document)
    $(calc 1 101005877270 ${DATASET_ROOT}/en_fineweb-rescored_score_15_0000_text_document)
    $(calc 1 100489515421 ${DATASET_ROOT}/en_fineweb-rescored_score_15_0001_text_document)
    $(calc 1 101723685894 ${DATASET_ROOT}/en_fineweb-rescored_score_15_0002_text_document)
    $(calc 1 100045434530 ${DATASET_ROOT}/en_fineweb-rescored_score_15_0003_text_document)
    $(calc 1 99720256993 ${DATASET_ROOT}/en_fineweb-rescored_score_15_0004_text_document)
    $(calc 1 98543462382 ${DATASET_ROOT}/en_fineweb-rescored_score_15_0005_text_document)
    $(calc 1 7338842460 ${DATASET_ROOT}/en_fineweb-rescored_score_15_0006_text_document)
    $(calc 1 100825885054 ${DATASET_ROOT}/en_fineweb-rescored_score_16_0000_text_document)
    $(calc 1 101739112228 ${DATASET_ROOT}/en_fineweb-rescored_score_16_0001_text_document)
    $(calc 1 100416142859 ${DATASET_ROOT}/en_fineweb-rescored_score_16_0002_text_document)
    $(calc 1 99754212843 ${DATASET_ROOT}/en_fineweb-rescored_score_16_0003_text_document)
    $(calc 1 100042039542 ${DATASET_ROOT}/en_fineweb-rescored_score_16_0004_text_document)
    $(calc 1 45275202852 ${DATASET_ROOT}/en_fineweb-rescored_score_16_0005_text_document)
    $(calc 1 101199070911 ${DATASET_ROOT}/en_fineweb-rescored_score_17_0000_text_document)
    $(calc 1 101438063034 ${DATASET_ROOT}/en_fineweb-rescored_score_17_0001_text_document)
    $(calc 1 99938292420 ${DATASET_ROOT}/en_fineweb-rescored_score_17_0002_text_document)
    $(calc 1 99892259073 ${DATASET_ROOT}/en_fineweb-rescored_score_17_0003_text_document)
    $(calc 1 88320720228 ${DATASET_ROOT}/en_fineweb-rescored_score_17_0004_text_document)
    $(calc 1 101682793329 ${DATASET_ROOT}/en_fineweb-rescored_score_18_0000_text_document)
    $(calc 1 100779083985 ${DATASET_ROOT}/en_fineweb-rescored_score_18_0001_text_document)
    $(calc 1 99807036301 ${DATASET_ROOT}/en_fineweb-rescored_score_18_0002_text_document)
    $(calc 1 100032616702 ${DATASET_ROOT}/en_fineweb-rescored_score_18_0003_text_document)
    $(calc 1 34415854865 ${DATASET_ROOT}/en_fineweb-rescored_score_18_0004_text_document)
    $(calc 1 101794562305 ${DATASET_ROOT}/en_fineweb-rescored_score_19_0000_text_document)
    $(calc 1 100159790313 ${DATASET_ROOT}/en_fineweb-rescored_score_19_0001_text_document)
    $(calc 1 100003883373 ${DATASET_ROOT}/en_fineweb-rescored_score_19_0002_text_document)
    $(calc 1 56293984847 ${DATASET_ROOT}/en_fineweb-rescored_score_19_0003_text_document)
)

# English FineWeb, high-scored shards: scores 20-30 (repeat 4)
TRAIN_DATA_PATH+=(
    $(calc 4 101909608393 ${DATASET_ROOT}/en_fineweb-rescored_score_20_0000_text_document)
    $(calc 4 100251912003 ${DATASET_ROOT}/en_fineweb-rescored_score_20_0001_text_document)
    $(calc 4 100024677537 ${DATASET_ROOT}/en_fineweb-rescored_score_20_0002_text_document)
    $(calc 4 77617886249 ${DATASET_ROOT}/en_fineweb-rescored_score_20_0003_text_document)
    $(calc 4 101494962972 ${DATASET_ROOT}/en_fineweb-rescored_score_21_0000_text_document)
    $(calc 4 99989476480 ${DATASET_ROOT}/en_fineweb-rescored_score_21_0001_text_document)
    $(calc 4 72610042095 ${DATASET_ROOT}/en_fineweb-rescored_score_21_0002_text_document)
    $(calc 4 101424615382 ${DATASET_ROOT}/en_fineweb-rescored_score_22_0000_text_document)
    $(calc 4 99977160848 ${DATASET_ROOT}/en_fineweb-rescored_score_22_0001_text_document)
    $(calc 4 76067058446 ${DATASET_ROOT}/en_fineweb-rescored_score_22_0002_text_document)
    $(calc 4 100987625784 ${DATASET_ROOT}/en_fineweb-rescored_score_23_0000_text_document)
    $(calc 4 99272893036 ${DATASET_ROOT}/en_fineweb-rescored_score_23_0001_text_document)
    $(calc 4 4407565031 ${DATASET_ROOT}/en_fineweb-rescored_score_23_0002_text_document)
    $(calc 4 100765663494 ${DATASET_ROOT}/en_fineweb-rescored_score_24_0000_text_document)
    $(calc 4 75847299078 ${DATASET_ROOT}/en_fineweb-rescored_score_24_0001_text_document)
    $(calc 4 100728857165 ${DATASET_ROOT}/en_fineweb-rescored_score_25_0000_text_document)
    $(calc 4 73395547895 ${DATASET_ROOT}/en_fineweb-rescored_score_25_0001_text_document)
    $(calc 4 100460689403 ${DATASET_ROOT}/en_fineweb-rescored_score_26_0000_text_document)
    $(calc 4 23787080600 ${DATASET_ROOT}/en_fineweb-rescored_score_26_0001_text_document)
    $(calc 4 100402654509 ${DATASET_ROOT}/en_fineweb-rescored_score_27_0000_text_document)
    $(calc 4 18288732636 ${DATASET_ROOT}/en_fineweb-rescored_score_27_0001_text_document)
    $(calc 4 81686513887 ${DATASET_ROOT}/en_fineweb-rescored_score_28_0000_text_document)
    $(calc 4 65145918651 ${DATASET_ROOT}/en_fineweb-rescored_score_29_0000_text_document)
    $(calc 4 100298736946 ${DATASET_ROOT}/en_fineweb-rescored_score_30_0000_text_document)
    $(calc 4 67634605260 ${DATASET_ROOT}/en_fineweb-rescored_score_30_0001_text_document)
)

# Japanese curated datasets (repeat 8)
TRAIN_DATA_PATH+=(
    $(calc 8 124537838 ${DATASET_ROOT}/ja_aozorabunko_0000_text_document)
    $(calc 8 12476129929 ${DATASET_ROOT}/ja_ceek-news_0000_text_document)
    $(calc 8 67690089 ${DATASET_ROOT}/ja_e-gov_0000_text_document)
    $(calc 8 772429478 ${DATASET_ROOT}/ja_kaken_0000_text_document)
    $(calc 8 673493046 ${DATASET_ROOT}/ja_kokkai-giji_0000_text_document)
    $(calc 8 16255530591 ${DATASET_ROOT}/ja_nwc2010_0000_text_document)
    $(calc 8 25862823840 ${DATASET_ROOT}/ja_nwjc_0000_text_document)
    $(calc 8 60813844215 ${DATASET_ROOT}/ja_patent_0000_text_document)
    $(calc 8 11370270531 ${DATASET_ROOT}/ja_sip-comprehensive-html_0000_text_document)
    $(calc 8 28352330642 ${DATASET_ROOT}/ja_sip-comprehensive-pdf-pdf2text_0000_text_document)
    $(calc 8 741256291 ${DATASET_ROOT}/ja_warp-html_0000_text_document)
    $(calc 8 9563719005 ${DATASET_ROOT}/ja_warp-pdf-e0_0000_text_document)
    $(calc 8 42891810821 ${DATASET_ROOT}/ja_warp-pdf-e0.2_0000_text_document)
    $(calc 8 1085125338 ${DATASET_ROOT}/ja_wiki_0000_text_document)
)

# Japanese CC/FineWeb (repeat 4)
TRAIN_DATA_PATH+=(
    $(calc 4 49729722349 ${DATASET_ROOT}/ja_cc_0000_text_document)
    $(calc 4 49369321010 ${DATASET_ROOT}/ja_cc_0001_text_document)
    $(calc 4 49657420425 ${DATASET_ROOT}/ja_cc_0002_text_document)
    $(calc 4 50328833323 ${DATASET_ROOT}/ja_cc_0003_text_document)
    $(calc 4 18329054681 ${DATASET_ROOT}/ja_cc_0004_text_document)
    $(calc 4 42179433505 ${DATASET_ROOT}/ja_fineweb-2_0000_text_document)
    $(calc 4 42736865509 ${DATASET_ROOT}/ja_fineweb-2_0001_text_document)
    $(calc 4 42466190036 ${DATASET_ROOT}/ja_fineweb-2_0002_text_document)
    $(calc 4 42415830701 ${DATASET_ROOT}/ja_fineweb-2_0003_text_document)
    $(calc 4 42040441473 ${DATASET_ROOT}/ja_fineweb-2_0004_text_document)
    $(calc 4 4255815583 ${DATASET_ROOT}/ja_fineweb-2_0005_text_document)
)

# Korean curated datasets (repeat 8)
TRAIN_DATA_PATH+=(
    $(calc 8 352074304 ${DATASET_ROOT}/ko_wiki_0000_text_document)
)

# Korean FineWeb (repeat 1)
TRAIN_DATA_PATH+=(
    $(calc 1 48038910925 ${DATASET_ROOT}/ko_fineweb2_0000_text_document)
)

# Chinese curated datasets (repeat 8)
TRAIN_DATA_PATH+=(
    $(calc 8 740754914 ${DATASET_ROOT}/zh_wiki_0000_text_document)
)

# Chinese FineWeb (repeat 1)
TRAIN_DATA_PATH+=(
    $(calc 1 136502282670 ${DATASET_ROOT}/zh_fineweb2_0000_text_document)
    $(calc 1 135056311908 ${DATASET_ROOT}/zh_fineweb2_0001_text_document)
    $(calc 1 138369517441 ${DATASET_ROOT}/zh_fineweb2_0002_text_document)
    $(calc 1 145115884006 ${DATASET_ROOT}/zh_fineweb2_0003_text_document)
    $(calc 1 11414468604 ${DATASET_ROOT}/zh_fineweb2_0004_text_document)
)
0 commit comments