
Commit 713bf38

V4 32b a3.8b phase1 (#107)
1 parent 8ae3ee4 commit 713bf38

2 files changed: +317 -0 lines changed


v4-32b-a3.8b-phase1/base/params.sh

Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
# LLM-jp v4 model 32B-A3.8B

ALL_PARAMS=()

# Model hyperparameters
ALL_PARAMS+=(
    --num-layers 32
    --hidden-size 2560
    --moe-ffn-hidden-size 960
    --ffn-hidden-size 960
    --num-attention-heads 40
    --group-query-attention
    --num-query-groups 4
    --kv-channels 128

    # NOTE(odashi): We set 4096 (not 8192) for the context length to afford more training steps.
    --seq-length 4096
    --max-position-embeddings 4096

    --position-embedding-type rope
    --rotary-base 500000
    --untie-embeddings-and-output-weights
    --swiglu
    --normalization RMSNorm
    --norm-epsilon 1e-6
    --disable-bias-linear
    --qk-layernorm
)

# Tokenizer
ALL_PARAMS+=(
    --tokenizer-type Llama2Tokenizer
    --tokenizer-model ${ENV_DIR}/src/llm-jp-tokenizer-v4/v4_alpha_1.0.model
)

# Optimizer hyperparameters
ALL_PARAMS+=(
    --optimizer adam
    --lr 4e-4
    --min-lr 4e-5
    --adam-beta1 0.9
    --adam-beta2 0.95
    --adam-eps 1e-8
    --clip-grad 1.0
    --weight-decay 0.1
    --init-method-std 0.006
    --attention-dropout 0.0
    --hidden-dropout 0.0
)

# MoE
ALL_PARAMS+=(
    --manual-gc
    --manual-gc-interval 10
    --moe-aux-loss-coeff 1e-2
    --moe-grouped-gemm
    --moe-permute-fusion
    --moe-router-dtype fp32
    --moe-router-load-balancing-type aux_loss
    --moe-router-topk 8
    --moe-token-dispatcher-type alltoall
    --moe-z-loss-coeff 1e-3
    --num-experts 128
)

# Scheduler
# At least 10_526_120 steps are required to train on all tokens
# ( == ceil[22_074_871_647_659 / 512 / 4096] ).
ALL_PARAMS+=(
    --train-iters 12000000
    --lr-warmup-iters 2000
    --lr-decay-iters 12000000
    --lr-decay-style WSD

    # NOTE(odashi): We run stable training: don't apply decay until the last step.
    --lr-wsd-decay-style linear
    --lr-wsd-decay-iters 1

    --eval-interval 999999999
    --eval-iters 0
)

# Batch sizes
ALL_PARAMS+=(
    --micro-batch-size 2
    --global-batch-size 1024
)

# Parallelism
ALL_PARAMS+=(
    --context-parallel-size 1
    --expert-model-parallel-size 4
    --pipeline-model-parallel-size 1
    --sequence-parallel
    --tensor-model-parallel-size 1
    --use-distributed-optimizer
    --distributed-backend nccl
    # NOTE(odashi): Increasing the timeout is required to prepare the 15.6T dataset.
    --distributed-timeout-minutes 120
    --use-mpi
)

# Dataset
ALL_PARAMS+=(
    --data-path ${TRAIN_DATA_PATH[@]}
    --data-cache-path ${TASK_DIR}/cache
    --split 1,0,0
)

# Other implementation-related parameters
ALL_PARAMS+=(
    --bf16
    --use-mcore-models
    --no-masked-softmax-fusion
    --use-flash-attn

    --overlap-grad-reduce
    --overlap-param-gather

    --attention-softmax-in-fp32
    --transformer-impl transformer_engine

    --attention-backend flash
    --accumulate-allreduce-grads-in-fp32
    --async-save
    --ckpt-format torch_dist
    --cross-entropy-fusion-impl native
    --cross-entropy-loss-fusion
    --log-interval 1
    --log-throughput
    --moe-per-layer-logging
)

# NOTE(odashi):
# https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance-guide.html#communication-overlaps-and-tuning
# (Taishi) Unknown whether this helps; it has not been tried in the MoE experiments.
# export NVTE_FWD_LAYERNORM_SM_MARGIN=16
# export NVTE_BWD_LAYERNORM_SM_MARGIN=16

# (Taishi) Memory
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
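
As a rough cross-check of the configuration above, the short Python sketch below (not part of the committed files) estimates total and activated parameter counts from the hyperparameters and re-derives the step count quoted in the scheduler comment. The vocabulary size is a placeholder assumption, since the llm-jp-tokenizer v4 vocabulary is not specified in this diff, and norms/biases are ignored; with a 100,000-token vocabulary it lands around 31.6B total and 3.3B activated parameters, i.e. in the neighborhood of the 32B-A3.8B naming, with the exact figures depending on the real vocabulary and on what the naming convention counts.

# Ballpark parameter count for the 32B-A3.8B config above (sketch only;
# ignores norms/biases; vocabulary size is an assumption).
import math

hidden, layers = 2560, 32
heads, query_groups, kv_channels = 40, 4, 128
moe_ffn, experts, topk = 960, 128, 8
vocab = 100_000  # ASSUMPTION: placeholder; the v4 tokenizer vocab is not given in this commit

# GQA attention: full-width Q and output projections, grouped K/V projections.
attn = hidden * heads * kv_channels               # Q projection
attn += 2 * hidden * query_groups * kv_channels   # K and V projections
attn += heads * kv_channels * hidden              # output projection

expert = 3 * hidden * moe_ffn                     # SwiGLU expert: gate, up, down
router = hidden * experts                         # linear router per layer

total = layers * (attn + router + experts * expert) + 2 * vocab * hidden
active = layers * (attn + router + topk * expert) + 2 * vocab * hidden
print(f"total ~{total / 1e9:.1f}B, active ~{active / 1e9:.1f}B")

# Step count from the scheduler comment: ceil(22_074_871_647_659 / 512 / 4096).
print(math.ceil(22_074_871_647_659 / (512 * 4096)))  # 10526120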
Lines changed: 176 additions & 0 deletions
@@ -0,0 +1,176 @@
# v4 pretraining dataset with v4 tokenization

# Helper function: multiplies the token count by the repeat factor and emits a
# "weight path" pair (prints nothing when the product is zero).
function calc() {
    local repeat=$1; shift
    local tokens=$1; shift
    local path=$1; shift
    python - << EOF
total = ${repeat} * ${tokens}
if total > 0:
    print(f"{total} ${path}")
EOF
}

DATASET_ROOT="/groups/gcg51557/experiments/0212_v4-train-data/data/v20250816/tokenized"

export TRAIN_DATA_PATH=(
    # Code datasets
    $(calc 8 106882005818 ${DATASET_ROOT}/code_olmo-starcoder_0000_text_document)
    $(calc 8 117852374347 ${DATASET_ROOT}/code_stack_0000_text_document)

    # English curated datasets
    $(calc 8 5139896520 ${DATASET_ROOT}/en_dolma-books_0000_text_document)
    $(calc 8 60036516798 ${DATASET_ROOT}/en_dolma-pes2o_0000_text_document)
    $(calc 8 82254295911 ${DATASET_ROOT}/en_dolma-reddit_0000_text_document)
    $(calc 8 3857521208 ${DATASET_ROOT}/en_dolma-wiki_0000_text_document)
    $(calc 8 1483419806 ${DATASET_ROOT}/en_dolmino-stackexchange_0000_text_document)
    $(calc 8 3141777 ${DATASET_ROOT}/en_gsm8k_0000_text_document)
    $(calc 8 8750035604 ${DATASET_ROOT}/en_mathpile_0000_text_document)
    $(calc 8 12977175126 ${DATASET_ROOT}/en_olmo-algebraicstack_0000_text_document)
    $(calc 8 21716303067 ${DATASET_ROOT}/en_olmo-arxiv_0000_text_document)
    $(calc 8 13171054142 ${DATASET_ROOT}/en_olmo-openwebmath_0000_text_document)
    $(calc 8 4746637139 ${DATASET_ROOT}/en_wiki_0000_text_document)

    # English FineWeb low-scored
    $(calc 1 102568520111 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0000_text_document)
    $(calc 1 102509087783 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0001_text_document)
    $(calc 1 100816401574 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0002_text_document)
    $(calc 1 100065810915 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0003_text_document)
    $(calc 1 99955083033 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0004_text_document)
    $(calc 1 100268985585 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0005_text_document)
    $(calc 1 98582941635 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0006_text_document)
    $(calc 1 102501814684 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0007_text_document)
    $(calc 1 87146714512 ${DATASET_ROOT}/en_fineweb-rescored_score_10_0008_text_document)
    $(calc 1 102540352684 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0000_text_document)
    $(calc 1 101615714055 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0001_text_document)
    $(calc 1 100223579527 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0002_text_document)
    $(calc 1 99832954483 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0003_text_document)
    $(calc 1 100200285660 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0004_text_document)
    $(calc 1 98939237258 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0005_text_document)
    $(calc 1 102361066324 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0006_text_document)
    $(calc 1 100127948990 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0007_text_document)
    $(calc 1 51572633747 ${DATASET_ROOT}/en_fineweb-rescored_score_11_0008_text_document)
    $(calc 1 102346538767 ${DATASET_ROOT}/en_fineweb-rescored_score_12_0000_text_document)
    $(calc 1 100658650543 ${DATASET_ROOT}/en_fineweb-rescored_score_12_0001_text_document)
    $(calc 1 99879930853 ${DATASET_ROOT}/en_fineweb-rescored_score_12_0002_text_document)
    $(calc 1 100207021051 ${DATASET_ROOT}/en_fineweb-rescored_score_12_0003_text_document)
    $(calc 1 99609557924 ${DATASET_ROOT}/en_fineweb-rescored_score_12_0004_text_document)
    $(calc 1 101835220299 ${DATASET_ROOT}/en_fineweb-rescored_score_12_0005_text_document)
    $(calc 1 99887438413 ${DATASET_ROOT}/en_fineweb-rescored_score_12_0006_text_document)
    $(calc 1 91670119172 ${DATASET_ROOT}/en_fineweb-rescored_score_12_0007_text_document)
    $(calc 1 101899332058 ${DATASET_ROOT}/en_fineweb-rescored_score_13_0000_text_document)
    $(calc 1 100107151548 ${DATASET_ROOT}/en_fineweb-rescored_score_13_0001_text_document)
    $(calc 1 100188263808 ${DATASET_ROOT}/en_fineweb-rescored_score_13_0002_text_document)
    $(calc 1 100208220130 ${DATASET_ROOT}/en_fineweb-rescored_score_13_0003_text_document)
    $(calc 1 101460435434 ${DATASET_ROOT}/en_fineweb-rescored_score_13_0004_text_document)
    $(calc 1 99804308223 ${DATASET_ROOT}/en_fineweb-rescored_score_13_0005_text_document)
    $(calc 1 99868561720 ${DATASET_ROOT}/en_fineweb-rescored_score_13_0006_text_document)
    $(calc 1 28118083173 ${DATASET_ROOT}/en_fineweb-rescored_score_13_0007_text_document)
    $(calc 1 101287815642 ${DATASET_ROOT}/en_fineweb-rescored_score_14_0000_text_document)
    $(calc 1 100193448611 ${DATASET_ROOT}/en_fineweb-rescored_score_14_0001_text_document)
    $(calc 1 100909877098 ${DATASET_ROOT}/en_fineweb-rescored_score_14_0002_text_document)
    $(calc 1 100757143238 ${DATASET_ROOT}/en_fineweb-rescored_score_14_0003_text_document)
    $(calc 1 99723340081 ${DATASET_ROOT}/en_fineweb-rescored_score_14_0004_text_document)
    $(calc 1 99265358219 ${DATASET_ROOT}/en_fineweb-rescored_score_14_0005_text_document)
    $(calc 1 15921206946 ${DATASET_ROOT}/en_fineweb-rescored_score_14_0006_text_document)
    $(calc 1 101005877270 ${DATASET_ROOT}/en_fineweb-rescored_score_15_0000_text_document)
    $(calc 1 100489515421 ${DATASET_ROOT}/en_fineweb-rescored_score_15_0001_text_document)
    $(calc 1 101723685894 ${DATASET_ROOT}/en_fineweb-rescored_score_15_0002_text_document)
    $(calc 1 100045434530 ${DATASET_ROOT}/en_fineweb-rescored_score_15_0003_text_document)
    $(calc 1 99720256993 ${DATASET_ROOT}/en_fineweb-rescored_score_15_0004_text_document)
    $(calc 1 98543462382 ${DATASET_ROOT}/en_fineweb-rescored_score_15_0005_text_document)
    $(calc 1 7338842460 ${DATASET_ROOT}/en_fineweb-rescored_score_15_0006_text_document)
    $(calc 1 100825885054 ${DATASET_ROOT}/en_fineweb-rescored_score_16_0000_text_document)
    $(calc 1 101739112228 ${DATASET_ROOT}/en_fineweb-rescored_score_16_0001_text_document)
    $(calc 1 100416142859 ${DATASET_ROOT}/en_fineweb-rescored_score_16_0002_text_document)
    $(calc 1 99754212843 ${DATASET_ROOT}/en_fineweb-rescored_score_16_0003_text_document)
    $(calc 1 100042039542 ${DATASET_ROOT}/en_fineweb-rescored_score_16_0004_text_document)
    $(calc 1 45275202852 ${DATASET_ROOT}/en_fineweb-rescored_score_16_0005_text_document)
    $(calc 1 101199070911 ${DATASET_ROOT}/en_fineweb-rescored_score_17_0000_text_document)
    $(calc 1 101438063034 ${DATASET_ROOT}/en_fineweb-rescored_score_17_0001_text_document)
    $(calc 1 99938292420 ${DATASET_ROOT}/en_fineweb-rescored_score_17_0002_text_document)
    $(calc 1 99892259073 ${DATASET_ROOT}/en_fineweb-rescored_score_17_0003_text_document)
    $(calc 1 88320720228 ${DATASET_ROOT}/en_fineweb-rescored_score_17_0004_text_document)
    $(calc 1 101682793329 ${DATASET_ROOT}/en_fineweb-rescored_score_18_0000_text_document)
    $(calc 1 100779083985 ${DATASET_ROOT}/en_fineweb-rescored_score_18_0001_text_document)
    $(calc 1 99807036301 ${DATASET_ROOT}/en_fineweb-rescored_score_18_0002_text_document)
    $(calc 1 100032616702 ${DATASET_ROOT}/en_fineweb-rescored_score_18_0003_text_document)
    $(calc 1 34415854865 ${DATASET_ROOT}/en_fineweb-rescored_score_18_0004_text_document)
    $(calc 1 101794562305 ${DATASET_ROOT}/en_fineweb-rescored_score_19_0000_text_document)
    $(calc 1 100159790313 ${DATASET_ROOT}/en_fineweb-rescored_score_19_0001_text_document)
    $(calc 1 100003883373 ${DATASET_ROOT}/en_fineweb-rescored_score_19_0002_text_document)
    $(calc 1 56293984847 ${DATASET_ROOT}/en_fineweb-rescored_score_19_0003_text_document)

    # English FineWeb high-scored
    $(calc 4 101909608393 ${DATASET_ROOT}/en_fineweb-rescored_score_20_0000_text_document)
    $(calc 4 100251912003 ${DATASET_ROOT}/en_fineweb-rescored_score_20_0001_text_document)
    $(calc 4 100024677537 ${DATASET_ROOT}/en_fineweb-rescored_score_20_0002_text_document)
    $(calc 4 77617886249 ${DATASET_ROOT}/en_fineweb-rescored_score_20_0003_text_document)
    $(calc 4 101494962972 ${DATASET_ROOT}/en_fineweb-rescored_score_21_0000_text_document)
    $(calc 4 99989476480 ${DATASET_ROOT}/en_fineweb-rescored_score_21_0001_text_document)
    $(calc 4 72610042095 ${DATASET_ROOT}/en_fineweb-rescored_score_21_0002_text_document)
    $(calc 4 101424615382 ${DATASET_ROOT}/en_fineweb-rescored_score_22_0000_text_document)
    $(calc 4 99977160848 ${DATASET_ROOT}/en_fineweb-rescored_score_22_0001_text_document)
    $(calc 4 76067058446 ${DATASET_ROOT}/en_fineweb-rescored_score_22_0002_text_document)
    $(calc 4 100987625784 ${DATASET_ROOT}/en_fineweb-rescored_score_23_0000_text_document)
    $(calc 4 99272893036 ${DATASET_ROOT}/en_fineweb-rescored_score_23_0001_text_document)
    $(calc 4 4407565031 ${DATASET_ROOT}/en_fineweb-rescored_score_23_0002_text_document)
    $(calc 4 100765663494 ${DATASET_ROOT}/en_fineweb-rescored_score_24_0000_text_document)
    $(calc 4 75847299078 ${DATASET_ROOT}/en_fineweb-rescored_score_24_0001_text_document)
    $(calc 4 100728857165 ${DATASET_ROOT}/en_fineweb-rescored_score_25_0000_text_document)
    $(calc 4 73395547895 ${DATASET_ROOT}/en_fineweb-rescored_score_25_0001_text_document)
    $(calc 4 100460689403 ${DATASET_ROOT}/en_fineweb-rescored_score_26_0000_text_document)
    $(calc 4 23787080600 ${DATASET_ROOT}/en_fineweb-rescored_score_26_0001_text_document)
    $(calc 4 100402654509 ${DATASET_ROOT}/en_fineweb-rescored_score_27_0000_text_document)
    $(calc 4 18288732636 ${DATASET_ROOT}/en_fineweb-rescored_score_27_0001_text_document)
    $(calc 4 81686513887 ${DATASET_ROOT}/en_fineweb-rescored_score_28_0000_text_document)
    $(calc 4 65145918651 ${DATASET_ROOT}/en_fineweb-rescored_score_29_0000_text_document)
    $(calc 4 100298736946 ${DATASET_ROOT}/en_fineweb-rescored_score_30_0000_text_document)
    $(calc 4 67634605260 ${DATASET_ROOT}/en_fineweb-rescored_score_30_0001_text_document)

    # Japanese curated datasets
    $(calc 8 124537838 ${DATASET_ROOT}/ja_aozorabunko_0000_text_document)
    $(calc 8 12476129929 ${DATASET_ROOT}/ja_ceek-news_0000_text_document)
    $(calc 8 67690089 ${DATASET_ROOT}/ja_e-gov_0000_text_document)
    $(calc 8 772429478 ${DATASET_ROOT}/ja_kaken_0000_text_document)
    $(calc 8 673493046 ${DATASET_ROOT}/ja_kokkai-giji_0000_text_document)
    $(calc 8 16255530591 ${DATASET_ROOT}/ja_nwc2010_0000_text_document)
    $(calc 8 25862823840 ${DATASET_ROOT}/ja_nwjc_0000_text_document)
    $(calc 8 60813844215 ${DATASET_ROOT}/ja_patent_0000_text_document)
    $(calc 8 11370270531 ${DATASET_ROOT}/ja_sip-comprehensive-html_0000_text_document)
    $(calc 8 28352330642 ${DATASET_ROOT}/ja_sip-comprehensive-pdf-pdf2text_0000_text_document)
    $(calc 8 741256291 ${DATASET_ROOT}/ja_warp-html_0000_text_document)
    $(calc 8 9563719005 ${DATASET_ROOT}/ja_warp-pdf-e0_0000_text_document)
    $(calc 8 42891810821 ${DATASET_ROOT}/ja_warp-pdf-e0.2_0000_text_document)
    $(calc 8 1085125338 ${DATASET_ROOT}/ja_wiki_0000_text_document)

    # Japanese CC/FineWeb
    $(calc 4 49729722349 ${DATASET_ROOT}/ja_cc_0000_text_document)
    $(calc 4 49369321010 ${DATASET_ROOT}/ja_cc_0001_text_document)
    $(calc 4 49657420425 ${DATASET_ROOT}/ja_cc_0002_text_document)
    $(calc 4 50328833323 ${DATASET_ROOT}/ja_cc_0003_text_document)
    $(calc 4 18329054681 ${DATASET_ROOT}/ja_cc_0004_text_document)
    $(calc 4 42179433505 ${DATASET_ROOT}/ja_fineweb-2_0000_text_document)
    $(calc 4 42736865509 ${DATASET_ROOT}/ja_fineweb-2_0001_text_document)
    $(calc 4 42466190036 ${DATASET_ROOT}/ja_fineweb-2_0002_text_document)
    $(calc 4 42415830701 ${DATASET_ROOT}/ja_fineweb-2_0003_text_document)
    $(calc 4 42040441473 ${DATASET_ROOT}/ja_fineweb-2_0004_text_document)
    $(calc 4 4255815583 ${DATASET_ROOT}/ja_fineweb-2_0005_text_document)

    # Korean curated datasets
    $(calc 8 352074304 ${DATASET_ROOT}/ko_wiki_0000_text_document)

    # Korean FineWeb
    $(calc 1 48038910925 ${DATASET_ROOT}/ko_fineweb2_0000_text_document)

    # Chinese curated datasets
    $(calc 8 740754914 ${DATASET_ROOT}/zh_wiki_0000_text_document)

    # Chinese FineWeb
    $(calc 1 136502282670 ${DATASET_ROOT}/zh_fineweb2_0000_text_document)
    $(calc 1 135056311908 ${DATASET_ROOT}/zh_fineweb2_0001_text_document)
    $(calc 1 138369517441 ${DATASET_ROOT}/zh_fineweb2_0002_text_document)
    $(calc 1 145115884006 ${DATASET_ROOT}/zh_fineweb2_0003_text_document)
    $(calc 1 11414468604 ${DATASET_ROOT}/zh_fineweb2_0004_text_document)
)
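
Megatron-LM consumes TRAIN_DATA_PATH via --data-path as alternating weight/path tokens, so each calc line contributes repeat × tokens to the sampling weight of its dataset prefix. The Python sketch below (not part of the committed files) mirrors the shell helper above to show how one entry expands, and adds a hypothetical total_tokens convenience for summing the weights, which presumably yields the 22_074_871_647_659-token budget referenced in the params.sh scheduler comment.

# Mirrors the calc() helper above: weight = repeat * tokens, emitted as "weight path".
def calc(repeat: int, tokens: int, path: str) -> str:
    total = repeat * tokens
    return f"{total} {path}" if total > 0 else ""

# The first code entry in the list above expands to its weighted token count:
print(calc(8, 106882005818, "code_olmo-starcoder_0000_text_document"))
# 855056046544 code_olmo-starcoder_0000_text_document

# Hypothetical check: weights sit at the even positions of the flattened
# TRAIN_DATA_PATH array, so summing them gives the total weighted token count.
def total_tokens(train_data_path):
    return sum(int(weight) for weight in train_data_path[0::2])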

0 commit comments
