From 3adbe529a62bdb279d7b32da8800a6e28c4e2de4 Mon Sep 17 00:00:00 2001 From: Taka008 Date: Fri, 13 Sep 2024 21:58:25 +0900 Subject: [PATCH 01/11] add training scripts for cpt-lr-scheduling exp2 --- .../exp2/sbatch.sh | 41 +++ .../exp2/train.sh | 291 ++++++++++++++++++ 2 files changed, 332 insertions(+) create mode 100644 pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/sbatch.sh create mode 100644 pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh diff --git a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/sbatch.sh b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/sbatch.sh new file mode 100644 index 0000000..5c8f801 --- /dev/null +++ b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/sbatch.sh @@ -0,0 +1,41 @@ +#!/bin/bash +#SBATCH --job-name=0038_train +#SBATCH --partition=gpu-small +#SBATCH --nodes=4 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --output=outputs/%x-%j.out +#SBATCH --error=outputs/%x-%j.err + +set -eu -o pipefail + +EXPERIMENT_DIR=/home/shared/experiments/0038_cpt-lr-scheduling +ENV_DIR=${EXPERIMENT_DIR}/environment + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) +export MASTER_PORT=$((10000 + ($SLURM_JOBID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1) +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPUS_PER_NODE})) + +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + +mpirun \ + -np $NUM_GPUS \ + --npernode $NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + -x EXPERIMENT_DIR=$EXPERIMENT_DIR \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -x NUM_NODES=$NUM_NODES \ + -x NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE \ + bash scripts/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh diff 
--git a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh new file mode 100644 index 0000000..12c460f --- /dev/null +++ b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh @@ -0,0 +1,291 @@ +#!/bin/bash + +# For details about the model, see: +# https://github.com/llm-jp/model-cards/pull/23 + +set -eu -o pipefail + +# EXPERIMENT_DIR= # set by sbatch +ENV_DIR=${EXPERIMENT_DIR}/environment +CACHE_DIR=${EXPERIMENT_DIR}/cache + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/scripts/mpi_variables.sh +source ${ENV_DIR}/venv/bin/activate + +# open file limit +ulimit -n 65536 1048576 + +export LOGLEVEL=INFO +export NCCL_DEBUG=WARN +export NCCL_DEBUG_SUBSYS=WARN +export PYTHONFAULTHANDLER=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export CUDA_LAUNCH_BLOCKING=0 +export CUDNN_LOGDEST_DBG=stderr +export CUDNN_LOGERR_DBG=1 + +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPUS_PER_NODE})) + +# model config +HIDDEN_SIZE=2048 +FFN_HIDDEN_SIZE=7168 +NUM_LAYERS=24 +NUM_HEADS=16 +SEQ_LENGTH=4096 + +# distributed settings +TENSOR_PARALLEL_SIZE=1 +PIPELINE_PARALLEL_SIZE=1 +CONTEXT_PARALLEL_SIZE=1 +DATA_PARALLEL_SIZE=$((${NUM_GPUS} / (${TENSOR_PARALLEL_SIZE} * ${PIPELINE_PARALLEL_SIZE}))) + +# training config +MICRO_BATCH_SIZE=4 +GLOBAL_BATCH_SIZE=512 + +LR=3e-4 +MIN_LR=3e-5 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# total number of iterations +# 2072488058295 (number of tokens) / 4096 (seq len) / 512 (batch size) = 988239.316127 -> 988240 +# 988240 + 988240 = 1976480 +LR_WARMUP_STEPS=2000 +LR_DECAY_ITERS=1976480 +TRAIN_STEPS=$(((${LR_WARMUP_STEPS} + ${LR_DECAY_ITERS}) * 2)) + +# model config +TOKENIZER_MODEL=${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model + +CHECKPOINT_ROOT=${EXPERIMENT_DIR}/checkpoints +CHECKPOINT_LOAD_DIR=${CHECKPOINT_ROOT} +CHECKPOINT_SAVE_DIR=${CHECKPOINT_ROOT} + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# data config 
+DATASET_DIR=/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0 +DATASET_V3_1_DIR=/home/shared/corpus/llm-jp-corpus/v3.1.0/tokenize/v3.0b1 + +TRAIN_DATA_PATH="" + +# code stack +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14486363187 ${DATASET_DIR}/train/code/stack_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12799385151 ${DATASET_DIR}/train/code/stack_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17282923545 ${DATASET_DIR}/train/code/stack_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8861329235 ${DATASET_DIR}/train/code/stack_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 6713413649 ${DATASET_DIR}/train/code/stack_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8976432285 ${DATASET_DIR}/train/code/stack_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17961273649 ${DATASET_DIR}/train/code/stack_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12016948303 ${DATASET_DIR}/train/code/stack_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14953094719 ${DATASET_DIR}/train/code/stack_0008.jsonl_text_document" + +# ja cc 1 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 23783124862 ${DATASET_DIR}/train/ja/cc-1_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 36378129564 ${DATASET_DIR}/train/ja/cc-1_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35477545812 ${DATASET_DIR}/train/ja/cc-1_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35917231868 ${DATASET_DIR}/train/ja/cc-1_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 46203062776 ${DATASET_DIR}/train/ja/cc-1_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40396278536 ${DATASET_DIR}/train/ja/cc-1_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 33444216206 ${DATASET_DIR}/train/ja/cc-1_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 32375495374 
${DATASET_DIR}/train/ja/cc-1_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 36068919622 ${DATASET_DIR}/train/ja/cc-1_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 26274952324 ${DATASET_DIR}/train/ja/cc-1_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 24024422756 ${DATASET_DIR}/train/ja/cc-1_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 34590145510 ${DATASET_DIR}/train/ja/cc-1_0011.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 29567301906 ${DATASET_DIR}/train/ja/cc-1_0012.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 26690562242 ${DATASET_DIR}/train/ja/cc-1_0013.jsonl_text_document" + +# ja cc 2 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35813749376 ${DATASET_DIR}/train/ja/cc-2_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40034668924 ${DATASET_DIR}/train/ja/cc-2_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 31191828858 ${DATASET_DIR}/train/ja/cc-2_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 25086109508 ${DATASET_DIR}/train/ja/cc-2_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18979589830 ${DATASET_DIR}/train/ja/cc-2_0004.jsonl_text_document" + +# ja cc 3 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40987803038 ${DATASET_DIR}/train/ja/cc-3_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 41333549162 ${DATASET_DIR}/train/ja/cc-3_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 29810274406 ${DATASET_DIR}/train/ja/cc-3_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 22787733940 ${DATASET_DIR}/train/ja/cc-3_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15544493906 ${DATASET_DIR}/train/ja/cc-3_0004.jsonl_text_document" + +# ja kaken +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1826105478 ${DATASET_DIR}/train/ja/kaken_0000.jsonl_text_document" + +# ja warp html +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1329440698 ${DATASET_DIR}/train/ja/warp-html-01-06_0000.jsonl_text_document" 
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1397268214 ${DATASET_DIR}/train/ja/warp-html-07-12_0000.jsonl_text_document" + +# ja warp pdf +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 30149711608 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e00_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 30023232706 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e00_0001.jsonl_text_document" + +# ja warp pdf 0.2 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15396388677 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13225220331 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12433511477 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14722870558 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14818300138 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14827819309 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13394854115 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14369730518 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14027593174 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14719994730 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 9865165774 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14525215128 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0011.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 10835111330 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0012.jsonl_text_document" + +# ja wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 2563804308 
${DATASET_DIR}/train/ja/wiki_0000.jsonl_text_document" + +# en dolma books +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 5494262694 ${DATASET_DIR}/train/en/dolma-books_0000.jsonl_text_document" + +# en dolma c4 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17052861266 ${DATASET_DIR}/train/en/dolma-c4_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17051260422 ${DATASET_DIR}/train/en/dolma-c4_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17056648148 ${DATASET_DIR}/train/en/dolma-c4_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17057773049 ${DATASET_DIR}/train/en/dolma-c4_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17047888113 ${DATASET_DIR}/train/en/dolma-c4_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17046511755 ${DATASET_DIR}/train/en/dolma-c4_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17058086815 ${DATASET_DIR}/train/en/dolma-c4_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17049490900 ${DATASET_DIR}/train/en/dolma-c4_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17051009552 ${DATASET_DIR}/train/en/dolma-c4_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14932405246 ${DATASET_DIR}/train/en/dolma-c4_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13142696712 ${DATASET_DIR}/train/en/dolma-c4_0010.jsonl_text_document" + +# en dolma cc +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15473522696 ${DATASET_DIR}/train/en/dolma-cc-head_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15767913273 ${DATASET_DIR}/train/en/dolma-cc-head_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16664785078 ${DATASET_DIR}/train/en/dolma-cc-head_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16860035920 ${DATASET_DIR}/train/en/dolma-cc-head_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17197613512 ${DATASET_DIR}/train/en/dolma-cc-head_0004.jsonl_text_document" 
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16363353173 ${DATASET_DIR}/train/en/dolma-cc-head_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15303692924 ${DATASET_DIR}/train/en/dolma-cc-head_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15766283829 ${DATASET_DIR}/train/en/dolma-cc-head_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13483997219 ${DATASET_DIR}/train/en/dolma-cc-head_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12561851173 ${DATASET_DIR}/train/en/dolma-cc-head_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14206017429 ${DATASET_DIR}/train/en/dolma-cc-head_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18455249471 ${DATASET_DIR}/train/en/dolma-cc-head_0011.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18359243399 ${DATASET_DIR}/train/en/dolma-cc-head_0012.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16268609444 ${DATASET_DIR}/train/en/dolma-cc-head_0013.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15209913539 ${DATASET_DIR}/train/en/dolma-cc-head_0014.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15601099503 ${DATASET_DIR}/train/en/dolma-cc-head_0015.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16354139164 ${DATASET_DIR}/train/en/dolma-cc-head_0016.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19563123039 ${DATASET_DIR}/train/en/dolma-cc-head_0017.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17794386584 ${DATASET_DIR}/train/en/dolma-cc-head_0018.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17974377563 ${DATASET_DIR}/train/en/dolma-cc-head_0019.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19152181306 ${DATASET_DIR}/train/en/dolma-cc-head_0020.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16841018460 ${DATASET_DIR}/train/en/dolma-cc-head_0021.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15622566364 
${DATASET_DIR}/train/en/dolma-cc-head_0022.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14998264524 ${DATASET_DIR}/train/en/dolma-cc-head_0023.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19994706100 ${DATASET_DIR}/train/en/dolma-cc-head_0024.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19266785326 ${DATASET_DIR}/train/en/dolma-cc-head_0025.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17797970694 ${DATASET_DIR}/train/en/dolma-cc-head_0026.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18662607705 ${DATASET_DIR}/train/en/dolma-cc-head_0027.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18428148263 ${DATASET_DIR}/train/en/dolma-cc-head_0028.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19152709797 ${DATASET_DIR}/train/en/dolma-cc-head_0029.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19567672702 ${DATASET_DIR}/train/en/dolma-cc-head_0030.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15453203385 ${DATASET_DIR}/train/en/dolma-cc-head_0031.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16946844380 ${DATASET_DIR}/train/en/dolma-cc-head_0032.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16719501611 ${DATASET_DIR}/train/en/dolma-cc-head_0033.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16348054343 ${DATASET_DIR}/train/en/dolma-cc-head_0034.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18292316049 ${DATASET_DIR}/train/en/dolma-cc-head_0035.jsonl_text_document" + +# en dolma science paper +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8089227423 ${DATASET_DIR}/train/en/dolma-pes2o_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 20185217235 ${DATASET_DIR}/train/en/dolma-pes2o_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18622836173 ${DATASET_DIR}/train/en/dolma-pes2o_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15956491971 
${DATASET_DIR}/train/en/dolma-pes2o_0003.jsonl_text_document" + +# en dolma reddit +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17412289508 ${DATASET_DIR}/train/en/dolma-reddit_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17315996345 ${DATASET_DIR}/train/en/dolma-reddit_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17095921975 ${DATASET_DIR}/train/en/dolma-reddit_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15808400388 ${DATASET_DIR}/train/en/dolma-reddit_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15425532535 ${DATASET_DIR}/train/en/dolma-reddit_0004.jsonl_text_document" + +# en dolma wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 3896965449 ${DATASET_DIR}/train/en/dolma-wiki_0000.jsonl_text_document" + +# en wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 4744259830 ${DATASET_DIR}/train/en/wiki_0000.jsonl_text_document" + +# zh wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 840277331 ${DATASET_DIR}/train/zh/wiki_0000.jsonl_text_document" + +# ko wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 316296219 ${DATASET_DIR}/train/ko/wiki_0000.jsonl_text_document" + +# job name +WANDB_ENTITY="llm-jp" +WANDB_PROJECT="nii-geniac-1.7B" +WANDB_JOB="train-exp2" + +# run +export NVTE_FUSED_ATTN=0 +python ${ENV_DIR}/src/Megatron-LM/pretrain_gpt.py \ + --tensor-model-parallel-size ${TENSOR_PARALLEL_SIZE} \ + --pipeline-model-parallel-size ${PIPELINE_PARALLEL_SIZE} \ + --context-parallel-size ${CONTEXT_PARALLEL_SIZE} \ + --sequence-parallel \ + --use-distributed-optimizer \ + --num-layers ${NUM_LAYERS} \ + --hidden-size ${HIDDEN_SIZE} \ + --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + --num-attention-heads ${NUM_HEADS} \ + --seq-length ${SEQ_LENGTH} \ + --max-position-embeddings ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --train-iters ${TRAIN_STEPS} \ + --tokenizer-type Llama2Tokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --load ${CHECKPOINT_LOAD_DIR} \ + --save 
${CHECKPOINT_SAVE_DIR} \ + --finetune \ + --data-path ${TRAIN_DATA_PATH} \ + --split 1,0,0 \ + --data-cache-path ${CACHE_DIR} \ + --distributed-backend nccl \ + --init-method-std 0.02 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --override-opt_param-scheduler \ + --lr-decay-style cosine \ + --lr-decay-iters ${LR_DECAY_ITERS} \ + --weight-decay ${WEIGHT_DECAY} \ + --clip-grad ${GRAD_CLIP} \ + --lr-warmup-iters ${LR_WARMUP_STEPS} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --log-interval 1 \ + --eval-interval ${TRAIN_STEPS} \ + --eval-iters 0 \ + --bf16 \ + --untie-embeddings-and-output-weights \ + --position-embedding-type rope \ + --disable-bias-linear \ + --use-mcore-models \ + --normalization RMSNorm \ + --norm-epsilon 1e-5 \ + --no-masked-softmax-fusion \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --swiglu \ + --use-flash-attn \ + --recompute-activations \ + --recompute-granularity "selective" \ + --attention-softmax-in-fp32 \ + --transformer-impl "transformer_engine" \ + --use-mpi \ + --use-z-loss \ + --log-throughput \ + --wandb-entity ${WANDB_ENTITY} \ + --wandb-project ${WANDB_PROJECT} \ + --wandb-name ${WANDB_JOB} \ From 14acaccec4bb0649547b1a5e2536f84ef337a41a Mon Sep 17 00:00:00 2001 From: Taka008 Date: Fri, 13 Sep 2024 21:59:10 +0900 Subject: [PATCH 02/11] fix LR_DECAY_ITERS --- .../scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh index 12c460f..2a9f361 100644 --- a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh +++ b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh @@ -53,7 +53,7 @@ GRAD_CLIP=1 # 2072488058295 (number of tokens) / 4096 (seq len) / 512 (batch size) = 988239.316127 -> 988240 # 988240 + 988240 = 1976480 LR_WARMUP_STEPS=2000 
-LR_DECAY_ITERS=1976480 +LR_DECAY_ITERS=988240 TRAIN_STEPS=$(((${LR_WARMUP_STEPS} + ${LR_DECAY_ITERS}) * 2)) # model config From 40bc6c19adc0fbdc6f23e4399a34aa16c220b92c Mon Sep 17 00:00:00 2001 From: Taka008 Date: Fri, 13 Sep 2024 22:24:30 +0900 Subject: [PATCH 03/11] fix TRAIN_STEPS --- .../v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh index 2a9f361..f453b33 100644 --- a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh +++ b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh @@ -51,10 +51,9 @@ GRAD_CLIP=1 # total number of iterations # 2072488058295 (number of tokens) / 4096 (seq len) / 512 (batch size) = 988239.316127 -> 988240 -# 988240 + 988240 = 1976480 LR_WARMUP_STEPS=2000 LR_DECAY_ITERS=988240 -TRAIN_STEPS=$(((${LR_WARMUP_STEPS} + ${LR_DECAY_ITERS}) * 2)) +TRAIN_STEPS=$((${LR_WARMUP_STEPS} + ${LR_DECAY_ITERS})) # model config TOKENIZER_MODEL=${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model From c4e7eb87790ce4562065c3229286b02d7564aea9 Mon Sep 17 00:00:00 2001 From: Taka008 Date: Fri, 13 Sep 2024 22:37:15 +0900 Subject: [PATCH 04/11] fix LR_DECAY_ITERS --- .../scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh index f453b33..ddad45f 100644 --- a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh +++ b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh @@ -52,7 +52,7 @@ GRAD_CLIP=1 # total number of iterations # 2072488058295 (number of tokens) / 4096 (seq len) / 512 (batch size) = 988239.316127 -> 988240 LR_WARMUP_STEPS=2000 
-LR_DECAY_ITERS=988240 +LR_DECAY_ITERS=986240 TRAIN_STEPS=$((${LR_WARMUP_STEPS} + ${LR_DECAY_ITERS})) # model config From 9edb39616e238cd25f25654039c1115beb61978c Mon Sep 17 00:00:00 2001 From: Taka008 Date: Wed, 18 Sep 2024 14:53:16 +0900 Subject: [PATCH 05/11] add scripts for exp4 --- .../exp4/sbatch.sh | 41 +++ .../exp4/train.sh | 290 ++++++++++++++++++ 2 files changed, 331 insertions(+) create mode 100644 pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp4/sbatch.sh create mode 100644 pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp4/train.sh diff --git a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp4/sbatch.sh b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp4/sbatch.sh new file mode 100644 index 0000000..5e7f6f6 --- /dev/null +++ b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp4/sbatch.sh @@ -0,0 +1,41 @@ +#!/bin/bash +#SBATCH --job-name=0038_train +#SBATCH --partition=gpu-small +#SBATCH --nodes=4 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --output=outputs/%x-%j.out +#SBATCH --error=outputs/%x-%j.err + +set -eu -o pipefail + +EXPERIMENT_DIR=/home/shared/experiments/0038_cpt-lr-scheduling +ENV_DIR=${EXPERIMENT_DIR}/environment + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) +export MASTER_PORT=$((10000 + ($SLURM_JOBID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1) +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPUS_PER_NODE})) + +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + +mpirun \ + -np $NUM_GPUS \ + --npernode $NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + -x EXPERIMENT_DIR=$EXPERIMENT_DIR \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -x NUM_NODES=$NUM_NODES \ + -x NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE 
\ + bash scripts/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp4/train.sh diff --git a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp4/train.sh b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp4/train.sh new file mode 100644 index 0000000..7a601fc --- /dev/null +++ b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp4/train.sh @@ -0,0 +1,290 @@ +#!/bin/bash + +# For details about the model, see: +# https://github.com/llm-jp/model-cards/pull/23 + +set -eu -o pipefail + +# EXPERIMENT_DIR= # set by sbatch +ENV_DIR=${EXPERIMENT_DIR}/environment +CACHE_DIR=${EXPERIMENT_DIR}/cache + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/scripts/mpi_variables.sh +source ${ENV_DIR}/venv/bin/activate + +# open file limit +ulimit -n 65536 1048576 + +export LOGLEVEL=INFO +export NCCL_DEBUG=WARN +export NCCL_DEBUG_SUBSYS=WARN +export PYTHONFAULTHANDLER=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export CUDA_LAUNCH_BLOCKING=0 +export CUDNN_LOGDEST_DBG=stderr +export CUDNN_LOGERR_DBG=1 + +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPUS_PER_NODE})) + +# model config +HIDDEN_SIZE=2048 +FFN_HIDDEN_SIZE=7168 +NUM_LAYERS=24 +NUM_HEADS=16 +SEQ_LENGTH=4096 + +# distributed settings +TENSOR_PARALLEL_SIZE=1 +PIPELINE_PARALLEL_SIZE=1 +CONTEXT_PARALLEL_SIZE=1 +DATA_PARALLEL_SIZE=$((${NUM_GPUS} / (${TENSOR_PARALLEL_SIZE} * ${PIPELINE_PARALLEL_SIZE}))) + +# training config +MICRO_BATCH_SIZE=4 +GLOBAL_BATCH_SIZE=512 + +LR=3e-5 +MIN_LR=3e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# total number of iterations +# 2072488058295 (number of tokens) / 4096 (seq len) / 512 (batch size) = 988239.316127 -> 988240 +LR_WARMUP_STEPS=2000 +LR_DECAY_ITERS=986240 +TRAIN_STEPS=$((${LR_WARMUP_STEPS} + ${LR_DECAY_ITERS})) + +# model config +TOKENIZER_MODEL=${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model + +CHECKPOINT_ROOT=${EXPERIMENT_DIR}/checkpoints/exp4 +CHECKPOINT_LOAD_DIR=${CHECKPOINT_ROOT} +CHECKPOINT_SAVE_DIR=${CHECKPOINT_ROOT} + +mkdir -p 
${CHECKPOINT_SAVE_DIR} + +# data config +DATASET_DIR=/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0 +DATASET_V3_1_DIR=/home/shared/corpus/llm-jp-corpus/v3.1.0/tokenize/v3.0b1 + +TRAIN_DATA_PATH="" + +# code stack +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14486363187 ${DATASET_DIR}/train/code/stack_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12799385151 ${DATASET_DIR}/train/code/stack_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17282923545 ${DATASET_DIR}/train/code/stack_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8861329235 ${DATASET_DIR}/train/code/stack_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 6713413649 ${DATASET_DIR}/train/code/stack_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8976432285 ${DATASET_DIR}/train/code/stack_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17961273649 ${DATASET_DIR}/train/code/stack_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12016948303 ${DATASET_DIR}/train/code/stack_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14953094719 ${DATASET_DIR}/train/code/stack_0008.jsonl_text_document" + +# ja cc 1 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 23783124862 ${DATASET_DIR}/train/ja/cc-1_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 36378129564 ${DATASET_DIR}/train/ja/cc-1_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35477545812 ${DATASET_DIR}/train/ja/cc-1_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35917231868 ${DATASET_DIR}/train/ja/cc-1_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 46203062776 ${DATASET_DIR}/train/ja/cc-1_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40396278536 ${DATASET_DIR}/train/ja/cc-1_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 33444216206 ${DATASET_DIR}/train/ja/cc-1_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 32375495374 
${DATASET_DIR}/train/ja/cc-1_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 36068919622 ${DATASET_DIR}/train/ja/cc-1_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 26274952324 ${DATASET_DIR}/train/ja/cc-1_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 24024422756 ${DATASET_DIR}/train/ja/cc-1_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 34590145510 ${DATASET_DIR}/train/ja/cc-1_0011.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 29567301906 ${DATASET_DIR}/train/ja/cc-1_0012.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 26690562242 ${DATASET_DIR}/train/ja/cc-1_0013.jsonl_text_document" + +# ja cc 2 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35813749376 ${DATASET_DIR}/train/ja/cc-2_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40034668924 ${DATASET_DIR}/train/ja/cc-2_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 31191828858 ${DATASET_DIR}/train/ja/cc-2_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 25086109508 ${DATASET_DIR}/train/ja/cc-2_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18979589830 ${DATASET_DIR}/train/ja/cc-2_0004.jsonl_text_document" + +# ja cc 3 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40987803038 ${DATASET_DIR}/train/ja/cc-3_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 41333549162 ${DATASET_DIR}/train/ja/cc-3_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 29810274406 ${DATASET_DIR}/train/ja/cc-3_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 22787733940 ${DATASET_DIR}/train/ja/cc-3_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15544493906 ${DATASET_DIR}/train/ja/cc-3_0004.jsonl_text_document" + +# ja kaken +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1826105478 ${DATASET_DIR}/train/ja/kaken_0000.jsonl_text_document" + +# ja warp html +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1329440698 ${DATASET_DIR}/train/ja/warp-html-01-06_0000.jsonl_text_document" 
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1397268214 ${DATASET_DIR}/train/ja/warp-html-07-12_0000.jsonl_text_document" + +# ja warp pdf +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 30149711608 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e00_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 30023232706 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e00_0001.jsonl_text_document" + +# ja warp pdf 0.2 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15396388677 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13225220331 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12433511477 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14722870558 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14818300138 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14827819309 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13394854115 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14369730518 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14027593174 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14719994730 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 9865165774 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14525215128 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0011.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 10835111330 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0012.jsonl_text_document" + +# ja wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 2563804308 
${DATASET_DIR}/train/ja/wiki_0000.jsonl_text_document" + +# en dolma books +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 5494262694 ${DATASET_DIR}/train/en/dolma-books_0000.jsonl_text_document" + +# en dolma c4 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17052861266 ${DATASET_DIR}/train/en/dolma-c4_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17051260422 ${DATASET_DIR}/train/en/dolma-c4_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17056648148 ${DATASET_DIR}/train/en/dolma-c4_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17057773049 ${DATASET_DIR}/train/en/dolma-c4_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17047888113 ${DATASET_DIR}/train/en/dolma-c4_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17046511755 ${DATASET_DIR}/train/en/dolma-c4_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17058086815 ${DATASET_DIR}/train/en/dolma-c4_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17049490900 ${DATASET_DIR}/train/en/dolma-c4_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17051009552 ${DATASET_DIR}/train/en/dolma-c4_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14932405246 ${DATASET_DIR}/train/en/dolma-c4_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13142696712 ${DATASET_DIR}/train/en/dolma-c4_0010.jsonl_text_document" + +# en dolma cc +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15473522696 ${DATASET_DIR}/train/en/dolma-cc-head_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15767913273 ${DATASET_DIR}/train/en/dolma-cc-head_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16664785078 ${DATASET_DIR}/train/en/dolma-cc-head_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16860035920 ${DATASET_DIR}/train/en/dolma-cc-head_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17197613512 ${DATASET_DIR}/train/en/dolma-cc-head_0004.jsonl_text_document" 
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16363353173 ${DATASET_DIR}/train/en/dolma-cc-head_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15303692924 ${DATASET_DIR}/train/en/dolma-cc-head_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15766283829 ${DATASET_DIR}/train/en/dolma-cc-head_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13483997219 ${DATASET_DIR}/train/en/dolma-cc-head_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12561851173 ${DATASET_DIR}/train/en/dolma-cc-head_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14206017429 ${DATASET_DIR}/train/en/dolma-cc-head_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18455249471 ${DATASET_DIR}/train/en/dolma-cc-head_0011.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18359243399 ${DATASET_DIR}/train/en/dolma-cc-head_0012.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16268609444 ${DATASET_DIR}/train/en/dolma-cc-head_0013.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15209913539 ${DATASET_DIR}/train/en/dolma-cc-head_0014.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15601099503 ${DATASET_DIR}/train/en/dolma-cc-head_0015.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16354139164 ${DATASET_DIR}/train/en/dolma-cc-head_0016.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19563123039 ${DATASET_DIR}/train/en/dolma-cc-head_0017.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17794386584 ${DATASET_DIR}/train/en/dolma-cc-head_0018.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17974377563 ${DATASET_DIR}/train/en/dolma-cc-head_0019.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19152181306 ${DATASET_DIR}/train/en/dolma-cc-head_0020.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16841018460 ${DATASET_DIR}/train/en/dolma-cc-head_0021.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15622566364 
${DATASET_DIR}/train/en/dolma-cc-head_0022.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14998264524 ${DATASET_DIR}/train/en/dolma-cc-head_0023.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19994706100 ${DATASET_DIR}/train/en/dolma-cc-head_0024.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19266785326 ${DATASET_DIR}/train/en/dolma-cc-head_0025.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17797970694 ${DATASET_DIR}/train/en/dolma-cc-head_0026.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18662607705 ${DATASET_DIR}/train/en/dolma-cc-head_0027.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18428148263 ${DATASET_DIR}/train/en/dolma-cc-head_0028.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19152709797 ${DATASET_DIR}/train/en/dolma-cc-head_0029.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19567672702 ${DATASET_DIR}/train/en/dolma-cc-head_0030.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15453203385 ${DATASET_DIR}/train/en/dolma-cc-head_0031.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16946844380 ${DATASET_DIR}/train/en/dolma-cc-head_0032.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16719501611 ${DATASET_DIR}/train/en/dolma-cc-head_0033.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16348054343 ${DATASET_DIR}/train/en/dolma-cc-head_0034.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18292316049 ${DATASET_DIR}/train/en/dolma-cc-head_0035.jsonl_text_document" + +# en dolma science paper +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8089227423 ${DATASET_DIR}/train/en/dolma-pes2o_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 20185217235 ${DATASET_DIR}/train/en/dolma-pes2o_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18622836173 ${DATASET_DIR}/train/en/dolma-pes2o_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15956491971 
${DATASET_DIR}/train/en/dolma-pes2o_0003.jsonl_text_document" + +# en dolma reddit +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17412289508 ${DATASET_DIR}/train/en/dolma-reddit_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17315996345 ${DATASET_DIR}/train/en/dolma-reddit_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17095921975 ${DATASET_DIR}/train/en/dolma-reddit_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15808400388 ${DATASET_DIR}/train/en/dolma-reddit_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15425532535 ${DATASET_DIR}/train/en/dolma-reddit_0004.jsonl_text_document" + +# en dolma wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 3896965449 ${DATASET_DIR}/train/en/dolma-wiki_0000.jsonl_text_document" + +# en wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 4744259830 ${DATASET_DIR}/train/en/wiki_0000.jsonl_text_document" + +# zh wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 840277331 ${DATASET_DIR}/train/zh/wiki_0000.jsonl_text_document" + +# ko wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 316296219 ${DATASET_DIR}/train/ko/wiki_0000.jsonl_text_document" + +# job name +WANDB_ENTITY="llm-jp" +WANDB_PROJECT="nii-geniac-1.7B" +WANDB_JOB="train-exp4" + +# run +export NVTE_FUSED_ATTN=0 +python ${ENV_DIR}/src/Megatron-LM/pretrain_gpt.py \ + --tensor-model-parallel-size ${TENSOR_PARALLEL_SIZE} \ + --pipeline-model-parallel-size ${PIPELINE_PARALLEL_SIZE} \ + --context-parallel-size ${CONTEXT_PARALLEL_SIZE} \ + --sequence-parallel \ + --use-distributed-optimizer \ + --num-layers ${NUM_LAYERS} \ + --hidden-size ${HIDDEN_SIZE} \ + --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + --num-attention-heads ${NUM_HEADS} \ + --seq-length ${SEQ_LENGTH} \ + --max-position-embeddings ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --train-iters ${TRAIN_STEPS} \ + --tokenizer-type Llama2Tokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --load ${CHECKPOINT_LOAD_DIR} \ + --save 
${CHECKPOINT_SAVE_DIR} \ + --finetune \ + --data-path ${TRAIN_DATA_PATH} \ + --split 1,0,0 \ + --data-cache-path ${CACHE_DIR} \ + --distributed-backend nccl \ + --init-method-std 0.02 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --override-opt_param-scheduler \ + --lr-decay-style cosine \ + --lr-decay-iters ${LR_DECAY_ITERS} \ + --weight-decay ${WEIGHT_DECAY} \ + --clip-grad ${GRAD_CLIP} \ + --lr-warmup-iters ${LR_WARMUP_STEPS} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --log-interval 1 \ + --eval-interval ${TRAIN_STEPS} \ + --eval-iters 0 \ + --bf16 \ + --untie-embeddings-and-output-weights \ + --position-embedding-type rope \ + --disable-bias-linear \ + --use-mcore-models \ + --normalization RMSNorm \ + --norm-epsilon 1e-5 \ + --no-masked-softmax-fusion \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --swiglu \ + --use-flash-attn \ + --recompute-activations \ + --recompute-granularity "selective" \ + --attention-softmax-in-fp32 \ + --transformer-impl "transformer_engine" \ + --use-mpi \ + --use-z-loss \ + --log-throughput \ + --wandb-entity ${WANDB_ENTITY} \ + --wandb-project ${WANDB_PROJECT} \ + --wandb-name ${WANDB_JOB} \ From 8a7e4ccb7e3f867f7faec1f15203cb1ac0a50909 Mon Sep 17 00:00:00 2001 From: Taka008 Date: Wed, 18 Sep 2024 21:44:32 +0900 Subject: [PATCH 06/11] add scripts for exp5 --- .../exp5/sbatch.sh | 41 +++ .../exp5/train.sh | 290 ++++++++++++++++++ 2 files changed, 331 insertions(+) create mode 100644 pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp5/sbatch.sh create mode 100644 pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp5/train.sh diff --git a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp5/sbatch.sh b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp5/sbatch.sh new file mode 100644 index 0000000..9d635aa --- /dev/null +++ b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp5/sbatch.sh @@ -0,0 +1,41 @@ +#!/bin/bash +#SBATCH --job-name=0038_train 
+#SBATCH --partition=gpu-small +#SBATCH --nodes=4 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --output=outputs/%x-%j.out +#SBATCH --error=outputs/%x-%j.err + +set -eu -o pipefail + +EXPERIMENT_DIR=/home/shared/experiments/0038_cpt-lr-scheduling +ENV_DIR=${EXPERIMENT_DIR}/environment + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) # first hostname printed for the job node list +export MASTER_PORT=$((10000 + ($SLURM_JOBID % 50000))) # always within 10000-59999 + +echo "MASTER_ADDR=${MASTER_ADDR}" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1) # keep only the text before the first "(" +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPUS_PER_NODE})) + +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + +mpirun \ + -np $NUM_GPUS \ + --npernode $NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + -x EXPERIMENT_DIR=$EXPERIMENT_DIR \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -x NUM_NODES=$NUM_NODES \ + -x NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE \ + bash scripts/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp5/train.sh diff --git a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp5/train.sh b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp5/train.sh new file mode 100644 index 0000000..ba3e3be --- /dev/null +++ b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp5/train.sh @@ -0,0 +1,290 @@ +#!/bin/bash + +# For details about the model, see: +# https://github.com/llm-jp/model-cards/pull/23 + +set -eu -o pipefail + +# EXPERIMENT_DIR= # set by sbatch +ENV_DIR=${EXPERIMENT_DIR}/environment +CACHE_DIR=${EXPERIMENT_DIR}/cache + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/scripts/mpi_variables.sh +source ${ENV_DIR}/venv/bin/activate + +# open file limit (the ulimit builtin takes one value per resource; with neither -S nor -H it sets both the soft and the hard limit) +ulimit -n 65536 + +export LOGLEVEL=INFO +export NCCL_DEBUG=WARN +export NCCL_DEBUG_SUBSYS=WARN +export PYTHONFAULTHANDLER=1 +export
CUDA_DEVICE_MAX_CONNECTIONS=1 +export CUDA_LAUNCH_BLOCKING=0 +export CUDNN_LOGDEST_DBG=stderr +export CUDNN_LOGERR_DBG=1 + +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPUS_PER_NODE})) + +# model config +HIDDEN_SIZE=2048 +FFN_HIDDEN_SIZE=7168 +NUM_LAYERS=24 +NUM_HEADS=16 +SEQ_LENGTH=4096 + +# distributed settings +TENSOR_PARALLEL_SIZE=1 +PIPELINE_PARALLEL_SIZE=1 +CONTEXT_PARALLEL_SIZE=1 +DATA_PARALLEL_SIZE=$((${NUM_GPUS} / (${TENSOR_PARALLEL_SIZE} * ${PIPELINE_PARALLEL_SIZE}))) # not referenced again in this script + +# training config +MICRO_BATCH_SIZE=4 +GLOBAL_BATCH_SIZE=512 + +LR=3e-5 +MIN_LR=3e-6 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# total number of iterations +# 2072488058295 (number of tokens) / 4096 (seq len) / 512 (batch size) = 988239.316127 -> 988240 +LR_WARMUP_STEPS=0 # no warmup iterations in this experiment +LR_DECAY_ITERS=988240 +TRAIN_STEPS=$((${LR_WARMUP_STEPS} + ${LR_DECAY_ITERS})) + +# tokenizer and checkpoint config +TOKENIZER_MODEL=${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model + +CHECKPOINT_ROOT=${EXPERIMENT_DIR}/checkpoints/exp5 +CHECKPOINT_LOAD_DIR=${CHECKPOINT_ROOT} # checkpoints are loaded from and saved to the same directory +CHECKPOINT_SAVE_DIR=${CHECKPOINT_ROOT} + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# data config: TRAIN_DATA_PATH accumulates space-separated "NUMBER PATH" pairs (the numbers are presumably per-shard token counts used as blend weights; TODO confirm) +DATASET_DIR=/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0 +DATASET_V3_1_DIR=/home/shared/corpus/llm-jp-corpus/v3.1.0/tokenize/v3.0b1 + +TRAIN_DATA_PATH="" + +# code stack +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14486363187 ${DATASET_DIR}/train/code/stack_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12799385151 ${DATASET_DIR}/train/code/stack_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17282923545 ${DATASET_DIR}/train/code/stack_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8861329235 ${DATASET_DIR}/train/code/stack_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 6713413649 ${DATASET_DIR}/train/code/stack_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8976432285 ${DATASET_DIR}/train/code/stack_0005.jsonl_text_document"
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17961273649 ${DATASET_DIR}/train/code/stack_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12016948303 ${DATASET_DIR}/train/code/stack_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14953094719 ${DATASET_DIR}/train/code/stack_0008.jsonl_text_document" + +# ja cc 1 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 23783124862 ${DATASET_DIR}/train/ja/cc-1_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 36378129564 ${DATASET_DIR}/train/ja/cc-1_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35477545812 ${DATASET_DIR}/train/ja/cc-1_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35917231868 ${DATASET_DIR}/train/ja/cc-1_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 46203062776 ${DATASET_DIR}/train/ja/cc-1_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40396278536 ${DATASET_DIR}/train/ja/cc-1_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 33444216206 ${DATASET_DIR}/train/ja/cc-1_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 32375495374 ${DATASET_DIR}/train/ja/cc-1_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 36068919622 ${DATASET_DIR}/train/ja/cc-1_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 26274952324 ${DATASET_DIR}/train/ja/cc-1_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 24024422756 ${DATASET_DIR}/train/ja/cc-1_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 34590145510 ${DATASET_DIR}/train/ja/cc-1_0011.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 29567301906 ${DATASET_DIR}/train/ja/cc-1_0012.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 26690562242 ${DATASET_DIR}/train/ja/cc-1_0013.jsonl_text_document" + +# ja cc 2 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35813749376 ${DATASET_DIR}/train/ja/cc-2_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40034668924 
${DATASET_DIR}/train/ja/cc-2_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 31191828858 ${DATASET_DIR}/train/ja/cc-2_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 25086109508 ${DATASET_DIR}/train/ja/cc-2_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18979589830 ${DATASET_DIR}/train/ja/cc-2_0004.jsonl_text_document" + +# ja cc 3 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40987803038 ${DATASET_DIR}/train/ja/cc-3_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 41333549162 ${DATASET_DIR}/train/ja/cc-3_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 29810274406 ${DATASET_DIR}/train/ja/cc-3_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 22787733940 ${DATASET_DIR}/train/ja/cc-3_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15544493906 ${DATASET_DIR}/train/ja/cc-3_0004.jsonl_text_document" + +# ja kaken +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1826105478 ${DATASET_DIR}/train/ja/kaken_0000.jsonl_text_document" + +# ja warp html +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1329440698 ${DATASET_DIR}/train/ja/warp-html-01-06_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1397268214 ${DATASET_DIR}/train/ja/warp-html-07-12_0000.jsonl_text_document" + +# ja warp pdf +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 30149711608 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e00_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 30023232706 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e00_0001.jsonl_text_document" + +# ja warp pdf 0.2 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15396388677 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13225220331 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12433511477 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14722870558 
${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14818300138 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14827819309 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13394854115 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14369730518 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14027593174 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14719994730 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 9865165774 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14525215128 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0011.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 10835111330 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0012.jsonl_text_document" + +# ja wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 2563804308 ${DATASET_DIR}/train/ja/wiki_0000.jsonl_text_document" + +# en dolma books +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 5494262694 ${DATASET_DIR}/train/en/dolma-books_0000.jsonl_text_document" + +# en dolma c4 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17052861266 ${DATASET_DIR}/train/en/dolma-c4_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17051260422 ${DATASET_DIR}/train/en/dolma-c4_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17056648148 ${DATASET_DIR}/train/en/dolma-c4_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17057773049 ${DATASET_DIR}/train/en/dolma-c4_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17047888113 ${DATASET_DIR}/train/en/dolma-c4_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17046511755 
${DATASET_DIR}/train/en/dolma-c4_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17058086815 ${DATASET_DIR}/train/en/dolma-c4_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17049490900 ${DATASET_DIR}/train/en/dolma-c4_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17051009552 ${DATASET_DIR}/train/en/dolma-c4_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14932405246 ${DATASET_DIR}/train/en/dolma-c4_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13142696712 ${DATASET_DIR}/train/en/dolma-c4_0010.jsonl_text_document" + +# en dolma cc +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15473522696 ${DATASET_DIR}/train/en/dolma-cc-head_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15767913273 ${DATASET_DIR}/train/en/dolma-cc-head_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16664785078 ${DATASET_DIR}/train/en/dolma-cc-head_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16860035920 ${DATASET_DIR}/train/en/dolma-cc-head_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17197613512 ${DATASET_DIR}/train/en/dolma-cc-head_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16363353173 ${DATASET_DIR}/train/en/dolma-cc-head_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15303692924 ${DATASET_DIR}/train/en/dolma-cc-head_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15766283829 ${DATASET_DIR}/train/en/dolma-cc-head_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13483997219 ${DATASET_DIR}/train/en/dolma-cc-head_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12561851173 ${DATASET_DIR}/train/en/dolma-cc-head_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14206017429 ${DATASET_DIR}/train/en/dolma-cc-head_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18455249471 ${DATASET_DIR}/train/en/dolma-cc-head_0011.jsonl_text_document" 
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18359243399 ${DATASET_DIR}/train/en/dolma-cc-head_0012.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16268609444 ${DATASET_DIR}/train/en/dolma-cc-head_0013.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15209913539 ${DATASET_DIR}/train/en/dolma-cc-head_0014.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15601099503 ${DATASET_DIR}/train/en/dolma-cc-head_0015.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16354139164 ${DATASET_DIR}/train/en/dolma-cc-head_0016.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19563123039 ${DATASET_DIR}/train/en/dolma-cc-head_0017.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17794386584 ${DATASET_DIR}/train/en/dolma-cc-head_0018.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17974377563 ${DATASET_DIR}/train/en/dolma-cc-head_0019.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19152181306 ${DATASET_DIR}/train/en/dolma-cc-head_0020.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16841018460 ${DATASET_DIR}/train/en/dolma-cc-head_0021.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15622566364 ${DATASET_DIR}/train/en/dolma-cc-head_0022.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14998264524 ${DATASET_DIR}/train/en/dolma-cc-head_0023.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19994706100 ${DATASET_DIR}/train/en/dolma-cc-head_0024.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19266785326 ${DATASET_DIR}/train/en/dolma-cc-head_0025.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17797970694 ${DATASET_DIR}/train/en/dolma-cc-head_0026.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18662607705 ${DATASET_DIR}/train/en/dolma-cc-head_0027.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18428148263 ${DATASET_DIR}/train/en/dolma-cc-head_0028.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19152709797 
${DATASET_DIR}/train/en/dolma-cc-head_0029.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19567672702 ${DATASET_DIR}/train/en/dolma-cc-head_0030.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15453203385 ${DATASET_DIR}/train/en/dolma-cc-head_0031.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16946844380 ${DATASET_DIR}/train/en/dolma-cc-head_0032.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16719501611 ${DATASET_DIR}/train/en/dolma-cc-head_0033.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16348054343 ${DATASET_DIR}/train/en/dolma-cc-head_0034.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18292316049 ${DATASET_DIR}/train/en/dolma-cc-head_0035.jsonl_text_document" + +# en dolma science paper +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8089227423 ${DATASET_DIR}/train/en/dolma-pes2o_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 20185217235 ${DATASET_DIR}/train/en/dolma-pes2o_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18622836173 ${DATASET_DIR}/train/en/dolma-pes2o_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15956491971 ${DATASET_DIR}/train/en/dolma-pes2o_0003.jsonl_text_document" + +# en dolma reddit +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17412289508 ${DATASET_DIR}/train/en/dolma-reddit_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17315996345 ${DATASET_DIR}/train/en/dolma-reddit_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17095921975 ${DATASET_DIR}/train/en/dolma-reddit_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15808400388 ${DATASET_DIR}/train/en/dolma-reddit_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15425532535 ${DATASET_DIR}/train/en/dolma-reddit_0004.jsonl_text_document" + +# en dolma wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 3896965449 ${DATASET_DIR}/train/en/dolma-wiki_0000.jsonl_text_document" + +# en wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 4744259830 
${DATASET_DIR}/train/en/wiki_0000.jsonl_text_document" + +# zh wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 840277331 ${DATASET_DIR}/train/zh/wiki_0000.jsonl_text_document" + +# ko wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 316296219 ${DATASET_DIR}/train/ko/wiki_0000.jsonl_text_document" + +# wandb run identification (entity, project, run name) +WANDB_ENTITY="llm-jp" +WANDB_PROJECT="nii-geniac-1.7B" +WANDB_JOB="train-exp5" + +# run Megatron-LM GPT pretraining; TRAIN_DATA_PATH is expanded unquoted, so the shell splits it into separate words after --data-path +export NVTE_FUSED_ATTN=0 # presumably turns off Transformer Engine fused attention; TODO confirm +python ${ENV_DIR}/src/Megatron-LM/pretrain_gpt.py \ + --tensor-model-parallel-size ${TENSOR_PARALLEL_SIZE} \ + --pipeline-model-parallel-size ${PIPELINE_PARALLEL_SIZE} \ + --context-parallel-size ${CONTEXT_PARALLEL_SIZE} \ + --sequence-parallel \ + --use-distributed-optimizer \ + --num-layers ${NUM_LAYERS} \ + --hidden-size ${HIDDEN_SIZE} \ + --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + --num-attention-heads ${NUM_HEADS} \ + --seq-length ${SEQ_LENGTH} \ + --max-position-embeddings ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --train-iters ${TRAIN_STEPS} \ + --tokenizer-type Llama2Tokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --load ${CHECKPOINT_LOAD_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --finetune \ + --data-path ${TRAIN_DATA_PATH} \ + --split 1,0,0 \ + --data-cache-path ${CACHE_DIR} \ + --distributed-backend nccl \ + --init-method-std 0.02 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --override-opt_param-scheduler \ + --lr-decay-style cosine \ + --lr-decay-iters ${LR_DECAY_ITERS} \ + --weight-decay ${WEIGHT_DECAY} \ + --clip-grad ${GRAD_CLIP} \ + --lr-warmup-iters ${LR_WARMUP_STEPS} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --log-interval 1 \ + --eval-interval ${TRAIN_STEPS} \ + --eval-iters 0 \ + --bf16 \ + --untie-embeddings-and-output-weights \ + --position-embedding-type rope \ + --disable-bias-linear \ + --use-mcore-models \ + --normalization RMSNorm \ + --norm-epsilon 1e-5 \ + --no-masked-softmax-fusion \ + --attention-dropout 0.0 \ +
--hidden-dropout 0.0 \ + --swiglu \ + --use-flash-attn \ + --recompute-activations \ + --recompute-granularity "selective" \ + --attention-softmax-in-fp32 \ + --transformer-impl "transformer_engine" \ + --use-mpi \ + --use-z-loss \ + --log-throughput \ + --wandb-entity ${WANDB_ENTITY} \ + --wandb-project ${WANDB_PROJECT} \ + --wandb-name ${WANDB_JOB} \ From b0ceda5ce549c6b52997f3498aaf6521b40a2700 Mon Sep 17 00:00:00 2001 From: Taka008 Date: Thu, 19 Sep 2024 10:05:04 +0900 Subject: [PATCH 07/11] change checkpoint directory for exp2 --- .../scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh index ddad45f..b561f1b 100644 --- a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh +++ b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh @@ -58,7 +58,7 @@ TRAIN_STEPS=$((${LR_WARMUP_STEPS} + ${LR_DECAY_ITERS})) # model config TOKENIZER_MODEL=${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model -CHECKPOINT_ROOT=${EXPERIMENT_DIR}/checkpoints +CHECKPOINT_ROOT=${EXPERIMENT_DIR}/checkpoints/exp2 CHECKPOINT_LOAD_DIR=${CHECKPOINT_ROOT} CHECKPOINT_SAVE_DIR=${CHECKPOINT_ROOT} From c43041267e12268215878358b6aa5056c930c985 Mon Sep 17 00:00:00 2001 From: Taka008 Date: Thu, 19 Sep 2024 10:19:39 +0900 Subject: [PATCH 08/11] change checkpoint directory for exp2 --- .../scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh index b561f1b..ddad45f 100644 --- a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh +++ b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh @@ 
-58,7 +58,7 @@ TRAIN_STEPS=$((${LR_WARMUP_STEPS} + ${LR_DECAY_ITERS})) # model config TOKENIZER_MODEL=${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model -CHECKPOINT_ROOT=${EXPERIMENT_DIR}/checkpoints/exp2 +CHECKPOINT_ROOT=${EXPERIMENT_DIR}/checkpoints CHECKPOINT_LOAD_DIR=${CHECKPOINT_ROOT} CHECKPOINT_SAVE_DIR=${CHECKPOINT_ROOT} From 291c87585fe69b9ec9cf3cf735ed316824c8099d Mon Sep 17 00:00:00 2001 From: Taka008 Date: Wed, 27 Nov 2024 11:53:59 +0900 Subject: [PATCH 09/11] add training scripts for cpt-lr-scheduling exp6 --- .../exp6/sbatch.sh | 41 +++ .../exp6/train.sh | 290 ++++++++++++++++++ 2 files changed, 331 insertions(+) create mode 100644 pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp6/sbatch.sh create mode 100644 pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp6/train.sh diff --git a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp6/sbatch.sh b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp6/sbatch.sh new file mode 100644 index 0000000..2f2a6b4 --- /dev/null +++ b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp6/sbatch.sh @@ -0,0 +1,41 @@ +#!/bin/bash +#SBATCH --job-name=0038_train +#SBATCH --partition=gpu-small +#SBATCH --nodes=4 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --output=outputs/%x-%j.out +#SBATCH --error=outputs/%x-%j.err + +set -eu -o pipefail # stop on errors, unset variables and failed pipeline stages + +EXPERIMENT_DIR=/home/shared/experiments/0038_cpt-lr-scheduling +ENV_DIR=${EXPERIMENT_DIR}/environment + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) # first hostname printed for the job node list +export MASTER_PORT=$((10000 + ($SLURM_JOBID % 50000))) # always within 10000-59999 + +echo "MASTER_ADDR=${MASTER_ADDR}" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1) # keep only the text before the first "(" +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPUS_PER_NODE})) + +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS
+ +mpirun \ + -np $NUM_GPUS \ + --npernode $NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + -x EXPERIMENT_DIR=$EXPERIMENT_DIR \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -x NUM_NODES=$NUM_NODES \ + -x NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE \ + bash scripts/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp6/train.sh diff --git a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp6/train.sh b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp6/train.sh new file mode 100644 index 0000000..e7dddda --- /dev/null +++ b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp6/train.sh @@ -0,0 +1,290 @@ +#!/bin/bash + +# For details about the model, see: +# https://github.com/llm-jp/model-cards/pull/23 + +set -eu -o pipefail + +# EXPERIMENT_DIR= # set by sbatch +ENV_DIR=${EXPERIMENT_DIR}/environment +CACHE_DIR=${EXPERIMENT_DIR}/cache + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/scripts/mpi_variables.sh +source ${ENV_DIR}/venv/bin/activate + +# open file limit +ulimit -n 65536 1048576 + +export LOGLEVEL=INFO +export NCCL_DEBUG=WARN +export NCCL_DEBUG_SUBSYS=WARN +export PYTHONFAULTHANDLER=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export CUDA_LAUNCH_BLOCKING=0 +export CUDNN_LOGDEST_DBG=stderr +export CUDNN_LOGERR_DBG=1 + +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPUS_PER_NODE})) + +# model config +HIDDEN_SIZE=2048 +FFN_HIDDEN_SIZE=7168 +NUM_LAYERS=24 +NUM_HEADS=16 +SEQ_LENGTH=4096 + +# distributed settings +TENSOR_PARALLEL_SIZE=1 +PIPELINE_PARALLEL_SIZE=1 +CONTEXT_PARALLEL_SIZE=1 +DATA_PARALLEL_SIZE=$((${NUM_GPUS} / (${TENSOR_PARALLEL_SIZE} * ${PIPELINE_PARALLEL_SIZE}))) + +# training config +MICRO_BATCH_SIZE=4 +GLOBAL_BATCH_SIZE=512 + +LR=1.5e-4 +MIN_LR=1.5e-5 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# total number of iterations +# 2072488058295 (number of tokens) / 4096 (seq len) / 512 (batch size) = 988239.316127 -> 988240 +LR_WARMUP_STEPS=2000 +LR_DECAY_ITERS=986240 +TRAIN_STEPS=$((${LR_WARMUP_STEPS} + 
${LR_DECAY_ITERS})) + +# model config +TOKENIZER_MODEL=${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model + +CHECKPOINT_ROOT=${EXPERIMENT_DIR}/checkpoints/exp6 +CHECKPOINT_LOAD_DIR=${CHECKPOINT_ROOT} +CHECKPOINT_SAVE_DIR=${CHECKPOINT_ROOT} + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# data config +DATASET_DIR=/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0 +DATASET_V3_1_DIR=/home/shared/corpus/llm-jp-corpus/v3.1.0/tokenize/v3.0b1 + +TRAIN_DATA_PATH="" + +# code stack +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14486363187 ${DATASET_DIR}/train/code/stack_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12799385151 ${DATASET_DIR}/train/code/stack_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17282923545 ${DATASET_DIR}/train/code/stack_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8861329235 ${DATASET_DIR}/train/code/stack_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 6713413649 ${DATASET_DIR}/train/code/stack_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8976432285 ${DATASET_DIR}/train/code/stack_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17961273649 ${DATASET_DIR}/train/code/stack_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12016948303 ${DATASET_DIR}/train/code/stack_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14953094719 ${DATASET_DIR}/train/code/stack_0008.jsonl_text_document" + +# ja cc 1 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 23783124862 ${DATASET_DIR}/train/ja/cc-1_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 36378129564 ${DATASET_DIR}/train/ja/cc-1_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35477545812 ${DATASET_DIR}/train/ja/cc-1_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35917231868 ${DATASET_DIR}/train/ja/cc-1_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 46203062776 
${DATASET_DIR}/train/ja/cc-1_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40396278536 ${DATASET_DIR}/train/ja/cc-1_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 33444216206 ${DATASET_DIR}/train/ja/cc-1_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 32375495374 ${DATASET_DIR}/train/ja/cc-1_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 36068919622 ${DATASET_DIR}/train/ja/cc-1_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 26274952324 ${DATASET_DIR}/train/ja/cc-1_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 24024422756 ${DATASET_DIR}/train/ja/cc-1_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 34590145510 ${DATASET_DIR}/train/ja/cc-1_0011.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 29567301906 ${DATASET_DIR}/train/ja/cc-1_0012.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 26690562242 ${DATASET_DIR}/train/ja/cc-1_0013.jsonl_text_document" + +# ja cc 2 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35813749376 ${DATASET_DIR}/train/ja/cc-2_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40034668924 ${DATASET_DIR}/train/ja/cc-2_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 31191828858 ${DATASET_DIR}/train/ja/cc-2_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 25086109508 ${DATASET_DIR}/train/ja/cc-2_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18979589830 ${DATASET_DIR}/train/ja/cc-2_0004.jsonl_text_document" + +# ja cc 3 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40987803038 ${DATASET_DIR}/train/ja/cc-3_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 41333549162 ${DATASET_DIR}/train/ja/cc-3_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 29810274406 ${DATASET_DIR}/train/ja/cc-3_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 22787733940 ${DATASET_DIR}/train/ja/cc-3_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 
15544493906 ${DATASET_DIR}/train/ja/cc-3_0004.jsonl_text_document" + +# ja kaken +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1826105478 ${DATASET_DIR}/train/ja/kaken_0000.jsonl_text_document" + +# ja warp html +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1329440698 ${DATASET_DIR}/train/ja/warp-html-01-06_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1397268214 ${DATASET_DIR}/train/ja/warp-html-07-12_0000.jsonl_text_document" + +# ja warp pdf +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 30149711608 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e00_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 30023232706 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e00_0001.jsonl_text_document" + +# ja warp pdf 0.2 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15396388677 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13225220331 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12433511477 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14722870558 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14818300138 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14827819309 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13394854115 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14369730518 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14027593174 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14719994730 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 9865165774 
${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14525215128 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0011.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 10835111330 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0012.jsonl_text_document" + +# ja wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 2563804308 ${DATASET_DIR}/train/ja/wiki_0000.jsonl_text_document" + +# en dolma books +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 5494262694 ${DATASET_DIR}/train/en/dolma-books_0000.jsonl_text_document" + +# en dolma c4 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17052861266 ${DATASET_DIR}/train/en/dolma-c4_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17051260422 ${DATASET_DIR}/train/en/dolma-c4_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17056648148 ${DATASET_DIR}/train/en/dolma-c4_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17057773049 ${DATASET_DIR}/train/en/dolma-c4_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17047888113 ${DATASET_DIR}/train/en/dolma-c4_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17046511755 ${DATASET_DIR}/train/en/dolma-c4_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17058086815 ${DATASET_DIR}/train/en/dolma-c4_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17049490900 ${DATASET_DIR}/train/en/dolma-c4_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17051009552 ${DATASET_DIR}/train/en/dolma-c4_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14932405246 ${DATASET_DIR}/train/en/dolma-c4_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13142696712 ${DATASET_DIR}/train/en/dolma-c4_0010.jsonl_text_document" + +# en dolma cc +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15473522696 ${DATASET_DIR}/train/en/dolma-cc-head_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15767913273 ${DATASET_DIR}/train/en/dolma-cc-head_0001.jsonl_text_document" 
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16664785078 ${DATASET_DIR}/train/en/dolma-cc-head_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16860035920 ${DATASET_DIR}/train/en/dolma-cc-head_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17197613512 ${DATASET_DIR}/train/en/dolma-cc-head_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16363353173 ${DATASET_DIR}/train/en/dolma-cc-head_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15303692924 ${DATASET_DIR}/train/en/dolma-cc-head_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15766283829 ${DATASET_DIR}/train/en/dolma-cc-head_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13483997219 ${DATASET_DIR}/train/en/dolma-cc-head_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12561851173 ${DATASET_DIR}/train/en/dolma-cc-head_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14206017429 ${DATASET_DIR}/train/en/dolma-cc-head_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18455249471 ${DATASET_DIR}/train/en/dolma-cc-head_0011.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18359243399 ${DATASET_DIR}/train/en/dolma-cc-head_0012.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16268609444 ${DATASET_DIR}/train/en/dolma-cc-head_0013.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15209913539 ${DATASET_DIR}/train/en/dolma-cc-head_0014.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15601099503 ${DATASET_DIR}/train/en/dolma-cc-head_0015.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16354139164 ${DATASET_DIR}/train/en/dolma-cc-head_0016.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19563123039 ${DATASET_DIR}/train/en/dolma-cc-head_0017.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17794386584 ${DATASET_DIR}/train/en/dolma-cc-head_0018.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17974377563 
${DATASET_DIR}/train/en/dolma-cc-head_0019.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19152181306 ${DATASET_DIR}/train/en/dolma-cc-head_0020.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16841018460 ${DATASET_DIR}/train/en/dolma-cc-head_0021.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15622566364 ${DATASET_DIR}/train/en/dolma-cc-head_0022.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14998264524 ${DATASET_DIR}/train/en/dolma-cc-head_0023.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19994706100 ${DATASET_DIR}/train/en/dolma-cc-head_0024.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19266785326 ${DATASET_DIR}/train/en/dolma-cc-head_0025.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17797970694 ${DATASET_DIR}/train/en/dolma-cc-head_0026.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18662607705 ${DATASET_DIR}/train/en/dolma-cc-head_0027.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18428148263 ${DATASET_DIR}/train/en/dolma-cc-head_0028.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19152709797 ${DATASET_DIR}/train/en/dolma-cc-head_0029.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19567672702 ${DATASET_DIR}/train/en/dolma-cc-head_0030.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15453203385 ${DATASET_DIR}/train/en/dolma-cc-head_0031.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16946844380 ${DATASET_DIR}/train/en/dolma-cc-head_0032.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16719501611 ${DATASET_DIR}/train/en/dolma-cc-head_0033.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16348054343 ${DATASET_DIR}/train/en/dolma-cc-head_0034.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18292316049 ${DATASET_DIR}/train/en/dolma-cc-head_0035.jsonl_text_document" + +# en dolma science paper +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8089227423 
${DATASET_DIR}/train/en/dolma-pes2o_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 20185217235 ${DATASET_DIR}/train/en/dolma-pes2o_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18622836173 ${DATASET_DIR}/train/en/dolma-pes2o_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15956491971 ${DATASET_DIR}/train/en/dolma-pes2o_0003.jsonl_text_document" + +# en dolma reddit +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17412289508 ${DATASET_DIR}/train/en/dolma-reddit_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17315996345 ${DATASET_DIR}/train/en/dolma-reddit_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17095921975 ${DATASET_DIR}/train/en/dolma-reddit_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15808400388 ${DATASET_DIR}/train/en/dolma-reddit_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15425532535 ${DATASET_DIR}/train/en/dolma-reddit_0004.jsonl_text_document" + +# en dolma wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 3896965449 ${DATASET_DIR}/train/en/dolma-wiki_0000.jsonl_text_document" + +# en wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 4744259830 ${DATASET_DIR}/train/en/wiki_0000.jsonl_text_document" + +# zh wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 840277331 ${DATASET_DIR}/train/zh/wiki_0000.jsonl_text_document" + +# ko wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 316296219 ${DATASET_DIR}/train/ko/wiki_0000.jsonl_text_document" + +# job name +WANDB_ENTITY="llm-jp" +WANDB_PROJECT="nii-geniac-1.7B" +WANDB_JOB="train-exp2" + +# run +export NVTE_FUSED_ATTN=0 +python ${ENV_DIR}/src/Megatron-LM/pretrain_gpt.py \ + --tensor-model-parallel-size ${TENSOR_PARALLEL_SIZE} \ + --pipeline-model-parallel-size ${PIPELINE_PARALLEL_SIZE} \ + --context-parallel-size ${CONTEXT_PARALLEL_SIZE} \ + --sequence-parallel \ + --use-distributed-optimizer \ + --num-layers ${NUM_LAYERS} \ + --hidden-size ${HIDDEN_SIZE} \ + --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + --num-attention-heads 
${NUM_HEADS} \ + --seq-length ${SEQ_LENGTH} \ + --max-position-embeddings ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --train-iters ${TRAIN_STEPS} \ + --tokenizer-type Llama2Tokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --load ${CHECKPOINT_LOAD_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --finetune \ + --data-path ${TRAIN_DATA_PATH} \ + --split 1,0,0 \ + --data-cache-path ${CACHE_DIR} \ + --distributed-backend nccl \ + --init-method-std 0.02 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --override-opt_param-scheduler \ + --lr-decay-style cosine \ + --lr-decay-iters ${LR_DECAY_ITERS} \ + --weight-decay ${WEIGHT_DECAY} \ + --clip-grad ${GRAD_CLIP} \ + --lr-warmup-iters ${LR_WARMUP_STEPS} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --log-interval 1 \ + --eval-interval ${TRAIN_STEPS} \ + --eval-iters 0 \ + --bf16 \ + --untie-embeddings-and-output-weights \ + --position-embedding-type rope \ + --disable-bias-linear \ + --use-mcore-models \ + --normalization RMSNorm \ + --norm-epsilon 1e-5 \ + --no-masked-softmax-fusion \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --swiglu \ + --use-flash-attn \ + --recompute-activations \ + --recompute-granularity "selective" \ + --attention-softmax-in-fp32 \ + --transformer-impl "transformer_engine" \ + --use-mpi \ + --use-z-loss \ + --log-throughput \ + --wandb-entity ${WANDB_ENTITY} \ + --wandb-project ${WANDB_PROJECT} \ + --wandb-name ${WANDB_JOB} \ From 38f5ceec357299b59e7c949c12c3435aeff0c4a9 Mon Sep 17 00:00:00 2001 From: Taka008 Date: Wed, 27 Nov 2024 11:54:18 +0900 Subject: [PATCH 10/11] change checkpoint directory for exp2 --- .../scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh index 
ddad45f..b561f1b 100644 --- a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh +++ b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp2/train.sh @@ -58,7 +58,7 @@ TRAIN_STEPS=$((${LR_WARMUP_STEPS} + ${LR_DECAY_ITERS})) # model config TOKENIZER_MODEL=${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model -CHECKPOINT_ROOT=${EXPERIMENT_DIR}/checkpoints +CHECKPOINT_ROOT=${EXPERIMENT_DIR}/checkpoints/exp2 CHECKPOINT_LOAD_DIR=${CHECKPOINT_ROOT} CHECKPOINT_SAVE_DIR=${CHECKPOINT_ROOT} From 8fced0113d7c80bb5991a1066907b69996de04be Mon Sep 17 00:00:00 2001 From: Taka008 Date: Wed, 27 Nov 2024 23:55:44 +0900 Subject: [PATCH 11/11] fix WANDB_JOB for exp6 --- .../scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp6/train.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp6/train.sh b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp6/train.sh index e7dddda..6535808 100644 --- a/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp6/train.sh +++ b/pretrain/scripts/v3-1.7b-exp2-cpt-lr-scheduling-sakura/exp6/train.sh @@ -222,7 +222,7 @@ TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 316296219 ${DATASET_DIR}/train/ko/wiki_0000. # job name WANDB_ENTITY="llm-jp" WANDB_PROJECT="nii-geniac-1.7B" -WANDB_JOB="train-exp2" +WANDB_JOB="train-exp6" # run export NVTE_FUSED_ATTN=0