From 67acdc493b82a1d7da66bed475d3885a3bc97231 Mon Sep 17 00:00:00 2001 From: odashi Date: Wed, 11 Sep 2024 16:27:19 +0900 Subject: [PATCH 1/6] add scripts --- pretrain/scripts/fp8-behavior-check/README.md | 7 + .../scripts/fp8-behavior-check/convert.sh | 51 +++ .../scripts/fp8-behavior-check/run_convert.sh | 11 + .../scripts/fp8-behavior-check/run_eval.sh | 19 + .../scripts/fp8-behavior-check/run_train.sh | 61 ++++ .../scripts/fp8-behavior-check/sbatch_3.8b.sh | 70 ++++ .../scripts/fp8-behavior-check/train_3.8b.sh | 334 ++++++++++++++++++ 7 files changed, 553 insertions(+) create mode 100644 pretrain/scripts/fp8-behavior-check/README.md create mode 100644 pretrain/scripts/fp8-behavior-check/convert.sh create mode 100644 pretrain/scripts/fp8-behavior-check/run_convert.sh create mode 100644 pretrain/scripts/fp8-behavior-check/run_eval.sh create mode 100644 pretrain/scripts/fp8-behavior-check/run_train.sh create mode 100644 pretrain/scripts/fp8-behavior-check/sbatch_3.8b.sh create mode 100644 pretrain/scripts/fp8-behavior-check/train_3.8b.sh diff --git a/pretrain/scripts/fp8-behavior-check/README.md b/pretrain/scripts/fp8-behavior-check/README.md new file mode 100644 index 0000000..dd10653 --- /dev/null +++ b/pretrain/scripts/fp8-behavior-check/README.md @@ -0,0 +1,7 @@ +# FP8 check scripts + +This directory contains several scripts to check the behavior of FP8 operations on Megatron-LM with existing checkpoints. + +* `run_train.sh`: Runs cont'd training with several configurations +* `run_convert.sh`: Runs model conversion to Hugging Face format +* `run_eval.sh`: Runs llm-jp-eval 1.3.1 evaluation diff --git a/pretrain/scripts/fp8-behavior-check/convert.sh b/pretrain/scripts/fp8-behavior-check/convert.sh new file mode 100644 index 0000000..0beb7bb --- /dev/null +++ b/pretrain/scripts/fp8-behavior-check/convert.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Model conversion script for FP8 experiment. 
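+# Loads a Megatron-LM (mcore) checkpoint from SRC_DIR, saves it as a Hugging Face
+# (Llama-2 layout, bfloat16) checkpoint in DEST_DIR, and copies the tokenizer files
+# into DEST_DIR.
+#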
+# Usage: +# sbatch /path/to/convert.sh SRC_DIR DEST_DIR +# +#SBATCH --job-name=0031_convert +#SBATCH --partition= +#SBATCH --nodes=1 +#SBATCH --gpus=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=200G +#SBATCH --output=outputs/%x-%j.out +#SBATCH --error=outputs/%x-%j.err + +set -eu -o pipefail + +if [ $# -ne 2 ]; then + >&2 echo "Usage: $0 SRC_DIR DEST_DIR" + exit 1 +fi + +SRC_DIR=$1; shift +DEST_DIR=$1; shift + +if [ -e ${DEST_DIR} ]; then + >&2 echo "DEST_DIR already exists: ${DEST_DIR}" + exit 1 +fi + +ENV_DIR=environment + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/venv/bin/activate + +TOKENIZER_MODEL_DIR=${ENV_DIR}/src/llm-jp-tokenizer/hf/ver3.0/llm-jp-tokenizer-100k.ver3.0b2 + +python ${ENV_DIR}/src/Megatron-LM/tools/checkpoint/convert.py \ + --model-type GPT \ + --loader mcore \ + --saver llama2_hf \ + --load-dir ${SRC_DIR} \ + --save-dir ${DEST_DIR} \ + --hf-tokenizer-path ${TOKENIZER_MODEL_DIR} \ + --save-dtype bfloat16 \ + --loader-transformer-impl "transformer_engine" \ + --megatron-path ${ENV_DIR}/src/Megatron-LM + +cp ${TOKENIZER_MODEL_DIR}/* ${DEST_DIR} + +echo "Done" diff --git a/pretrain/scripts/fp8-behavior-check/run_convert.sh b/pretrain/scripts/fp8-behavior-check/run_convert.sh new file mode 100644 index 0000000..3842736 --- /dev/null +++ b/pretrain/scripts/fp8-behavior-check/run_convert.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +for d in $(ls checkpoints/3.8b); do + echo $d + sbatch \ + --partition=gpu-small \ + scripts/pretrain/scripts/fp8-behavior-check/convert.sh \ + checkpoints/3.8b/$d \ + checkpoints_hf/3.8b/$d +done + diff --git a/pretrain/scripts/fp8-behavior-check/run_eval.sh b/pretrain/scripts/fp8-behavior-check/run_eval.sh new file mode 100644 index 0000000..6eb183c --- /dev/null +++ b/pretrain/scripts/fp8-behavior-check/run_eval.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +CHECKPOINTS_DIR=checkpoints_hf/3.8b + +mkdir -p processed + +for d in $(ls ${CHECKPOINTS_DIR}); do + if [[ -f processed/$d ]]; then + echo "$d: already processed" + continue + fi + sbatch \ + --partition=gpu-small \ + --priority=1 \ + eval_environment/run_llm-jp-eval.sh ${CHECKPOINTS_DIR}/$d $d \ + && touch processed/$d \ + && echo "$d: queued" +done + diff --git a/pretrain/scripts/fp8-behavior-check/run_train.sh b/pretrain/scripts/fp8-behavior-check/run_train.sh new file mode 100644 index 0000000..4320f60 --- /dev/null +++ b/pretrain/scripts/fp8-behavior-check/run_train.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +run_job() { + echo $@ + sbatch \ + --partition=gpu-small \ + --nodes=1 \ + scripts/pretrain/scripts/fp8-behavior-check/sbatch_3.8b.sh \ + $@ +} + +# arg order: enabled, format, margin, interval, history, algo, wgrad, iter + +# All runs are commented out for safety. 
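+# Example: `run_job true hybrid 0 1 1 most_recent true 200000 201000` resumes the
+# base checkpoint at iteration 200000 with FP8 enabled (hybrid format, margin 0,
+# interval 1, amax history length 1, most_recent amax algorithm, FP8 wgrad kept
+# enabled) and force-stops at iteration 201000.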
+ +#run_job false hybrid 0 1 1 most_recent true 0 1000 +#run_job false hybrid 0 1 1 most_recent true 2000 3000 +#run_job false hybrid 0 1 1 most_recent true 20000 21000 +#run_job false hybrid 0 1 1 most_recent true 200000 201000 + +#run_job true hybrid 0 1 1 most_recent true 0 1000 +#run_job true hybrid 0 1 1 most_recent true 2000 3000 +#run_job true hybrid 0 1 1 most_recent true 20000 21000 +#run_job true hybrid 0 1 1 most_recent true 200000 201000 + +#run_job true e3m4 0 1 1 most_recent true 200000 201000 + +#run_job true hybrid 1 1 1 most_recent true 200000 201000 +#run_job true hybrid 2 1 1 most_recent true 200000 201000 +#run_job true hybrid 3 1 1 most_recent true 200000 201000 +#run_job true hybrid 4 1 1 most_recent true 200000 201000 +#run_job true hybrid 5 1 1 most_recent true 200000 201000 +#run_job true hybrid 6 1 1 most_recent true 200000 201000 +#run_job true hybrid 7 1 1 most_recent true 200000 201000 +#run_job true hybrid 8 1 1 most_recent true 200000 201000 +#run_job true hybrid 16 1 1 most_recent true 200000 201000 +#run_job true hybrid 32 1 1 most_recent true 200000 201000 +#run_job true hybrid 64 1 1 most_recent true 200000 201000 +#run_job true hybrid 128 1 1 most_recent true 200000 201000 +#run_job true hybrid 256 1 1 most_recent true 200000 201000 + +#run_job true hybrid 0 2 1 most_recent true 200000 201000 +#run_job true hybrid 0 4 1 most_recent true 200000 201000 +#run_job true hybrid 0 8 1 most_recent true 200000 201000 +#run_job true hybrid 0 16 1 most_recent true 200000 201000 +#run_job true hybrid 0 32 1 most_recent true 200000 201000 +#run_job true hybrid 0 64 1 most_recent true 200000 201000 +#run_job true hybrid 0 128 1 most_recent true 200000 201000 +#run_job true hybrid 0 256 1 most_recent true 200000 201000 + +#run_job true hybrid 0 1 2 max true 200000 201000 +#run_job true hybrid 0 1 4 max true 200000 201000 +#run_job true hybrid 0 1 8 max true 200000 201000 +#run_job true hybrid 0 1 16 max true 200000 201000 +#run_job true hybrid 0 1 32 max true 200000 201000 +#run_job true hybrid 0 1 64 max true 200000 201000 +#run_job true hybrid 0 1 128 max true 200000 201000 +#run_job true hybrid 0 1 256 max true 200000 201000 + +#run_job true hybrid 0 1 1 most_recent false 200000 201000 + diff --git a/pretrain/scripts/fp8-behavior-check/sbatch_3.8b.sh b/pretrain/scripts/fp8-behavior-check/sbatch_3.8b.sh new file mode 100644 index 0000000..89a5872 --- /dev/null +++ b/pretrain/scripts/fp8-behavior-check/sbatch_3.8b.sh @@ -0,0 +1,70 @@ +#!/bin/bash +#SBATCH --job-name=0031_train +#SBATCH --partition={partition} +#SBATCH --nodes=1 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --output=outputs/%x-%j.out +#SBATCH --error=outputs/%x-%j.err + +# PLEASE run this script from the root of the experiment directory. 
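+#
+# Example (this is how run_train.sh submits it):
+#   sbatch --partition=gpu-small --nodes=1 \
+#     scripts/pretrain/scripts/fp8-behavior-check/sbatch_3.8b.sh \
+#     true hybrid 0 1 1 most_recent true 200000 201000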
+ + +set -eu -o pipefail + +if [ $# -ne 9 ]; then + >&2 echo "Usage $0 ENABLED FORMAT MARGIN INTERVAL AMAX_HIST_LEN AMAX_ALGO WGRAD ITER STOP" + exit 1 +fi + +FP8_ENABLED=$1; shift +FP8_FORMAT=$1; shift +FP8_MARGIN=$1; shift +FP8_INTERVAL=$1; shift +FP8_AMAX_HISTORY_LEN=$1; shift +FP8_AMAX_COMPUTE_ALGO=$1; shift +FP8_WGRAD=$1; shift +LOAD_ITER=$1; shift +FORCE_STOP_ITER=$1; shift + +EXPERIMENT_DIR=$(pwd) +ENV_DIR=${EXPERIMENT_DIR}/environment + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) +export MASTER_PORT=$((10000 + ($SLURM_JOBID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1) +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPUS_PER_NODE})) + +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + +mpirun \ + -np $NUM_GPUS \ + --npernode $NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + -x EXPERIMENT_DIR=$EXPERIMENT_DIR \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -x NUM_NODES=$NUM_NODES \ + -x NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE \ + \ + -x FP8_ENABLED=$FP8_ENABLED \ + -x FP8_FORMAT=$FP8_FORMAT \ + -x FP8_MARGIN=$FP8_MARGIN \ + -x FP8_INTERVAL=$FP8_INTERVAL \ + -x FP8_AMAX_HISTORY_LEN=$FP8_AMAX_HISTORY_LEN \ + -x FP8_AMAX_COMPUTE_ALGO=$FP8_AMAX_COMPUTE_ALGO \ + -x FP8_WGRAD=$FP8_WGRAD \ + -x LOAD_ITER=$LOAD_ITER \ + -x FORCE_STOP_ITER=${FORCE_STOP_ITER} \ + \ + bash scripts/pretrain/scripts/fp8-behavior-check/train_3.8b.sh diff --git a/pretrain/scripts/fp8-behavior-check/train_3.8b.sh b/pretrain/scripts/fp8-behavior-check/train_3.8b.sh new file mode 100644 index 0000000..8ed21f1 --- /dev/null +++ b/pretrain/scripts/fp8-behavior-check/train_3.8b.sh @@ -0,0 +1,334 @@ +#!/bin/bash + +set -eu -o pipefail + +# EXPERIMENT_DIR= # sbatch option: /path/to/0031_fp8-behavior-check + +# FP8_ENABLED= # sbatch option: true, false +# FP8_FORMAT= # sbatch option: e4m3, hybrid +# FP8_MARGIN= # sbatch option: 0, 1, ... +# FP8_INTERVAL= # sbatch option: 1, 2, ... +# FP8_AMAX_HISTORY_LEN= # sbatch option: 1, 2, ... 
+# FP8_AMAX_COMPUTE_ALGO= # sbatch option: most_recent, max +# FP8_WGRAD= # sbatch option: true, false +# LOAD_ITER= # sbatch option: 1000 +# FORCE_STOP_ITER= # sbatch option: 2000 + +FP8_OPTIONS=() + +if ${FP8_ENABLED}; then + FP8_OPTIONS+=( + --fp8-format ${FP8_FORMAT} + --fp8-margin ${FP8_MARGIN} + --fp8-interval ${FP8_INTERVAL} + --fp8-amax-history-len ${FP8_AMAX_HISTORY_LEN} + --fp8-amax-compute-algo ${FP8_AMAX_COMPUTE_ALGO} + ) + SAVE_SUFFIX=fp8.${FP8_FORMAT}.m${FP8_MARGIN}.i${FP8_INTERVAL}.h${FP8_AMAX_HISTORY_LEN}.${FP8_AMAX_COMPUTE_ALGO} + + if $(${FP8_WGRAD}); then + SAVE_SUFFIX=${SAVE_SUFFIX}.wgrad + else + FP8_OPTIONS+=(--no-fp8-wgrad) + SAVE_SUFFIX=${SAVE_SUFFIX}.no_wgrad + fi +else + SAVE_SUFFIX=bf16 +fi + +LOAD_ITER_FMT=$(printf %07d ${LOAD_ITER}) +SAVE_REL=contd_${LOAD_ITER_FMT}.${SAVE_SUFFIX} + +if [ ${LOAD_ITER} -eq 0 ]; then + LOAD_REL=${SAVE_REL} +else + LOAD_REL=base_${LOAD_ITER_FMT} +fi + +echo FP8 options: ${FP8_OPTIONS[@]} +echo LOAD_REL: ${LOAD_REL} +echo SAVE_REL: ${SAVE_REL} + + +ENV_DIR=${EXPERIMENT_DIR}/environment +CACHE_DIR=${EXPERIMENT_DIR}/cache/${SAVE_REL} + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/scripts/mpi_variables.sh +source ${ENV_DIR}/venv/bin/activate + +# open file limit +ulimit -n 65536 1048576 + +export LOGLEVEL=INFO +export NCCL_DEBUG=WARN +export NCCL_DEBUG_SUBSYS=WARN +export PYTHONFAULTHANDLER=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export CUDA_LAUNCH_BLOCKING=0 +export CUDNN_LOGDEST_DBG=stderr +export CUDNN_LOGERR_DBG=1 + +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPUS_PER_NODE})) + +# model config +HIDDEN_SIZE=3072 +FFN_HIDDEN_SIZE=8192 +NUM_LAYERS=28 +NUM_HEADS=24 +SEQ_LENGTH=4096 + +# distributed settings +TENSOR_PARALLEL_SIZE=1 +PIPELINE_PARALLEL_SIZE=1 +CONTEXT_PARALLEL_SIZE=1 +DATA_PARALLEL_SIZE=$((${NUM_GPUS} / (${TENSOR_PARALLEL_SIZE} * ${PIPELINE_PARALLEL_SIZE}))) + +# training config +MICRO_BATCH_SIZE=2 +GLOBAL_BATCH_SIZE=1024 + +LR=3e-4 +MIN_LR=3e-5 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# total number of iterations +# 2072488058295 (number of tokens) / 4096 (seq len) / 1024 (batch size) = 494119.65806365 -> 494120 +LR_WARMUP_STEPS=2000 +LR_DECAY_ITERS=492120 +TRAIN_STEPS=$((${LR_WARMUP_STEPS} + ${LR_DECAY_ITERS})) + +# model config +TOKENIZER_MODEL=${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model + +CHECKPOINT_ROOT=${EXPERIMENT_DIR}/checkpoints +CHECKPOINT_LOAD_DIR=${CHECKPOINT_ROOT}/3.8b/${LOAD_REL} +CHECKPOINT_SAVE_DIR=${CHECKPOINT_ROOT}/3.8b/${SAVE_REL} + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# data config +DATASET_DIR=/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0 +DATASET_V3_1_DIR=/home/shared/corpus/llm-jp-corpus/v3.1.0/tokenize/v3.0b1 + +TRAIN_DATA_PATH="" + +# code stack +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14486363187 ${DATASET_DIR}/train/code/stack_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12799385151 ${DATASET_DIR}/train/code/stack_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17282923545 ${DATASET_DIR}/train/code/stack_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8861329235 ${DATASET_DIR}/train/code/stack_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 6713413649 ${DATASET_DIR}/train/code/stack_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8976432285 ${DATASET_DIR}/train/code/stack_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17961273649 ${DATASET_DIR}/train/code/stack_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12016948303 
${DATASET_DIR}/train/code/stack_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14953094719 ${DATASET_DIR}/train/code/stack_0008.jsonl_text_document" + +# ja cc 1 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 23783124862 ${DATASET_DIR}/train/ja/cc-1_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 36378129564 ${DATASET_DIR}/train/ja/cc-1_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35477545812 ${DATASET_DIR}/train/ja/cc-1_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35917231868 ${DATASET_DIR}/train/ja/cc-1_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 46203062776 ${DATASET_DIR}/train/ja/cc-1_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40396278536 ${DATASET_DIR}/train/ja/cc-1_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 33444216206 ${DATASET_DIR}/train/ja/cc-1_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 32375495374 ${DATASET_DIR}/train/ja/cc-1_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 36068919622 ${DATASET_DIR}/train/ja/cc-1_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 26274952324 ${DATASET_DIR}/train/ja/cc-1_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 24024422756 ${DATASET_DIR}/train/ja/cc-1_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 34590145510 ${DATASET_DIR}/train/ja/cc-1_0011.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 29567301906 ${DATASET_DIR}/train/ja/cc-1_0012.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 26690562242 ${DATASET_DIR}/train/ja/cc-1_0013.jsonl_text_document" + +# ja cc 2 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35813749376 ${DATASET_DIR}/train/ja/cc-2_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40034668924 ${DATASET_DIR}/train/ja/cc-2_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 31191828858 ${DATASET_DIR}/train/ja/cc-2_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 25086109508 ${DATASET_DIR}/train/ja/cc-2_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18979589830 ${DATASET_DIR}/train/ja/cc-2_0004.jsonl_text_document" + +# ja cc 3 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40987803038 ${DATASET_DIR}/train/ja/cc-3_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 41333549162 ${DATASET_DIR}/train/ja/cc-3_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 29810274406 ${DATASET_DIR}/train/ja/cc-3_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 22787733940 ${DATASET_DIR}/train/ja/cc-3_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15544493906 ${DATASET_DIR}/train/ja/cc-3_0004.jsonl_text_document" + +# ja kaken +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1826105478 ${DATASET_DIR}/train/ja/kaken_0000.jsonl_text_document" + +# ja warp html +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1329440698 ${DATASET_DIR}/train/ja/warp-html-01-06_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1397268214 ${DATASET_DIR}/train/ja/warp-html-07-12_0000.jsonl_text_document" + +# ja warp pdf +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 30149711608 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e00_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 30023232706 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e00_0001.jsonl_text_document" + +# ja warp pdf 0.2 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15396388677 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13225220331 
${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12433511477 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14722870558 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14818300138 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14827819309 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13394854115 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14369730518 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14027593174 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14719994730 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 9865165774 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14525215128 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0011.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 10835111330 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0012.jsonl_text_document" + +# ja wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 2563804308 ${DATASET_DIR}/train/ja/wiki_0000.jsonl_text_document" + +# en dolma books +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 5494262694 ${DATASET_DIR}/train/en/dolma-books_0000.jsonl_text_document" + +# en dolma c4 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17052861266 ${DATASET_DIR}/train/en/dolma-c4_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17051260422 ${DATASET_DIR}/train/en/dolma-c4_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17056648148 ${DATASET_DIR}/train/en/dolma-c4_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17057773049 ${DATASET_DIR}/train/en/dolma-c4_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17047888113 ${DATASET_DIR}/train/en/dolma-c4_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17046511755 ${DATASET_DIR}/train/en/dolma-c4_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17058086815 ${DATASET_DIR}/train/en/dolma-c4_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17049490900 ${DATASET_DIR}/train/en/dolma-c4_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17051009552 ${DATASET_DIR}/train/en/dolma-c4_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14932405246 ${DATASET_DIR}/train/en/dolma-c4_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13142696712 ${DATASET_DIR}/train/en/dolma-c4_0010.jsonl_text_document" + +# en dolma cc +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15473522696 ${DATASET_DIR}/train/en/dolma-cc-head_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15767913273 ${DATASET_DIR}/train/en/dolma-cc-head_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16664785078 ${DATASET_DIR}/train/en/dolma-cc-head_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16860035920 ${DATASET_DIR}/train/en/dolma-cc-head_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17197613512 ${DATASET_DIR}/train/en/dolma-cc-head_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16363353173 ${DATASET_DIR}/train/en/dolma-cc-head_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 
15303692924 ${DATASET_DIR}/train/en/dolma-cc-head_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15766283829 ${DATASET_DIR}/train/en/dolma-cc-head_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13483997219 ${DATASET_DIR}/train/en/dolma-cc-head_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12561851173 ${DATASET_DIR}/train/en/dolma-cc-head_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14206017429 ${DATASET_DIR}/train/en/dolma-cc-head_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18455249471 ${DATASET_DIR}/train/en/dolma-cc-head_0011.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18359243399 ${DATASET_DIR}/train/en/dolma-cc-head_0012.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16268609444 ${DATASET_DIR}/train/en/dolma-cc-head_0013.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15209913539 ${DATASET_DIR}/train/en/dolma-cc-head_0014.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15601099503 ${DATASET_DIR}/train/en/dolma-cc-head_0015.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16354139164 ${DATASET_DIR}/train/en/dolma-cc-head_0016.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19563123039 ${DATASET_DIR}/train/en/dolma-cc-head_0017.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17794386584 ${DATASET_DIR}/train/en/dolma-cc-head_0018.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17974377563 ${DATASET_DIR}/train/en/dolma-cc-head_0019.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19152181306 ${DATASET_DIR}/train/en/dolma-cc-head_0020.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16841018460 ${DATASET_DIR}/train/en/dolma-cc-head_0021.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15622566364 ${DATASET_DIR}/train/en/dolma-cc-head_0022.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14998264524 ${DATASET_DIR}/train/en/dolma-cc-head_0023.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19994706100 ${DATASET_DIR}/train/en/dolma-cc-head_0024.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19266785326 ${DATASET_DIR}/train/en/dolma-cc-head_0025.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17797970694 ${DATASET_DIR}/train/en/dolma-cc-head_0026.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18662607705 ${DATASET_DIR}/train/en/dolma-cc-head_0027.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18428148263 ${DATASET_DIR}/train/en/dolma-cc-head_0028.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19152709797 ${DATASET_DIR}/train/en/dolma-cc-head_0029.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19567672702 ${DATASET_DIR}/train/en/dolma-cc-head_0030.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15453203385 ${DATASET_DIR}/train/en/dolma-cc-head_0031.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16946844380 ${DATASET_DIR}/train/en/dolma-cc-head_0032.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16719501611 ${DATASET_DIR}/train/en/dolma-cc-head_0033.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16348054343 ${DATASET_DIR}/train/en/dolma-cc-head_0034.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18292316049 ${DATASET_DIR}/train/en/dolma-cc-head_0035.jsonl_text_document" + +# en dolma science paper +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8089227423 ${DATASET_DIR}/train/en/dolma-pes2o_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 20185217235 
${DATASET_DIR}/train/en/dolma-pes2o_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18622836173 ${DATASET_DIR}/train/en/dolma-pes2o_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15956491971 ${DATASET_DIR}/train/en/dolma-pes2o_0003.jsonl_text_document" + +# en dolma reddit +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17412289508 ${DATASET_DIR}/train/en/dolma-reddit_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17315996345 ${DATASET_DIR}/train/en/dolma-reddit_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17095921975 ${DATASET_DIR}/train/en/dolma-reddit_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15808400388 ${DATASET_DIR}/train/en/dolma-reddit_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15425532535 ${DATASET_DIR}/train/en/dolma-reddit_0004.jsonl_text_document" + +# en dolma wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 3896965449 ${DATASET_DIR}/train/en/dolma-wiki_0000.jsonl_text_document" + +# en wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 4744259830 ${DATASET_DIR}/train/en/wiki_0000.jsonl_text_document" + +# zh wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 840277331 ${DATASET_DIR}/train/zh/wiki_0000.jsonl_text_document" + +# ko wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 316296219 ${DATASET_DIR}/train/ko/wiki_0000.jsonl_text_document" + +# job name +WANDB_ENTITY="llm-jp" +WANDB_PROJECT="0031_fp8-behavior-check" +WANDB_JOB=${SAVE_REL} + +# run +export NVTE_FUSED_ATTN=0 +python ${ENV_DIR}/src/Megatron-LM/pretrain_gpt.py \ + --tensor-model-parallel-size ${TENSOR_PARALLEL_SIZE} \ + --pipeline-model-parallel-size ${PIPELINE_PARALLEL_SIZE} \ + --context-parallel-size ${CONTEXT_PARALLEL_SIZE} \ + --sequence-parallel \ + --use-distributed-optimizer \ + --num-layers ${NUM_LAYERS} \ + --hidden-size ${HIDDEN_SIZE} \ + --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + --num-attention-heads ${NUM_HEADS} \ + --seq-length ${SEQ_LENGTH} \ + --max-position-embeddings ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --train-iters ${TRAIN_STEPS} \ + --tokenizer-type Llama2Tokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --load ${CHECKPOINT_LOAD_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --data-path ${TRAIN_DATA_PATH} \ + --split 1,0,0 \ + --data-cache-path ${CACHE_DIR} \ + --distributed-backend nccl \ + --init-method-std 0.02 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --lr-decay-iters ${LR_DECAY_ITERS} \ + --weight-decay ${WEIGHT_DECAY} \ + --clip-grad ${GRAD_CLIP} \ + --lr-warmup-iters ${LR_WARMUP_STEPS} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --log-interval 1 \ + --eval-interval ${TRAIN_STEPS} \ + --eval-iters 0 \ + --bf16 \ + ${FP8_OPTIONS[@]} \ + --untie-embeddings-and-output-weights \ + --position-embedding-type rope \ + --disable-bias-linear \ + --use-mcore-models \ + --normalization RMSNorm \ + --norm-epsilon 1e-5 \ + --no-masked-softmax-fusion \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --swiglu \ + --use-flash-attn \ + --recompute-activations \ + --recompute-granularity "selective" \ + --attention-softmax-in-fp32 \ + --transformer-impl "transformer_engine" \ + --use-mpi \ + --use-z-loss \ + --log-throughput \ + --wandb-entity ${WANDB_ENTITY} \ + --wandb-project ${WANDB_PROJECT} \ + --wandb-name ${WANDB_JOB} \ + --force-stop-iter ${FORCE_STOP_ITER} From 7a9e841de9a2c9cdd2a4e178d900498de245b9ab Mon Sep 17 00:00:00 2001 From: odashi Date: Wed, 11 Sep 2024 16:32:15 +0900 Subject: 
[PATCH 2/6] add content to readme --- pretrain/scripts/fp8-behavior-check/README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pretrain/scripts/fp8-behavior-check/README.md b/pretrain/scripts/fp8-behavior-check/README.md index dd10653..c985674 100644 --- a/pretrain/scripts/fp8-behavior-check/README.md +++ b/pretrain/scripts/fp8-behavior-check/README.md @@ -2,6 +2,18 @@ This directory contains several scripts to check the behavior of FP8 operations on Megatron-LM with existing checkpoints. +## Prerequisites + +The following directories and contents must be exist before running scripts in this directory: + +* `checkpoints/3.8b/base_{iter:07d}`: Megatron-LM checkpoints of the base models +* `environment`: Training environment created by the installer +* `eval_environment`: Evaluation environment created by the installer +* `outputs`: Slurm log directory +* `scripts`: Clone of this repository + +## Scripts + * `run_train.sh`: Runs cont'd training with several configurations * `run_convert.sh`: Runs model conversion to Hugging Face format * `run_eval.sh`: Runs llm-jp-eval 1.3.1 evaluation From 8c0b84b2580fe271ce148909c2be9587f96c707d Mon Sep 17 00:00:00 2001 From: odashi Date: Wed, 11 Sep 2024 16:39:12 +0900 Subject: [PATCH 3/6] fix --- pretrain/scripts/fp8-behavior-check/README.md | 2 ++ pretrain/scripts/fp8-behavior-check/run_convert.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/pretrain/scripts/fp8-behavior-check/README.md b/pretrain/scripts/fp8-behavior-check/README.md index c985674..d21b665 100644 --- a/pretrain/scripts/fp8-behavior-check/README.md +++ b/pretrain/scripts/fp8-behavior-check/README.md @@ -14,6 +14,8 @@ The following directories and contents must be exist before running scripts in t ## Scripts +All scripts must be invoked from the root of the experiment directory. + * `run_train.sh`: Runs cont'd training with several configurations * `run_convert.sh`: Runs model conversion to Hugging Face format * `run_eval.sh`: Runs llm-jp-eval 1.3.1 evaluation diff --git a/pretrain/scripts/fp8-behavior-check/run_convert.sh b/pretrain/scripts/fp8-behavior-check/run_convert.sh index 3842736..d779151 100644 --- a/pretrain/scripts/fp8-behavior-check/run_convert.sh +++ b/pretrain/scripts/fp8-behavior-check/run_convert.sh @@ -1,5 +1,7 @@ #!/bin/bash +mkdir -p checkpoints_hf + for d in $(ls checkpoints/3.8b); do echo $d sbatch \ From 162c58050c6c31afa39fb494e1e2086fd0cb3b37 Mon Sep 17 00:00:00 2001 From: odashi Date: Wed, 18 Sep 2024 17:04:49 +0900 Subject: [PATCH 4/6] add 13b script --- .../scripts/fp8-behavior-check/run_train.sh | 97 ++--- .../scripts/fp8-behavior-check/sbatch_13b.sh | 70 ++++ .../scripts/fp8-behavior-check/train_13b.sh | 336 ++++++++++++++++++ 3 files changed, 455 insertions(+), 48 deletions(-) create mode 100644 pretrain/scripts/fp8-behavior-check/sbatch_13b.sh create mode 100644 pretrain/scripts/fp8-behavior-check/train_13b.sh diff --git a/pretrain/scripts/fp8-behavior-check/run_train.sh b/pretrain/scripts/fp8-behavior-check/run_train.sh index 4320f60..c366e98 100644 --- a/pretrain/scripts/fp8-behavior-check/run_train.sh +++ b/pretrain/scripts/fp8-behavior-check/run_train.sh @@ -2,10 +2,10 @@ run_job() { echo $@ + PARAM_SIZE=$1; shift sbatch \ --partition=gpu-small \ - --nodes=1 \ - scripts/pretrain/scripts/fp8-behavior-check/sbatch_3.8b.sh \ + scripts/pretrain/scripts/fp8-behavior-check/sbatch_${PARAM_SIZE}.sh \ $@ } @@ -13,49 +13,50 @@ run_job() { # All runs are commented out for safety. 
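+# (run_job now takes the parameter size as its first argument, e.g.
+# `run_job 13b true hybrid 0 1 1 most_recent true 239000 249000`; the remaining
+# arguments keep the order listed above)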
-#run_job false hybrid 0 1 1 most_recent true 0 1000 -#run_job false hybrid 0 1 1 most_recent true 2000 3000 -#run_job false hybrid 0 1 1 most_recent true 20000 21000 -#run_job false hybrid 0 1 1 most_recent true 200000 201000 - -#run_job true hybrid 0 1 1 most_recent true 0 1000 -#run_job true hybrid 0 1 1 most_recent true 2000 3000 -#run_job true hybrid 0 1 1 most_recent true 20000 21000 -#run_job true hybrid 0 1 1 most_recent true 200000 201000 - -#run_job true e3m4 0 1 1 most_recent true 200000 201000 - -#run_job true hybrid 1 1 1 most_recent true 200000 201000 -#run_job true hybrid 2 1 1 most_recent true 200000 201000 -#run_job true hybrid 3 1 1 most_recent true 200000 201000 -#run_job true hybrid 4 1 1 most_recent true 200000 201000 -#run_job true hybrid 5 1 1 most_recent true 200000 201000 -#run_job true hybrid 6 1 1 most_recent true 200000 201000 -#run_job true hybrid 7 1 1 most_recent true 200000 201000 -#run_job true hybrid 8 1 1 most_recent true 200000 201000 -#run_job true hybrid 16 1 1 most_recent true 200000 201000 -#run_job true hybrid 32 1 1 most_recent true 200000 201000 -#run_job true hybrid 64 1 1 most_recent true 200000 201000 -#run_job true hybrid 128 1 1 most_recent true 200000 201000 -#run_job true hybrid 256 1 1 most_recent true 200000 201000 - -#run_job true hybrid 0 2 1 most_recent true 200000 201000 -#run_job true hybrid 0 4 1 most_recent true 200000 201000 -#run_job true hybrid 0 8 1 most_recent true 200000 201000 -#run_job true hybrid 0 16 1 most_recent true 200000 201000 -#run_job true hybrid 0 32 1 most_recent true 200000 201000 -#run_job true hybrid 0 64 1 most_recent true 200000 201000 -#run_job true hybrid 0 128 1 most_recent true 200000 201000 -#run_job true hybrid 0 256 1 most_recent true 200000 201000 - -#run_job true hybrid 0 1 2 max true 200000 201000 -#run_job true hybrid 0 1 4 max true 200000 201000 -#run_job true hybrid 0 1 8 max true 200000 201000 -#run_job true hybrid 0 1 16 max true 200000 201000 -#run_job true hybrid 0 1 32 max true 200000 201000 -#run_job true hybrid 0 1 64 max true 200000 201000 -#run_job true hybrid 0 1 128 max true 200000 201000 -#run_job true hybrid 0 1 256 max true 200000 201000 - -#run_job true hybrid 0 1 1 most_recent false 200000 201000 - +#run_job 3.8b false hybrid 0 1 1 most_recent true 0 1000 +#run_job 3.8b false hybrid 0 1 1 most_recent true 2000 3000 +#run_job 3.8b false hybrid 0 1 1 most_recent true 20000 21000 +#run_job 3.8b false hybrid 0 1 1 most_recent true 200000 201000 + +#run_job 3.8b true hybrid 0 1 1 most_recent true 0 1000 +#run_job 3.8b true hybrid 0 1 1 most_recent true 2000 3000 +#run_job 3.8b true hybrid 0 1 1 most_recent true 20000 21000 +#run_job 3.8b true hybrid 0 1 1 most_recent true 200000 201000 + +#run_job 3.8b true e3m4 0 1 1 most_recent true 200000 201000 + +#run_job 3.8b true hybrid 1 1 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 2 1 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 3 1 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 4 1 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 5 1 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 6 1 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 7 1 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 8 1 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 16 1 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 32 1 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 64 1 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 128 1 1 
most_recent true 200000 201000 +#run_job 3.8b true hybrid 256 1 1 most_recent true 200000 201000 + +#run_job 3.8b true hybrid 0 2 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 0 4 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 0 8 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 0 16 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 0 32 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 0 64 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 0 128 1 most_recent true 200000 201000 +#run_job 3.8b true hybrid 0 256 1 most_recent true 200000 201000 + +#run_job 3.8b true hybrid 0 1 2 max true 200000 201000 +#run_job 3.8b true hybrid 0 1 4 max true 200000 201000 +#run_job 3.8b true hybrid 0 1 8 max true 200000 201000 +#run_job 3.8b true hybrid 0 1 16 max true 200000 201000 +#run_job 3.8b true hybrid 0 1 32 max true 200000 201000 +#run_job 3.8b true hybrid 0 1 64 max true 200000 201000 +#run_job 3.8b true hybrid 0 1 128 max true 200000 201000 +#run_job 3.8b true hybrid 0 1 256 max true 200000 201000 + +#run_job 3.8b true hybrid 0 1 1 most_recent false 200000 201000 + +#run_job 13b true hybrid 0 1 1 most_recent true 239000 249000 diff --git a/pretrain/scripts/fp8-behavior-check/sbatch_13b.sh b/pretrain/scripts/fp8-behavior-check/sbatch_13b.sh new file mode 100644 index 0000000..61f34e4 --- /dev/null +++ b/pretrain/scripts/fp8-behavior-check/sbatch_13b.sh @@ -0,0 +1,70 @@ +#!/bin/bash +#SBATCH --job-name=0031_train_13b +#SBATCH --partition={partition} +#SBATCH --nodes=8 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --output=outputs/%x-%j.out +#SBATCH --error=outputs/%x-%j.err + +# PLEASE run this script from the root of the experiment directory. + + +set -eu -o pipefail + +if [ $# -ne 9 ]; then + >&2 echo "Usage $0 ENABLED FORMAT MARGIN INTERVAL AMAX_HIST_LEN AMAX_ALGO WGRAD ITER STOP" + exit 1 +fi + +FP8_ENABLED=$1; shift +FP8_FORMAT=$1; shift +FP8_MARGIN=$1; shift +FP8_INTERVAL=$1; shift +FP8_AMAX_HISTORY_LEN=$1; shift +FP8_AMAX_COMPUTE_ALGO=$1; shift +FP8_WGRAD=$1; shift +LOAD_ITER=$1; shift +FORCE_STOP_ITER=$1; shift + +EXPERIMENT_DIR=$(pwd) +ENV_DIR=${EXPERIMENT_DIR}/environment + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/venv/bin/activate + +export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1) +export MASTER_PORT=$((10000 + ($SLURM_JOBID % 50000))) + +echo "MASTER_ADDR=${MASTER_ADDR}" + +NUM_NODES=$SLURM_JOB_NUM_NODES +NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1) +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPUS_PER_NODE})) + +echo NUM_NODES=$NUM_NODES +echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE +echo NUM_GPUS=$NUM_GPUS + +mpirun \ + -np $NUM_GPUS \ + --npernode $NUM_GPUS_PER_NODE \ + -bind-to none \ + -map-by slot \ + -x EXPERIMENT_DIR=$EXPERIMENT_DIR \ + -x MASTER_ADDR=$MASTER_ADDR \ + -x MASTER_PORT=$MASTER_PORT \ + -x NUM_NODES=$NUM_NODES \ + -x NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE \ + \ + -x FP8_ENABLED=$FP8_ENABLED \ + -x FP8_FORMAT=$FP8_FORMAT \ + -x FP8_MARGIN=$FP8_MARGIN \ + -x FP8_INTERVAL=$FP8_INTERVAL \ + -x FP8_AMAX_HISTORY_LEN=$FP8_AMAX_HISTORY_LEN \ + -x FP8_AMAX_COMPUTE_ALGO=$FP8_AMAX_COMPUTE_ALGO \ + -x FP8_WGRAD=$FP8_WGRAD \ + -x LOAD_ITER=$LOAD_ITER \ + -x FORCE_STOP_ITER=${FORCE_STOP_ITER} \ + \ + bash scripts/pretrain/scripts/fp8-behavior-check/train_13b.sh diff --git a/pretrain/scripts/fp8-behavior-check/train_13b.sh b/pretrain/scripts/fp8-behavior-check/train_13b.sh new file mode 100644 index 0000000..42090cb --- /dev/null +++ 
b/pretrain/scripts/fp8-behavior-check/train_13b.sh @@ -0,0 +1,336 @@ +#!/bin/bash + +set -eu -o pipefail + +# EXPERIMENT_DIR= # sbatch option: /path/to/0031_fp8-behavior-check + +# FP8_ENABLED= # sbatch option: true, false +# FP8_FORMAT= # sbatch option: e4m3, hybrid +# FP8_MARGIN= # sbatch option: 0, 1, ... +# FP8_INTERVAL= # sbatch option: 1, 2, ... +# FP8_AMAX_HISTORY_LEN= # sbatch option: 1, 2, ... +# FP8_AMAX_COMPUTE_ALGO= # sbatch option: most_recent, max +# FP8_WGRAD= # sbatch option: true, false +# LOAD_ITER= # sbatch option: 1000 +# FORCE_STOP_ITER= # sbatch option: 2000 + +PARAM_SIZE=13b + +FP8_OPTIONS=() + +if ${FP8_ENABLED}; then + FP8_OPTIONS+=( + --fp8-format ${FP8_FORMAT} + --fp8-margin ${FP8_MARGIN} + --fp8-interval ${FP8_INTERVAL} + --fp8-amax-history-len ${FP8_AMAX_HISTORY_LEN} + --fp8-amax-compute-algo ${FP8_AMAX_COMPUTE_ALGO} + ) + SAVE_SUFFIX=fp8.${FP8_FORMAT}.m${FP8_MARGIN}.i${FP8_INTERVAL}.h${FP8_AMAX_HISTORY_LEN}.${FP8_AMAX_COMPUTE_ALGO} + + if $(${FP8_WGRAD}); then + SAVE_SUFFIX=${SAVE_SUFFIX}.wgrad + else + FP8_OPTIONS+=(--no-fp8-wgrad) + SAVE_SUFFIX=${SAVE_SUFFIX}.no_wgrad + fi +else + SAVE_SUFFIX=bf16 +fi + +LOAD_ITER_FMT=$(printf %07d ${LOAD_ITER}) +SAVE_REL=contd_${LOAD_ITER_FMT}.${SAVE_SUFFIX} + +if [ ${LOAD_ITER} -eq 0 ]; then + LOAD_REL=${SAVE_REL} +else + LOAD_REL=base_${LOAD_ITER_FMT} +fi + +echo FP8 options: ${FP8_OPTIONS[@]} +echo LOAD_REL: ${LOAD_REL} +echo SAVE_REL: ${SAVE_REL} + + +ENV_DIR=${EXPERIMENT_DIR}/environment +CACHE_DIR=${EXPERIMENT_DIR}/cache/${SAVE_REL} + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/scripts/mpi_variables.sh +source ${ENV_DIR}/venv/bin/activate + +# open file limit +ulimit -n 65536 1048576 + +export LOGLEVEL=INFO +export NCCL_DEBUG=WARN +export NCCL_DEBUG_SUBSYS=WARN +export PYTHONFAULTHANDLER=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export CUDA_LAUNCH_BLOCKING=0 +export CUDNN_LOGDEST_DBG=stderr +export CUDNN_LOGERR_DBG=1 + +NUM_GPUS=$((${NUM_NODES} * ${NUM_GPUS_PER_NODE})) + +# model config +HIDDEN_SIZE=5120 +FFN_HIDDEN_SIZE=13824 +NUM_LAYERS=40 +NUM_HEADS=40 +SEQ_LENGTH=4096 + +# distributed settings +TENSOR_PARALLEL_SIZE=2 +PIPELINE_PARALLEL_SIZE=2 +CONTEXT_PARALLEL_SIZE=1 +DATA_PARALLEL_SIZE=$((${NUM_GPUS} / (${TENSOR_PARALLEL_SIZE} * ${PIPELINE_PARALLEL_SIZE}))) + +# training config +MICRO_BATCH_SIZE=2 +GLOBAL_BATCH_SIZE=1024 + +LR=2e-4 +MIN_LR=2e-5 +WEIGHT_DECAY=0.1 +GRAD_CLIP=1 + +# total number of iterations +# 2072488058295 (number of tokens) / 4096 (seq len) / 1024 (batch size) = 494119.65806365 -> 494120 +LR_WARMUP_STEPS=2000 +LR_DECAY_ITERS=492120 +TRAIN_STEPS=$((${LR_WARMUP_STEPS} + ${LR_DECAY_ITERS})) + +# model config +TOKENIZER_MODEL=${ENV_DIR}/src/llm-jp-tokenizer/models/ver3.0/llm-jp-tokenizer-100k.ver3.0b1.model + +CHECKPOINT_ROOT=${EXPERIMENT_DIR}/checkpoints +CHECKPOINT_LOAD_DIR=${CHECKPOINT_ROOT}/${PARAM_SIZE}/${LOAD_REL} +CHECKPOINT_SAVE_DIR=${CHECKPOINT_ROOT}/${PARAM_SIZE}/${SAVE_REL} + +mkdir -p ${CHECKPOINT_SAVE_DIR} + +# data config +DATASET_DIR=/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0 +DATASET_V3_1_DIR=/home/shared/corpus/llm-jp-corpus/v3.1.0/tokenize/v3.0b1 + +TRAIN_DATA_PATH="" + +# code stack +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14486363187 ${DATASET_DIR}/train/code/stack_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12799385151 ${DATASET_DIR}/train/code/stack_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17282923545 ${DATASET_DIR}/train/code/stack_0002.jsonl_text_document" 
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8861329235 ${DATASET_DIR}/train/code/stack_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 6713413649 ${DATASET_DIR}/train/code/stack_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8976432285 ${DATASET_DIR}/train/code/stack_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17961273649 ${DATASET_DIR}/train/code/stack_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12016948303 ${DATASET_DIR}/train/code/stack_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14953094719 ${DATASET_DIR}/train/code/stack_0008.jsonl_text_document" + +# ja cc 1 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 23783124862 ${DATASET_DIR}/train/ja/cc-1_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 36378129564 ${DATASET_DIR}/train/ja/cc-1_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35477545812 ${DATASET_DIR}/train/ja/cc-1_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35917231868 ${DATASET_DIR}/train/ja/cc-1_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 46203062776 ${DATASET_DIR}/train/ja/cc-1_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40396278536 ${DATASET_DIR}/train/ja/cc-1_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 33444216206 ${DATASET_DIR}/train/ja/cc-1_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 32375495374 ${DATASET_DIR}/train/ja/cc-1_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 36068919622 ${DATASET_DIR}/train/ja/cc-1_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 26274952324 ${DATASET_DIR}/train/ja/cc-1_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 24024422756 ${DATASET_DIR}/train/ja/cc-1_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 34590145510 ${DATASET_DIR}/train/ja/cc-1_0011.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 29567301906 ${DATASET_DIR}/train/ja/cc-1_0012.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 26690562242 ${DATASET_DIR}/train/ja/cc-1_0013.jsonl_text_document" + +# ja cc 2 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 35813749376 ${DATASET_DIR}/train/ja/cc-2_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40034668924 ${DATASET_DIR}/train/ja/cc-2_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 31191828858 ${DATASET_DIR}/train/ja/cc-2_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 25086109508 ${DATASET_DIR}/train/ja/cc-2_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18979589830 ${DATASET_DIR}/train/ja/cc-2_0004.jsonl_text_document" + +# ja cc 3 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 40987803038 ${DATASET_DIR}/train/ja/cc-3_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 41333549162 ${DATASET_DIR}/train/ja/cc-3_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 29810274406 ${DATASET_DIR}/train/ja/cc-3_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 22787733940 ${DATASET_DIR}/train/ja/cc-3_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15544493906 ${DATASET_DIR}/train/ja/cc-3_0004.jsonl_text_document" + +# ja kaken +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1826105478 ${DATASET_DIR}/train/ja/kaken_0000.jsonl_text_document" + +# ja warp html +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1329440698 ${DATASET_DIR}/train/ja/warp-html-01-06_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 1397268214 ${DATASET_DIR}/train/ja/warp-html-07-12_0000.jsonl_text_document" + +# ja warp pdf 
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 30149711608 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e00_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 30023232706 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e00_0001.jsonl_text_document" + +# ja warp pdf 0.2 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15396388677 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13225220331 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12433511477 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14722870558 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14818300138 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14827819309 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13394854115 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14369730518 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14027593174 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14719994730 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 9865165774 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14525215128 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0011.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 10835111330 ${DATASET_V3_1_DIR}/train2/ja/warp-pdf-e02_0012.jsonl_text_document" + +# ja wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 2563804308 ${DATASET_DIR}/train/ja/wiki_0000.jsonl_text_document" + +# en dolma books +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 5494262694 ${DATASET_DIR}/train/en/dolma-books_0000.jsonl_text_document" + +# en dolma c4 +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17052861266 ${DATASET_DIR}/train/en/dolma-c4_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17051260422 ${DATASET_DIR}/train/en/dolma-c4_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17056648148 ${DATASET_DIR}/train/en/dolma-c4_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17057773049 ${DATASET_DIR}/train/en/dolma-c4_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17047888113 ${DATASET_DIR}/train/en/dolma-c4_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17046511755 ${DATASET_DIR}/train/en/dolma-c4_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17058086815 ${DATASET_DIR}/train/en/dolma-c4_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17049490900 ${DATASET_DIR}/train/en/dolma-c4_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17051009552 ${DATASET_DIR}/train/en/dolma-c4_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14932405246 ${DATASET_DIR}/train/en/dolma-c4_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13142696712 ${DATASET_DIR}/train/en/dolma-c4_0010.jsonl_text_document" + +# en dolma cc +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15473522696 ${DATASET_DIR}/train/en/dolma-cc-head_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15767913273 ${DATASET_DIR}/train/en/dolma-cc-head_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16664785078 
${DATASET_DIR}/train/en/dolma-cc-head_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16860035920 ${DATASET_DIR}/train/en/dolma-cc-head_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17197613512 ${DATASET_DIR}/train/en/dolma-cc-head_0004.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16363353173 ${DATASET_DIR}/train/en/dolma-cc-head_0005.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15303692924 ${DATASET_DIR}/train/en/dolma-cc-head_0006.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15766283829 ${DATASET_DIR}/train/en/dolma-cc-head_0007.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 13483997219 ${DATASET_DIR}/train/en/dolma-cc-head_0008.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 12561851173 ${DATASET_DIR}/train/en/dolma-cc-head_0009.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14206017429 ${DATASET_DIR}/train/en/dolma-cc-head_0010.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18455249471 ${DATASET_DIR}/train/en/dolma-cc-head_0011.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18359243399 ${DATASET_DIR}/train/en/dolma-cc-head_0012.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16268609444 ${DATASET_DIR}/train/en/dolma-cc-head_0013.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15209913539 ${DATASET_DIR}/train/en/dolma-cc-head_0014.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15601099503 ${DATASET_DIR}/train/en/dolma-cc-head_0015.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16354139164 ${DATASET_DIR}/train/en/dolma-cc-head_0016.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19563123039 ${DATASET_DIR}/train/en/dolma-cc-head_0017.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17794386584 ${DATASET_DIR}/train/en/dolma-cc-head_0018.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17974377563 ${DATASET_DIR}/train/en/dolma-cc-head_0019.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19152181306 ${DATASET_DIR}/train/en/dolma-cc-head_0020.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16841018460 ${DATASET_DIR}/train/en/dolma-cc-head_0021.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15622566364 ${DATASET_DIR}/train/en/dolma-cc-head_0022.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 14998264524 ${DATASET_DIR}/train/en/dolma-cc-head_0023.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19994706100 ${DATASET_DIR}/train/en/dolma-cc-head_0024.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19266785326 ${DATASET_DIR}/train/en/dolma-cc-head_0025.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17797970694 ${DATASET_DIR}/train/en/dolma-cc-head_0026.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18662607705 ${DATASET_DIR}/train/en/dolma-cc-head_0027.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18428148263 ${DATASET_DIR}/train/en/dolma-cc-head_0028.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19152709797 ${DATASET_DIR}/train/en/dolma-cc-head_0029.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 19567672702 ${DATASET_DIR}/train/en/dolma-cc-head_0030.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15453203385 ${DATASET_DIR}/train/en/dolma-cc-head_0031.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16946844380 ${DATASET_DIR}/train/en/dolma-cc-head_0032.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16719501611 
${DATASET_DIR}/train/en/dolma-cc-head_0033.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 16348054343 ${DATASET_DIR}/train/en/dolma-cc-head_0034.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18292316049 ${DATASET_DIR}/train/en/dolma-cc-head_0035.jsonl_text_document" + +# en dolma science paper +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 8089227423 ${DATASET_DIR}/train/en/dolma-pes2o_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 20185217235 ${DATASET_DIR}/train/en/dolma-pes2o_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 18622836173 ${DATASET_DIR}/train/en/dolma-pes2o_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15956491971 ${DATASET_DIR}/train/en/dolma-pes2o_0003.jsonl_text_document" + +# en dolma reddit +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17412289508 ${DATASET_DIR}/train/en/dolma-reddit_0000.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17315996345 ${DATASET_DIR}/train/en/dolma-reddit_0001.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 17095921975 ${DATASET_DIR}/train/en/dolma-reddit_0002.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15808400388 ${DATASET_DIR}/train/en/dolma-reddit_0003.jsonl_text_document" +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 15425532535 ${DATASET_DIR}/train/en/dolma-reddit_0004.jsonl_text_document" + +# en dolma wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 3896965449 ${DATASET_DIR}/train/en/dolma-wiki_0000.jsonl_text_document" + +# en wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 4744259830 ${DATASET_DIR}/train/en/wiki_0000.jsonl_text_document" + +# zh wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 840277331 ${DATASET_DIR}/train/zh/wiki_0000.jsonl_text_document" + +# ko wiki +TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 316296219 ${DATASET_DIR}/train/ko/wiki_0000.jsonl_text_document" + +# job name +WANDB_ENTITY="llm-jp" +WANDB_PROJECT="0031_fp8-behavior-check" +WANDB_JOB=${PARAM_SIZE}_${SAVE_REL} + +# run +export NVTE_FUSED_ATTN=0 +python ${ENV_DIR}/src/Megatron-LM/pretrain_gpt.py \ + --tensor-model-parallel-size ${TENSOR_PARALLEL_SIZE} \ + --pipeline-model-parallel-size ${PIPELINE_PARALLEL_SIZE} \ + --context-parallel-size ${CONTEXT_PARALLEL_SIZE} \ + --sequence-parallel \ + --use-distributed-optimizer \ + --num-layers ${NUM_LAYERS} \ + --hidden-size ${HIDDEN_SIZE} \ + --ffn-hidden-size ${FFN_HIDDEN_SIZE} \ + --num-attention-heads ${NUM_HEADS} \ + --seq-length ${SEQ_LENGTH} \ + --max-position-embeddings ${SEQ_LENGTH} \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --train-iters ${TRAIN_STEPS} \ + --tokenizer-type Llama2Tokenizer \ + --tokenizer-model ${TOKENIZER_MODEL} \ + --load ${CHECKPOINT_LOAD_DIR} \ + --save ${CHECKPOINT_SAVE_DIR} \ + --data-path ${TRAIN_DATA_PATH} \ + --split 1,0,0 \ + --data-cache-path ${CACHE_DIR} \ + --distributed-backend nccl \ + --init-method-std 0.02 \ + --lr ${LR} \ + --min-lr ${MIN_LR} \ + --lr-decay-style cosine \ + --lr-decay-iters ${LR_DECAY_ITERS} \ + --weight-decay ${WEIGHT_DECAY} \ + --clip-grad ${GRAD_CLIP} \ + --lr-warmup-iters ${LR_WARMUP_STEPS} \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --log-interval 1 \ + --eval-interval ${TRAIN_STEPS} \ + --eval-iters 0 \ + --bf16 \ + ${FP8_OPTIONS[@]} \ + --untie-embeddings-and-output-weights \ + --position-embedding-type rope \ + --disable-bias-linear \ + --use-mcore-models \ + --normalization RMSNorm \ + --norm-epsilon 1e-5 \ + --no-masked-softmax-fusion \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --swiglu \ 
+ --use-flash-attn \ + --recompute-activations \ + --recompute-granularity "selective" \ + --attention-softmax-in-fp32 \ + --transformer-impl "transformer_engine" \ + --use-mpi \ + --use-z-loss \ + --log-throughput \ + --wandb-entity ${WANDB_ENTITY} \ + --wandb-project ${WANDB_PROJECT} \ + --wandb-name ${WANDB_JOB} \ + --force-stop-iter ${FORCE_STOP_ITER} From 84598d5bffb5b36e4ee0b76fe9eac8ec03f1baf9 Mon Sep 17 00:00:00 2001 From: odashi Date: Mon, 7 Oct 2024 09:14:37 +0900 Subject: [PATCH 5/6] final script --- .../scripts/fp8-behavior-check/convert_13b.sh | 72 +++++++++++++++++++ .../{convert.sh => convert_3.8b.sh} | 0 .../scripts/fp8-behavior-check/run_convert.sh | 51 ++++++++++--- .../scripts/fp8-behavior-check/run_eval.sh | 23 +++--- .../fp8-behavior-check/run_llm-jp-eval.sh | 52 ++++++++++++++ .../scripts/fp8-behavior-check/run_train.sh | 4 +- .../scripts/fp8-behavior-check/train_13b.sh | 7 +- 7 files changed, 184 insertions(+), 25 deletions(-) create mode 100644 pretrain/scripts/fp8-behavior-check/convert_13b.sh rename pretrain/scripts/fp8-behavior-check/{convert.sh => convert_3.8b.sh} (100%) create mode 100644 pretrain/scripts/fp8-behavior-check/run_llm-jp-eval.sh diff --git a/pretrain/scripts/fp8-behavior-check/convert_13b.sh b/pretrain/scripts/fp8-behavior-check/convert_13b.sh new file mode 100644 index 0000000..af05414 --- /dev/null +++ b/pretrain/scripts/fp8-behavior-check/convert_13b.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# Model conversion script for converting Megatron format checkpoints into Hugging Face format +# +# This script needs one node on the `gpu` partition of the cluster. +# However, a GPU is necessary to verify CUDA functionality, even though no VRAM will be used. +# +# Usage: +# On a cluster with SLURM: +# Run `sbatch --partition {partition} convert.sh SOURCE_DIR TARGET_DIR` +# On a cluster without SLURM: +# Run `bash convert.sh SOURCE_DIR TARGET_DIR TEMPORAL_DIR > outpus/convert.out 2> outputs/convert.err` +# - SOURCE_DIR: Megatron checkpoint directory including `iter_NNNNNNN` +# - TARGET_DIR: Output directory for the Hugging Face format +# +# Example: +# sbatch convert.sh /data/experiments/{exp-id}/checkpoints/iter_0001000 /data/experiments/{exp-id}/hf_checkpoints/iter_0001000 +# +#SBATCH --job-name=0031_convert +#SBATCH --partition= +#SBATCH --nodes=1 +#SBATCH --gpus=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=200G +#SBATCH --output=outputs/%x-%j.out +#SBATCH --error=outputs/%x-%j.err + +set -e + +MEGATRON_CHECKPOINT_DIR=${1%/} +HF_CHECKPOINT_DIR=$2 + +ENV_DIR=environment + +source ${ENV_DIR}/scripts/environment.sh +source ${ENV_DIR}/venv/bin/activate + +TOKENIZER_MODEL_DIR=${ENV_DIR}/src/llm-jp-tokenizer/hf/ver3.0/llm-jp-tokenizer-100k.ver3.0b2 + +TARGET_ITER_DIR=$(basename $MEGATRON_CHECKPOINT_DIR) # iter_NNNNNNN +ITER=$(( 10#$(echo $TARGET_ITER_DIR | sed 's/^iter_//') )) # NNNNNNN (no 0 padding) +echo ITER=$ITER + +if [[ -z "$ITER" || ! "$ITER" =~ ^[0-9]+$ ]]; then # check if directory is valid + >&2 echo "Error: ITER=$ITER is not a valid number. Exiting." + exit 1 +fi + +# Create a unique temporal working directory to avoid affecting the original directory and +# to allow multiple runs to execute simultaneously. 
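+# The working directory will contain only a symlink to the requested iteration
+# (e.g. iter_0239000) plus a latest_checkpointed_iteration.txt recording its number;
+# this minimal layout is then passed to convert.py as --load-dir.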
+TMP_DIR=$(mktemp -d "${HOME}/ckpt_convert.XXXXXXXX")
+>&2 echo TMP_DIR=$TMP_DIR
+ln -s $(readlink -f $MEGATRON_CHECKPOINT_DIR) ${TMP_DIR}/${TARGET_ITER_DIR}
+echo $ITER > "${TMP_DIR}/latest_checkpointed_iteration.txt"
+
+echo "Converting $MEGATRON_CHECKPOINT_DIR"
+
+python ${ENV_DIR}/src/Megatron-LM/tools/checkpoint/convert.py \
+    --model-type GPT \
+    --loader mcore \
+    --saver llama2_hf \
+    --load-dir $TMP_DIR \
+    --save-dir $HF_CHECKPOINT_DIR \
+    --hf-tokenizer-path $TOKENIZER_MODEL_DIR \
+    --save-dtype bfloat16 \
+    --loader-transformer-impl "transformer_engine" \
+    --megatron-path ${ENV_DIR}/src/Megatron-LM
+
+cp ${TOKENIZER_MODEL_DIR}/* $HF_CHECKPOINT_DIR
+
+rm -r $TMP_DIR
+echo "Done"
diff --git a/pretrain/scripts/fp8-behavior-check/convert.sh b/pretrain/scripts/fp8-behavior-check/convert_3.8b.sh
similarity index 100%
rename from pretrain/scripts/fp8-behavior-check/convert.sh
rename to pretrain/scripts/fp8-behavior-check/convert_3.8b.sh
diff --git a/pretrain/scripts/fp8-behavior-check/run_convert.sh b/pretrain/scripts/fp8-behavior-check/run_convert.sh
index d779151..2566124 100644
--- a/pretrain/scripts/fp8-behavior-check/run_convert.sh
+++ b/pretrain/scripts/fp8-behavior-check/run_convert.sh
@@ -1,13 +1,44 @@
 #!/bin/bash
 
-mkdir -p checkpoints_hf
-
-for d in $(ls checkpoints/3.8b); do
-    echo $d
-    sbatch \
-        --partition=gpu-small \
-        scripts/pretrain/scripts/fp8-behavior-check/convert.sh \
-        checkpoints/3.8b/$d \
-        checkpoints_hf/3.8b/$d
-done
+mkdir -p checkpoints_hf/{3.8b,13b}
+
+# 3.8B
+
+#for d in $(ls checkpoints/3.8b); do
+#    echo $d
+#    sbatch \
+#        --partition=gpu-small \
+#        scripts/pretrain/scripts/fp8-behavior-check/convert_3.8b.sh \
+#        checkpoints/3.8b/$d \
+#        checkpoints_hf/3.8b/$d
+#done
+
+# 13B
+
+CONFIGS=(
+    contd_0000000.fp8.hybrid.m0.i1.h1.most_recent.wgrad
+    contd_0239000.fp8.hybrid.m0.i1.h1.most_recent.wgrad
+)
+SRC_ROOT=/home/shared/experiments/0031_fp8-behavior/checkpoints/13b
+DEST_ROOT=/home/shared/experiments/0031_fp8-behavior/checkpoints_hf/13b
+
+for c in ${CONFIGS[@]}; do
+    s=${SRC_ROOT}/$c
+    d=${DEST_ROOT}/$c
+
+    for i in `ls $s | egrep '^iter_.{7}$'`; do
+        if [ -e $d/$i ]; then
+            echo "Exists: $s/$i"
+            continue
+        fi
+
+        echo "Converting: $s/$i"
+        sbatch \
+            --job-name=0031_convert \
+            --partition=gpu-small-lp \
+            scripts/pretrain/scripts/fp8-behavior-check/convert_13b.sh \
+            $s/$i \
+            $d/$i
+    done
+done
diff --git a/pretrain/scripts/fp8-behavior-check/run_eval.sh b/pretrain/scripts/fp8-behavior-check/run_eval.sh
index 6eb183c..71cd24b 100644
--- a/pretrain/scripts/fp8-behavior-check/run_eval.sh
+++ b/pretrain/scripts/fp8-behavior-check/run_eval.sh
@@ -1,19 +1,16 @@
 #!/bin/bash
 
-CHECKPOINTS_DIR=checkpoints_hf/3.8b
-
-mkdir -p processed
-
-for d in $(ls ${CHECKPOINTS_DIR}); do
-    if [[ -f processed/$d ]]; then
-        echo "$d: already processed"
+for cfg_file in $(find checkpoints_hf -name config.json | sort); do
+    cfg=$(dirname $cfg_file | sed 's/checkpoints_hf\///')
+    if [ -e processed/$cfg ]; then
+        echo "Already processed: $cfg"
         continue
     fi
+
     sbatch \
-        --partition=gpu-small \
-        --priority=1 \
-        eval_environment/run_llm-jp-eval.sh ${CHECKPOINTS_DIR}/$d $d \
-        && touch processed/$d \
-        && echo "$d: queued"
-done
+        --partition=gpu-small-lp \
+        scripts/pretrain/scripts/fp8-behavior-check/run_llm-jp-eval.sh checkpoints_hf/$cfg $cfg
+    mkdir -p $(dirname processed/$cfg) && touch processed/$cfg
+    echo "Started: $cfg"
+done
diff --git a/pretrain/scripts/fp8-behavior-check/run_llm-jp-eval.sh b/pretrain/scripts/fp8-behavior-check/run_llm-jp-eval.sh
new file mode 100644
index 0000000..7bc45cc
--- /dev/null
+++ b/pretrain/scripts/fp8-behavior-check/run_llm-jp-eval.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+#SBATCH --job-name=0031_eval
+#SBATCH --partition=
+#SBATCH --nodes=1
+#SBATCH --gpus=1
+#SBATCH --mem=200G
+#SBATCH --output=outputs/%x-%j.out
+#SBATCH --error=outputs/%x-%j.err
+
+set -eu -o pipefail
+
+# Open file limit
+ulimit -n 65536 1048576
+
+EXPERIMENT_DIR=eval_environment
+
+ENV_DIR=${EXPERIMENT_DIR}/environment
+source ${ENV_DIR}/scripts/environment.sh
+source ${ENV_DIR}/venv/bin/activate
+
+# Arguments
+MODEL=$1
+WANDB_RUN_NAME=$2
+
+# Semi-fixed vars
+CONFIG_TEMPLATE=${EXPERIMENT_DIR}/resources/config_base.yaml
+TOKENIZER=$MODEL
+WANDB_ENTITY=llm-jp-eval
+WANDB_PROJECT=0031_fp8-behavior-check
+
+# Fixed vars
+CONFIG_DIR=${ENV_DIR}/src/llm-jp-eval/configs
+SCRIPT_PATH=${ENV_DIR}/src/llm-jp-eval/scripts/evaluate_llm.py
+DATASET_DIR=${ENV_DIR}/data/llm-jp-eval/${LLM_JP_EVAL_TAG}/evaluation/dev
+
+# Config settings
+NEW_CONFIG=${CONFIG_DIR}/config.${WANDB_PROJECT}.$(echo ${WANDB_RUN_NAME} | tr '/' '_').yaml
+REPLACE_VARS=("MODEL" "TOKENIZER" "DATASET_DIR" "WANDB_ENTITY" "WANDB_PROJECT" "WANDB_RUN_NAME")
+
+# Create a new config file so that the exact config of each run is preserved
+cp $CONFIG_TEMPLATE $NEW_CONFIG
+
+# Replace variables
+for VAR in "${REPLACE_VARS[@]}"; do
+    VALUE=$(eval echo \${$VAR})
+    sed -i "s|<<${VAR}>>|${VALUE}|g" $NEW_CONFIG
+done
+
+# Run llm-jp-eval
+python $SCRIPT_PATH -cn $(basename $NEW_CONFIG)
+
+echo "Done"
diff --git a/pretrain/scripts/fp8-behavior-check/run_train.sh b/pretrain/scripts/fp8-behavior-check/run_train.sh
index c366e98..3364068 100644
--- a/pretrain/scripts/fp8-behavior-check/run_train.sh
+++ b/pretrain/scripts/fp8-behavior-check/run_train.sh
@@ -5,6 +5,7 @@ run_job() {
     PARAM_SIZE=$1; shift
     sbatch \
         --partition=gpu-small \
+        --nodes=8 \
         scripts/pretrain/scripts/fp8-behavior-check/sbatch_${PARAM_SIZE}.sh \
        $@
 }
@@ -59,4 +60,5 @@ run_job() {
 
 #run_job 3.8b true hybrid 0 1 1 most_recent false 200000 201000
-#run_job 13b true hybrid 0 1 1 most_recent true 239000 249000
+run_job 13b true hybrid 0 1 1 most_recent true 0 50000
+#run_job 13b true hybrid 0 1 1 most_recent true 239000 289000
diff --git a/pretrain/scripts/fp8-behavior-check/train_13b.sh b/pretrain/scripts/fp8-behavior-check/train_13b.sh
index 42090cb..20e1a8b 100644
--- a/pretrain/scripts/fp8-behavior-check/train_13b.sh
+++ b/pretrain/scripts/fp8-behavior-check/train_13b.sh
@@ -108,7 +108,12 @@ CHECKPOINT_ROOT=${EXPERIMENT_DIR}/checkpoints
 CHECKPOINT_LOAD_DIR=${CHECKPOINT_ROOT}/${PARAM_SIZE}/${LOAD_REL}
 CHECKPOINT_SAVE_DIR=${CHECKPOINT_ROOT}/${PARAM_SIZE}/${SAVE_REL}
 
-mkdir -p ${CHECKPOINT_SAVE_DIR}
+if [ -e $CHECKPOINT_SAVE_DIR ]; then
+    # Continue previous run
+    CHECKPOINT_LOAD_DIR=${CHECKPOINT_SAVE_DIR}
+else
+    mkdir -p ${CHECKPOINT_SAVE_DIR}
+fi
 
 # data config
 DATASET_DIR=/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0

From 885446ed2321a43b54d8d736e3f9188dc64ba08f Mon Sep 17 00:00:00 2001
From: odashi
Date: Mon, 7 Oct 2024 09:23:23 +0900
Subject: [PATCH 6/6] fix

---
 pretrain/scripts/fp8-behavior-check/run_train.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pretrain/scripts/fp8-behavior-check/run_train.sh b/pretrain/scripts/fp8-behavior-check/run_train.sh
index 3364068..44f20b3 100644
--- a/pretrain/scripts/fp8-behavior-check/run_train.sh
+++ b/pretrain/scripts/fp8-behavior-check/run_train.sh
@@ -60,5 +60,5 @@ run_job() {
 
 #run_job 3.8b true hybrid 0 1 1 most_recent false 200000 201000
-run_job 13b true hybrid 0 1 1 most_recent true 0 50000
+#run_job 13b true hybrid 0 1 1 most_recent true 0 50000
 #run_job 13b true hybrid 0 1 1 most_recent true 239000 289000
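
Editorial note on two shell idioms used in the patches above. The conversion script (convert_13b.sh) strips the zero padding from `iter_NNNNNNN` directory names with a forced base-10 arithmetic expansion, and the evaluation wrapper (run_llm-jp-eval.sh) fills `<<VAR>>` placeholders in a copied config template with `sed`. The sketch below demonstrates both in isolation; it assumes bash and GNU sed, and every file name and value in it is a made-up stand-in rather than part of the patches themselves.

#!/bin/bash
set -eu -o pipefail

# (1) Zero-padded iteration parsing: "iter_0050000" -> 50000.
#     The 10# prefix forces base-10 arithmetic; without it, bash would try to
#     read "0050000" as an octal literal.
TARGET_ITER_DIR=iter_0050000
ITER=$(( 10#$(echo $TARGET_ITER_DIR | sed 's/^iter_//') ))
echo "ITER=${ITER}"   # prints ITER=50000

# (2) Placeholder substitution in a copied template (hypothetical files and values).
MODEL=checkpoints_hf/13b/iter_0050000
WANDB_RUN_NAME=13b_fp8_check
cat > config_base.yaml <<'EOF'
model: <<MODEL>>
run_name: <<WANDB_RUN_NAME>>
EOF
cp config_base.yaml config.generated.yaml
for VAR in MODEL WANDB_RUN_NAME; do
    VALUE=$(eval echo \${$VAR})                      # indirect lookup of $MODEL, $WANDB_RUN_NAME
    sed -i "s|<<${VAR}>>|${VALUE}|g" config.generated.yaml
done
cat config.generated.yaml   # placeholders are now concrete values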