Commit

Merge pull request #4 from okoge-kaz/feature/phi-3
Support Phi-3, Yi-1.5, Codestral
okoge-kaz authored Jun 1, 2024
2 parents 91eaba0 + 1d741c4 commit 150c3bf
Showing 12 changed files with 388 additions and 19 deletions.
1 change: 1 addition & 0 deletions .vscode/settings.json
@@ -18,6 +18,7 @@
"anyprecision",
"autocast",
"bettertransformer",
"Codestral",
"colour",
"Concatenator",
"detokenize",
32 changes: 30 additions & 2 deletions scripts/abci/phi3/phi3-14b.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#$ -l rt_AF=4
#$ -l h_rt=5:0:00:00
#$ -l h_rt=10:00:00:00
#$ -j y
#$ -o outputs/phi-3/
#$ -cwd
@@ -73,8 +73,36 @@ mkdir -p ${CHECKPOINT_SAVE_DIR}

DATA_PATH=""

# Swallow v1
DATA_PATH="${DATA_PATH} 9108171060 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/split_0_text_document"
DATA_PATH="${DATA_PATH} 9017389663 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/split_1_text_document"
DATA_PATH="${DATA_PATH} 10781891782 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/split_2_text_document"
DATA_PATH="${DATA_PATH} 14229527811 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/split_3_text_document"
DATA_PATH="${DATA_PATH} 33251122086 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/split_4_text_document"

# ja wikipedia
DATA_PATH="${DATA_PATH} 2657688677 /bb/llm/gaf51275/binarized/phi-3-default/ja_wiki_text_document"
DATA_PATH="${DATA_PATH} 2659052072 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/ja_wiki_merged_text_document"

# parallel corpus
DATA_PATH="${DATA_PATH} 1265915426 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/default_plain_text_format_text_document"

# en wikipedia
DATA_PATH="${DATA_PATH} 1400935123 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/en_wiki_merged_train_text_document"

# en refinedweb
DATA_PATH="${DATA_PATH} 1400935123 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/lumi_en_falcon_merge_text_document"

# en cosmopedia
DATA_PATH="${DATA_PATH} 1394911660 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/cosmopedia_automathtext_train_text_document"
DATA_PATH="${DATA_PATH} 22852028 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/cosmopedia_khanacademy_train_text_document"
DATA_PATH="${DATA_PATH} 115215400 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/cosmopedia_openstax_train_text_document"
DATA_PATH="${DATA_PATH} 1120661316 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/cosmopedia_stanford_train_text_document"
DATA_PATH="${DATA_PATH} 3131907229 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/cosmopedia_stories_train_text_document"
DATA_PATH="${DATA_PATH} 195599284 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/cosmopedia_wikihow_train_text_document"

# code algebraic stack
DATA_PATH="${DATA_PATH} 10903912936 /bb/llm/gaf51275/datasets/Phi-3_original_transformers-4.40.1/algebraic-stack_text_document"


# job name
JOB_NAME="Phi-3-ABCI-${NODE_TYPE}-${NUM_NODES}node-${NUM_GPUS}gpu-${SEQ_LENGTH}s-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WARMUP=${LR_WARMUP_STEPS}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"
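Each DATA_PATH entry above is a weight (a token count here) followed by a dataset prefix, in the blended-dataset format that Megatron-LM-style data loaders consume; under the usual convention, every prefix resolves to a <prefix>.bin / <prefix>.idx pair emitted by preprocess_data.py. A minimal pre-flight sketch under that assumption, to run before submitting the job:

# Sanity-check sketch (assumes the standard Megatron-LM binarized layout,
# where each prefix maps to <prefix>.bin and <prefix>.idx).
set -- ${DATA_PATH}
while [ "$#" -ge 2 ]; do
  weight=$1; prefix=$2; shift 2
  for ext in bin idx; do
    [ -f "${prefix}.${ext}" ] || echo "missing: ${prefix}.${ext} (weight ${weight})"
  done
done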
151 changes: 151 additions & 0 deletions scripts/gcp/codestral-22b.sh
@@ -0,0 +1,151 @@
#!/bin/bash
#SBATCH --job-name=codestral
#SBATCH --partition=a3
#SBATCH --exclusive
#SBATCH --nodes 2
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/codestral/%x-%j.out
#SBATCH --error=outputs/codestral/%x-%j.out

set -e

# module load
module load cuda/12.1
module load cudnn/8.9.7
module load hpcx/2.17.1

# open file limit (soft and hard)
ulimit -Sn 65536
ulimit -Hn 1048576

# python virtualenv
source .env/bin/activate

# Important TCPX environment variables
UDS_PATH="/run/tcpx-${SLURM_JOB_ID}"

# Only use TCPX for multi-node jobs.
[[ "${SLURM_JOB_NUM_NODES}" -gt 1 ]] && export USE_TCPX=yes || export USE_TCPX=no

# Only use TCPX for multi-node jobs.
if [[ ${USE_TCPX} = "yes" ]]; then
# Set up NCCL Environment variables
export NCCL_NET=GPUDirectTCPX_v7
# These network interfaces use Ubuntu's consistent naming scheme. See
# https://manpages.ubuntu.com/manpages/focal/man7/systemd.net-naming-scheme.7.html
export NCCL_SOCKET_IFNAME=enp0s12
export NCCL_GPUDIRECTTCPX_CTRL_DEV=enp0s12
export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=enp6s0,enp12s0,enp134s0,enp140s0
export NCCL_CROSS_NIC=0
export NCCL_ALGO=Ring
export NCCL_PROTO=Simple
export NCCL_NSOCKS_PERTHREAD=4
export NCCL_SOCKET_NTHREADS=1
export NCCL_DYNAMIC_CHUNK_SIZE=524288
export NCCL_P2P_NET_CHUNKSIZE=524288
export NCCL_P2P_PCI_CHUNKSIZE=524288
export NCCL_P2P_NVL_CHUNKSIZE=1048576
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export NCCL_NET_GDR_LEVEL=PIX
export NCCL_P2P_PXN_LEVEL=0
export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=${UDS_PATH}
export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000
export NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177"
export NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191"

export LD_LIBRARY_PATH=/var/lib/tcpx/lib64:${LD_LIBRARY_PATH}
else
unset NCCL_NET
fi

# distributed settings
export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
export MASTER_PORT=$((10000 + ($SLURM_JOBID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

# hostfile
export NUM_GPU_PER_NODE=8
NODE_TYPE="H100"

NUM_NODES=$SLURM_JOB_NUM_NODES
NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))

# training config
SEQ_LENGTH=4096
DATA_PARALLEL_SIZE=$NUM_GPUS

MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=1024
TRAIN_STEPS=25000

# optimizer config
LR=2.5E-5
MIN_LR=2.5E-6
LR_WARMUP_STEPS=1000
LR_DECAY_STEPS=25000
WEIGHT_DECAY=0.1
GRAD_CLIP=1
# model config
TOKENIZER_MODEL=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Codestral-22B-v0.1/tokenizer.model
CHECKPOINT_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Codestral-22B-v0.1
CHECKPOINT_SAVE_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/checkpoints/Codestral-22B-v0.1

mkdir -p ${CHECKPOINT_SAVE_DIR}

# data config
DATASET_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/debug/Codestral-22B-v0.1

TRAIN_DATA_PATH=""

# ja wiki
TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 2741303196 ${DATASET_DIR}/ja_wiki_text_document"

# job name
JOB_NAME="Codestral-22B-v0.1-gcp-${NODE_TYPE}-${NUM_NODES}node-${NUM_GPUS}gpu-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WARMUP=${LR_WARMUP_STEPS}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"

# run
mpirun -np $NUM_GPUS \
--npernode $NUM_GPU_PER_NODE \
-x MASTER_ADDR=$MASTER_ADDR \
-x MASTER_PORT=$MASTER_PORT \
-bind-to none \
-x LD_LIBRARY_PATH \
-x PATH \
python examples/finetuning.py \
--seq-length ${SEQ_LENGTH} \
--sliding-window-size ${SEQ_LENGTH} \
--micro-batch-size ${MICRO_BATCH_SIZE} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--train-iters ${TRAIN_STEPS} \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--data-path ${TRAIN_DATA_PATH} \
--split 949,50,1 \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--lr-warmup-iters ${LR_WARMUP_STEPS} \
--lr-decay-iters ${LR_DECAY_STEPS} \
--weight-decay ${WEIGHT_DECAY} \
--grad-clip-norm ${GRAD_CLIP} \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-5 \
--save-interval 500 \
--eval-interval 100 \
--eval-iters 10 \
--bf16 \
--mixed-precision \
--base-model ${CHECKPOINT_DIR} \
--save ${CHECKPOINT_SAVE_DIR} \
--load ${CHECKPOINT_SAVE_DIR} \
--low-cpu-fsdp \
--sharding-strategy FULL_SHARD \
--checkpoint-type LOCAL_STATE_DICT \
--fsdp-activation-checkpointing \
--use-mpi \
--wandb-entity "okoge" \
--wandb-project "llm-recipes" \
--wandb-name "${JOB_NAME}"
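As a cross-check of the batch configuration: with 2 nodes × 8 GPUs = 16 data-parallel ranks and MICRO_BATCH_SIZE=1, reaching GLOBAL_BATCH_SIZE=1024 implies 64 gradient-accumulation micro-steps per optimizer update, assuming the trainer derives accumulation from global = micro × DP size × accumulation rather than taking it as a separate flag. The arithmetic as a small sketch:

# Implied gradient accumulation (assumption: global = micro * data-parallel size * accumulation).
NUM_GPUS=16            # 2 nodes x 8 GPUs
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=1024
echo "grad accumulation steps: $(( GLOBAL_BATCH_SIZE / (MICRO_BATCH_SIZE * NUM_GPUS) ))"   # -> 64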
18 changes: 18 additions & 0 deletions scripts/gcp/tokenize/codestral-ja-wiki.sh
@@ -0,0 +1,18 @@
#!/bin/bash

# switch virtual env
source .env/bin/activate

DATASET_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/samples
OUTPUT_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/debug/Codestral-22B-v0.1

mkdir -p ${OUTPUT_DIR}

# tokenize japanese wikipedia
python megatron_lm/tools/preprocess_data.py \
--input ${DATASET_DIR}/ja_wiki.jsonl \
--output-prefix ${OUTPUT_DIR}/ja_wiki \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model /home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Codestral-22B-v0.1/tokenizer.model \
--append-eod \
--workers 64
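Assuming preprocess_data.py follows the standard Megatron-LM output naming (<output-prefix>_<json-field>_document.bin/.idx), this run leaves a ja_wiki_text_document pair in OUTPUT_DIR, which is the same prefix the Codestral training script above points its TRAIN_DATA_PATH at. A quick post-run check under that assumption:

# Expected outputs (assumption: Megatron-LM naming <prefix>_<field>_document.{bin,idx}).
ls -lh ${OUTPUT_DIR}/ja_wiki_text_document.bin ${OUTPUT_DIR}/ja_wiki_text_document.idx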
18 changes: 18 additions & 0 deletions scripts/gcp/tokenize/yi-1.5-ja-wiki.sh
@@ -0,0 +1,18 @@
#!/bin/bash

# switch virtual env
source .env/bin/activate

DATASET_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/samples
OUTPUT_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/debug/yi-1.5

mkdir -p ${OUTPUT_DIR}

# tokenize japanese wikipedia
python megatron_lm/tools/preprocess_data.py \
--input ${DATASET_DIR}/ja_wiki.jsonl \
--output-prefix ${OUTPUT_DIR}/ja_wiki \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model /home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Yi-1.5-9B/tokenizer.model \
--append-eod \
--workers 64
151 changes: 151 additions & 0 deletions scripts/gcp/yi-1.5-9b.sh
@@ -0,0 +1,151 @@
#!/bin/bash
#SBATCH --job-name=yi-1.5-9b
#SBATCH --partition=a3
#SBATCH --exclusive
#SBATCH --nodes 2
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/yi-1.5-9b/%x-%j.out
#SBATCH --error=outputs/yi-1.5-9b/%x-%j.out

set -e

# module load
module load cuda/12.1
module load cudnn/8.9.7
module load hpcx/2.17.1

# open file limit (soft and hard)
ulimit -Sn 65536
ulimit -Hn 1048576

# python virtualenv
source .env/bin/activate

# Important TCPX environment variables
UDS_PATH="/run/tcpx-${SLURM_JOB_ID}"

# Only use TCPX for multi-node jobs.
[[ "${SLURM_JOB_NUM_NODES}" -gt 1 ]] && export USE_TCPX=yes || export USE_TCPX=no

# Only use TCPX for multi-node jobs.
if [[ ${USE_TCPX} = "yes" ]]; then
# Set up NCCL Environment variables
export NCCL_NET=GPUDirectTCPX_v7
# These network interfaces use Ubuntu's consistent naming scheme. See
# https://manpages.ubuntu.com/manpages/focal/man7/systemd.net-naming-scheme.7.html
export NCCL_SOCKET_IFNAME=enp0s12
export NCCL_GPUDIRECTTCPX_CTRL_DEV=enp0s12
export NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=enp6s0,enp12s0,enp134s0,enp140s0
export NCCL_CROSS_NIC=0
export NCCL_ALGO=Ring
export NCCL_PROTO=Simple
export NCCL_NSOCKS_PERTHREAD=4
export NCCL_SOCKET_NTHREADS=1
export NCCL_DYNAMIC_CHUNK_SIZE=524288
export NCCL_P2P_NET_CHUNKSIZE=524288
export NCCL_P2P_PCI_CHUNKSIZE=524288
export NCCL_P2P_NVL_CHUNKSIZE=1048576
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export NCCL_NET_GDR_LEVEL=PIX
export NCCL_P2P_PXN_LEVEL=0
export NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX=${UDS_PATH}
export NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=500000
export NCCL_GPUDIRECTTCPX_TX_BINDINGS="enp6s0:8-21,112-125;enp12s0:8-21,112-125;enp134s0:60-73,164-177;enp140s0:60-73,164-177"
export NCCL_GPUDIRECTTCPX_RX_BINDINGS="enp6s0:22-35,126-139;enp12s0:22-35,126-139;enp134s0:74-87,178-191;enp140s0:74-87,178-191"

export LD_LIBRARY_PATH=/var/lib/tcpx/lib64:${LD_LIBRARY_PATH}
else
unset NCCL_NET
fi

# distributed settings
export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
export MASTER_PORT=$((10000 + ($SLURM_JOBID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

# hostfile
export NUM_GPU_PER_NODE=8
NODE_TYPE="H100"

NUM_NODES=$SLURM_JOB_NUM_NODES
NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))

# training config
SEQ_LENGTH=4096
DATA_PARALLEL_SIZE=$NUM_GPUS

MICRO_BATCH_SIZE=8
GLOBAL_BATCH_SIZE=1024
TRAIN_STEPS=25000

# optimizer config
LR=2.5E-5
MIN_LR=2.5E-6
LR_WARMUP_STEPS=1000
LR_DECAY_STEPS=25000
WEIGHT_DECAY=0.1
GRAD_CLIP=1
# model config
TOKENIZER_MODEL=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Yi-1.5-9B/tokenizer.model
CHECKPOINT_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/hf_checkpoints/Yi-1.5-9B
CHECKPOINT_SAVE_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/checkpoints/Yi-1.5-9B

mkdir -p ${CHECKPOINT_SAVE_DIR}

# data config
DATASET_DIR=/home/ext_kazuki_fujii_rio_gsic_titech/datasets/debug/yi-1.5

DATA_PATH=""

# ja wiki
DATA_PATH="${DATA_PATH} 2990167836 ${DATASET_DIR}/ja_wiki_text_document"

# job name
JOB_NAME="Yi-1.5-9B-gcp-${NODE_TYPE}-${NUM_NODES}node-${NUM_GPUS}gpu-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WARMUP=${LR_WARMUP_STEPS}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"

# run
mpirun -np $NUM_GPUS \
--npernode $NUM_GPU_PER_NODE \
-x MASTER_ADDR=$MASTER_ADDR \
-x MASTER_PORT=$MASTER_PORT \
-bind-to none \
-x LD_LIBRARY_PATH \
-x PATH \
python examples/finetuning.py \
--seq-length ${SEQ_LENGTH} \
--sliding-window-size ${SEQ_LENGTH} \
--micro-batch-size ${MICRO_BATCH_SIZE} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--train-iters ${TRAIN_STEPS} \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--data-path ${DATA_PATH} \
--split 949,50,1 \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--lr-warmup-iters ${LR_WARMUP_STEPS} \
--lr-decay-iters ${LR_DECAY_STEPS} \
--weight-decay ${WEIGHT_DECAY} \
--grad-clip-norm ${GRAD_CLIP} \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-5 \
--save-interval 500 \
--eval-interval 100 \
--eval-iters 10 \
--bf16 \
--mixed-precision \
--base-model ${CHECKPOINT_DIR} \
--save ${CHECKPOINT_SAVE_DIR} \
--load ${CHECKPOINT_SAVE_DIR} \
--low-cpu-fsdp \
--sharding-strategy FULL_SHARD \
--checkpoint-type LOCAL_STATE_DICT \
--fsdp-activation-checkpointing \
--use-mpi \
--wandb-entity "okoge" \
--wandb-project "llm-recipes" \
--wandb-name "${JOB_NAME}"
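For a rough sense of scale: each optimizer step processes GLOBAL_BATCH_SIZE × SEQ_LENGTH = 1024 × 4096 ≈ 4.19M tokens, so TRAIN_STEPS=25000 corresponds to roughly 105B tokens, ignoring any padding or sequence-packing details inside the trainer. The back-of-the-envelope arithmetic:

# Token budget implied by the config above (ignores padding/packing behavior).
SEQ_LENGTH=4096
GLOBAL_BATCH_SIZE=1024
TRAIN_STEPS=25000
TOKENS_PER_STEP=$(( SEQ_LENGTH * GLOBAL_BATCH_SIZE ))     # 4,194,304
TOTAL_TOKENS=$(( TOKENS_PER_STEP * TRAIN_STEPS ))         # 104,857,600,000 (~105B)
echo "tokens/step=${TOKENS_PER_STEP}, total tokens=${TOTAL_TOKENS}"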