
Commit 890d9dd

Merge pull request #16 from okoge-kaz/feature/v1.0.1
version v1.0.1
2 parents: e17075a + 3a9da08


50 files changed (+3556, -143 lines)

.vscode/settings.json (+2)

@@ -47,6 +47,7 @@
    "pbar",
    "peft",
    "plamo",
+   "pretraining",
    "probs",
    "psutil",
    "pubmed",
@@ -57,6 +58,7 @@
    "stabilityai",
    "stablelm",
    "stockmark",
+   "tensorboard",
    "tflops",
    "tobytes",
    "Xformer"

scripts/abci/instruction/Llama-3-8B/Llama-3-8B-instruct-v0.2.sh (+4, -4)

@@ -1,6 +1,6 @@
  #!/bin/bash
- #$ -l rt_AF=2
- #$ -l h_rt=0:01:00:00
+ #$ -l rt_AF=1
+ #$ -l h_rt=0:08:00:00
  #$ -j y
  #$ -o outputs/instruction/Llama-3-8B/
  #$ -cwd
@@ -82,7 +82,7 @@ mpirun -np $NUM_GPUS \
  -x MASTER_ADDR=$MASTER_ADDR \
  -x MASTER_PORT=$MASTER_PORT \
  -bind-to none \
- -x PATH \
+ -x NCCL_IB_TIMEOUT=22 \
  -x LD_LIBRARY_PATH \
  -x PATH \
  python examples/finetuning.py \
@@ -102,7 +102,7 @@ mpirun -np $NUM_GPUS \
  --adam-beta1 0.9 \
  --adam-beta2 0.95 \
  --adam-eps 1e-8 \
- --save-interval 500 \
+ --save-interval 10 \
  --eval-interval 500 \
  --eval-iters 10 \
  --bf16 \
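
The launcher changes above are small but operational: the job now requests one rt_AF node for eight hours instead of two nodes for one hour, checkpoints every 10 steps instead of every 500, and exports NCCL_IB_TIMEOUT through mpirun. As a rough sketch of what that timeout value means, assuming NCCL follows the usual InfiniBand convention of 4.096 µs × 2^timeout per retry (a convention not stated anywhere in this commit), the new setting allows on the order of 17 seconds per IB retry:

# Hypothetical back-of-the-envelope check, not part of the commit:
# NCCL_IB_TIMEOUT=22 under the 4.096 us * 2^t InfiniBand convention.
awk -v t=22 'BEGIN { printf "IB retry timeout: %.1f s\n", 4.096e-6 * 2^t }'
# prints: IB retry timeout: 17.2 s
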
New file (+122 lines): Llama-3.1-8B instruction-tuning script, exp1-1

#!/bin/bash
#$ -l rt_AF=8
#$ -l h_rt=1:00:00:00
#$ -j y
#$ -o outputs/instruction/Llama-3.1-8B/
#$ -cwd

# module load
source /etc/profile.d/modules.sh
module use /bb/llm/gaf51275/modules/modulefiles

module load cuda/12.1/12.1.1
module load cudnn/cuda-12.1/9.0.0
module load nccl/2.20.5
module load hpcx/2.12
module load gcc/11.4.0

# swich virtual env
source .env/bin/activate

# distributed settings
export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1)
export MASTER_PORT=$((10000 + ($JOB_ID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

# hostfile

if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then
  export NUM_GPU_PER_NODE=4
  NODE_TYPE="v100"
elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then
  export NUM_GPU_PER_NODE=8
  NODE_TYPE="a100"
else
  echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE"
fi

NUM_NODES=$NHOSTS
NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))

mkdir -p ./hostfile

HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID}
while read -r line; do
  echo "${line} slots=${NUM_GPU_PER_NODE}"
done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME"

# training config
SEQ_LENGTH=8192
DATA_PARALLEL_SIZE=$NUM_GPUS

MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=256

# optimizer config
LR=1e-5
MIN_LR=1e-6
WEIGHT_DECAY=0.1
GRAD_CLIP=1

# checkpoint
TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct
CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500
CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-1/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}"

mkdir -p ${CHECKPOINT_SAVE_DIR}

# dataset
DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1

TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl
VALID_DATA_PATH=${DATASET_DIR}/train.jsonl

# job name
JOB_NAME="Llama-3.1-8B-instruct-exp-1.1-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"

# run
mpirun -np $NUM_GPUS \
  --npernode $NUM_GPU_PER_NODE \
  -hostfile $HOSTFILE_NAME \
  -x MASTER_ADDR=$MASTER_ADDR \
  -x MASTER_PORT=$MASTER_PORT \
  -bind-to none \
  -x NCCL_IB_TIMEOUT=22 \
  -x LD_LIBRARY_PATH \
  -x PATH \
  python examples/finetuning.py \
  --seq-length ${SEQ_LENGTH} \
  --micro-batch-size ${MICRO_BATCH_SIZE} \
  --global-batch-size ${GLOBAL_BATCH_SIZE} \
  --hf-transformer-model-dir ${TOKENIZER_DIR} \
  --instruction-train-data-path ${TRAIN_DATA_PATH} \
  --instruction-valid-data-path ${VALID_DATA_PATH} \
  --epoch 2 \
  --lr ${LR} \
  --min-lr ${MIN_LR} \
  --lr-decay-style cosine \
  --weight-decay ${WEIGHT_DECAY} \
  --grad-clip-norm ${GRAD_CLIP} \
  --optimizer adam \
  --adam-beta1 0.9 \
  --adam-beta2 0.95 \
  --adam-eps 1e-8 \
  --save-interval 50000 \
  --eval-interval 500 \
  --eval-iters 10 \
  --bf16 \
  --mixed-precision \
  --base-model ${CHECKPOINT_DIR} \
  --save ${CHECKPOINT_SAVE_DIR} \
  --load ${CHECKPOINT_SAVE_DIR} \
  --low-cpu-fsdp \
  --sharding-strategy FULL_SHARD \
  --checkpoint-type LOCAL_STATE_DICT \
  --fsdp-activation-checkpointing \
  --instruction-tuning \
  --save-sampler-state \
  --use-mpi \
  --wandb-entity "prj-jalm" \
  --wandb-project "Llama-3.1-8B-Instruct" \
  --wandb-name "${JOB_NAME}"
New file (+122 lines): Llama-3.1-8B instruction-tuning script, exp1-2

#!/bin/bash
#$ -l rt_AF=8
#$ -l h_rt=1:00:00:00
#$ -j y
#$ -o outputs/instruction/Llama-3.1-8B/
#$ -cwd

# module load
source /etc/profile.d/modules.sh
module use /bb/llm/gaf51275/modules/modulefiles

module load cuda/12.1/12.1.1
module load cudnn/cuda-12.1/9.0.0
module load nccl/2.20.5
module load hpcx/2.12
module load gcc/11.4.0

# swich virtual env
source .env/bin/activate

# distributed settings
export MASTER_ADDR=$(/usr/sbin/ip a show dev bond0 | grep 'inet ' | awk '{ print $2 }' | cut -d "/" -f 1)
export MASTER_PORT=$((10000 + ($JOB_ID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

# hostfile

if [[ "$SGE_RESOURCE_TYPE" == "rt_F" ]]; then
  export NUM_GPU_PER_NODE=4
  NODE_TYPE="v100"
elif [[ "$SGE_RESOURCE_TYPE" == "rt_AF" ]]; then
  export NUM_GPU_PER_NODE=8
  NODE_TYPE="a100"
else
  echo "Unrecognized SGE_RESOURCE_TYPE: $SGE_RESOURCE_TYPE"
fi

NUM_NODES=$NHOSTS
NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))

mkdir -p ./hostfile

HOSTFILE_NAME=./hostfile/hostfile_${JOB_ID}
while read -r line; do
  echo "${line} slots=${NUM_GPU_PER_NODE}"
done <"$SGE_JOB_HOSTLIST" >"$HOSTFILE_NAME"

# training config
SEQ_LENGTH=8192
DATA_PARALLEL_SIZE=$NUM_GPUS

MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=256

# optimizer config
LR=2e-6
MIN_LR=2e-7
WEIGHT_DECAY=0.1
GRAD_CLIP=1

# checkpoint
TOKENIZER_DIR=/groups/gag51395/hf-checkpoints/Meta-Llama-3-8B-Instruct
CHECKPOINT_DIR=/bb/llm/gaf51275/2024/checkpoints/megatron-to-hf/Llama-3.1-8b/tp4-pp2-ct1-LR2.5E-5-MINLR2.5E-6-WD0.1/iter_0027500
CHECKPOINT_SAVE_DIR="/bb/llm/gaf51275/2024/checkpoints/Llama-3.1-8B-Instruct/exp1-2/LR_${LR}_MINLR_${MIN_LR}_WD_${WEIGHT_DECAY}_GC_${GRAD_CLIP}"

mkdir -p ${CHECKPOINT_SAVE_DIR}

# dataset
DATASET_DIR=/bb/llm/gaf51275/datasets/raw/instruct/training/exp1-1

TRAIN_DATA_PATH=${DATASET_DIR}/train.jsonl
VALID_DATA_PATH=${DATASET_DIR}/train.jsonl

# job name
JOB_NAME="Llama-3.1-8B-instruct-exp-1.2-BS=${GLOBAL_BATCH_SIZE}-LR=${LR}-MINLR=${MIN_LR}-WD=${WEIGHT_DECAY}-GC=${GRAD_CLIP}"

# run
mpirun -np $NUM_GPUS \
  --npernode $NUM_GPU_PER_NODE \
  -hostfile $HOSTFILE_NAME \
  -x MASTER_ADDR=$MASTER_ADDR \
  -x MASTER_PORT=$MASTER_PORT \
  -bind-to none \
  -x NCCL_IB_TIMEOUT=22 \
  -x LD_LIBRARY_PATH \
  -x PATH \
  python examples/finetuning.py \
  --seq-length ${SEQ_LENGTH} \
  --micro-batch-size ${MICRO_BATCH_SIZE} \
  --global-batch-size ${GLOBAL_BATCH_SIZE} \
  --hf-transformer-model-dir ${TOKENIZER_DIR} \
  --instruction-train-data-path ${TRAIN_DATA_PATH} \
  --instruction-valid-data-path ${VALID_DATA_PATH} \
  --epoch 2 \
  --lr ${LR} \
  --min-lr ${MIN_LR} \
  --lr-decay-style cosine \
  --weight-decay ${WEIGHT_DECAY} \
  --grad-clip-norm ${GRAD_CLIP} \
  --optimizer adam \
  --adam-beta1 0.9 \
  --adam-beta2 0.95 \
  --adam-eps 1e-8 \
  --save-interval 50000 \
  --eval-interval 500 \
  --eval-iters 10 \
  --bf16 \
  --mixed-precision \
  --base-model ${CHECKPOINT_DIR} \
  --save ${CHECKPOINT_SAVE_DIR} \
  --load ${CHECKPOINT_SAVE_DIR} \
  --low-cpu-fsdp \
  --sharding-strategy FULL_SHARD \
  --checkpoint-type LOCAL_STATE_DICT \
  --fsdp-activation-checkpointing \
  --instruction-tuning \
  --save-sampler-state \
  --use-mpi \
  --wandb-entity "prj-jalm" \
  --wandb-project "Llama-3.1-8B-Instruct" \
  --wandb-name "${JOB_NAME}"
