From 111f8b1d04fe0efb8193211514b39eeb64903673 Mon Sep 17 00:00:00 2001
From: odashi
Date: Sat, 26 Oct 2024 21:00:54 +0900
Subject: [PATCH 1/2] add converter script

---
 .../v3-13b-exp4-cpt-lr-sakura/run_convert.sh | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/run_convert.sh

diff --git a/pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/run_convert.sh b/pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/run_convert.sh
new file mode 100644
index 0000000..e571792
--- /dev/null
+++ b/pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/run_convert.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+src_root=/home/shared/experiments/0066_v3-13b-cpt-lr/checkpoints
+dest_root=/home/shared/experiments/0066_v3-13b-cpt-lr/checkpoints_hf
+
+for src_exp_dir in ${src_root}/exp*; do
+    exp_rel=$(basename ${src_exp_dir})
+
+    for src_ckpt_dir in ${src_exp_dir}/iter_???????; do
+        ckpt_rel=${exp_rel}/$(basename ${src_ckpt_dir})
+        dest_ckpt_dir=${dest_root}/${ckpt_rel}
+
+        if [ -e ${dest_ckpt_dir} ]; then
+            echo "Exists: ${ckpt_rel}"
+            continue
+        fi
+
+        mkdir -p ${dest_ckpt_dir}
+
+        sbatch \
+            --job-name=0066_convert \
+            --partition=gpu-small \
+            --priority=1 \
+            scripts/pretrain/scripts/v3-converter/convert.sh \
+            ${src_ckpt_dir} \
+            ${dest_ckpt_dir}
+        sbatch_result=$?
+
+        if [ ${sbatch_result} -eq 0 ]; then
+            echo "Queued: ${ckpt_rel}"
+        else
+            echo "Error: ${ckpt_rel}"
+            rmdir ${dest_ckpt_dir}
+        fi
+    done
+done

From 4dc08919631b8bd32328b3305fb7ca3905b33676 Mon Sep 17 00:00:00 2001
From: odashi
Date: Sat, 26 Oct 2024 21:17:58 +0900
Subject: [PATCH 2/2] fix script

---
 .../v3-13b-exp4-cpt-lr-sakura/convert.sh     | 72 +++++++++++++++++++
 .../v3-13b-exp4-cpt-lr-sakura/run_convert.sh |  3 +-
 2 files changed, 74 insertions(+), 1 deletion(-)
 create mode 100644 pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/convert.sh

diff --git a/pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/convert.sh b/pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/convert.sh
new file mode 100644
index 0000000..b683a65
--- /dev/null
+++ b/pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/convert.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+# Model conversion script: converts Megatron format checkpoints into Hugging Face format.
+#
+# This script needs one node on the `gpu` partition of the cluster.
+# The GPU is required only to verify CUDA functionality; no VRAM will actually be used.
+#
+# Usage:
+#   On a cluster with SLURM:
+#     Run `sbatch --partition {partition} convert.sh SOURCE_DIR TARGET_DIR`
+#   On a cluster without SLURM:
+#     Run `bash convert.sh SOURCE_DIR TARGET_DIR > outputs/convert.out 2> outputs/convert.err`
+#   - SOURCE_DIR: Megatron checkpoint directory including `iter_NNNNNNN`
+#   - TARGET_DIR: Output directory for the Hugging Face format
+#
+# Example:
+#   sbatch convert.sh /data/experiments/{exp-id}/checkpoints/iter_0001000 /data/experiments/{exp-id}/hf_checkpoints/iter_0001000
+#
+#SBATCH --job-name=ckpt-convert
+#SBATCH --partition=
+#SBATCH --nodes=1
+#SBATCH --gpus=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=400G
+#SBATCH --output=outputs/%x-%j.out
+#SBATCH --error=outputs/%x-%j.err
+
+set -e
+
+MEGATRON_CHECKPOINT_DIR=${1%/}
+HF_CHECKPOINT_DIR=$2
+
+ENV_DIR=environment_train
+
+source ${ENV_DIR}/scripts/environment.sh
+source ${ENV_DIR}/venv/bin/activate
+
+TOKENIZER_MODEL_DIR=${ENV_DIR}/src/llm-jp-tokenizer/hf/ver3.0/llm-jp-tokenizer-100k.ver3.0b2
+
+TARGET_ITER_DIR=$(basename $MEGATRON_CHECKPOINT_DIR)  # iter_NNNNNNN
+ITER=$(( 10#$(echo $TARGET_ITER_DIR | sed 's/^iter_//') ))  # NNNNNNN (no 0 padding)
+echo ITER=$ITER
+
+if [[ -z "$ITER" || ! "$ITER" =~ ^[0-9]+$ ]]; then  # check if directory is valid
+    >&2 echo "Error: ITER=$ITER is not a valid number. Exiting."
+    exit 1
+fi
+
+# Create a unique temporary working directory to avoid affecting the original directory and
+# to allow multiple runs to execute simultaneously.
+TMP_DIR=$(mktemp -d "${HOME}/ckpt_convert.XXXXXXXX")
+>&2 echo TMP_DIR=$TMP_DIR
+ln -s $(readlink -f $MEGATRON_CHECKPOINT_DIR) ${TMP_DIR}/${TARGET_ITER_DIR}
+echo $ITER > "${TMP_DIR}/latest_checkpointed_iteration.txt"
+
+echo "Converting $MEGATRON_CHECKPOINT_DIR"
+
+python ${ENV_DIR}/src/Megatron-LM/tools/checkpoint/convert.py \
+    --model-type GPT \
+    --loader mcore \
+    --saver llama2_hf \
+    --load-dir $TMP_DIR \
+    --save-dir $HF_CHECKPOINT_DIR \
+    --hf-tokenizer-path $TOKENIZER_MODEL_DIR \
+    --save-dtype bfloat16 \
+    --loader-transformer-impl "transformer_engine" \
+    --megatron-path ${ENV_DIR}/src/Megatron-LM
+
+cp ${TOKENIZER_MODEL_DIR}/* $HF_CHECKPOINT_DIR
+
+rm -r $TMP_DIR
+echo "Done"

diff --git a/pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/run_convert.sh b/pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/run_convert.sh
index e571792..bd51ff5 100644
--- a/pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/run_convert.sh
+++ b/pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/run_convert.sh
@@ -21,7 +21,8 @@ for src_exp_dir in ${src_root}/exp*; do
             --job-name=0066_convert \
             --partition=gpu-small \
             --priority=1 \
-            scripts/pretrain/scripts/v3-converter/convert.sh \
+            --mem=200G \
+            scripts/pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/convert.sh \
             ${src_ckpt_dir} \
             ${dest_ckpt_dir}
         sbatch_result=$?
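---

A quick way to audit the results after the queued jobs finish: a minimal sketch, assuming the same src_root/dest_root layout as run_convert.sh and that a successful conversion leaves a config.json in each Hugging Face output directory; the paths and labels here are illustrative, not part of the patch.

#!/bin/bash

src_root=/home/shared/experiments/0066_v3-13b-cpt-lr/checkpoints
dest_root=/home/shared/experiments/0066_v3-13b-cpt-lr/checkpoints_hf

for src in ${src_root}/exp*/iter_???????; do
    # Relative path of the checkpoint, e.g. exp4/iter_0001000
    rel=${src#${src_root}/}
    if [ -f "${dest_root}/${rel}/config.json" ]; then
        echo "Converted: ${rel}"
    else
        echo "Pending:   ${rel}"  # still queued, still running, or failed
    fi
done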