Add converter script for 13B CPT #59

Open · wants to merge 2 commits into main
72 changes: 72 additions & 0 deletions pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/convert.sh
@@ -0,0 +1,72 @@
#!/bin/bash
# Model conversion script: converts a Megatron-format checkpoint into Hugging Face format.
#
# This script requires one node on the `gpu` partition of the cluster.
# The conversion itself uses no VRAM, but a GPU must be present so that CUDA availability can be verified.
#
# Usage:
# On a cluster with SLURM:
# Run `sbatch --partition {partition} convert.sh SOURCE_DIR TARGET_DIR`
# On a cluster without SLURM:
# Run `bash convert.sh SOURCE_DIR TARGET_DIR > outputs/convert.out 2> outputs/convert.err`
# - SOURCE_DIR: Megatron checkpoint directory ending in `iter_NNNNNNN`
# - TARGET_DIR: Output directory for the Hugging Face format checkpoint
#
# Example:
# sbatch convert.sh /data/experiments/{exp-id}/checkpoints/iter_0001000 /data/experiments/{exp-id}/hf_checkpoints/iter_0001000
#
#SBATCH --job-name=ckpt-convert
#SBATCH --partition=<FIX_ME>
#SBATCH --nodes=1
#SBATCH --gpus=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem=400G
#SBATCH --output=outputs/%x-%j.out
#SBATCH --error=outputs/%x-%j.err

set -e

MEGATRON_CHECKPOINT_DIR=${1%/}
HF_CHECKPOINT_DIR=$2

ENV_DIR=environment_train

source ${ENV_DIR}/scripts/environment.sh
source ${ENV_DIR}/venv/bin/activate

TOKENIZER_MODEL_DIR=${ENV_DIR}/src/llm-jp-tokenizer/hf/ver3.0/llm-jp-tokenizer-100k.ver3.0b2

TARGET_ITER_DIR=$(basename $MEGATRON_CHECKPOINT_DIR) # iter_NNNNNNN
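# Force base-10 parsing so leading zeros in the directory name are not interpreted as octal.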
ITER=$(( 10#$(echo $TARGET_ITER_DIR | sed 's/^iter_//') )) # NNNNNNN (no 0 padding)
echo ITER=$ITER

if [[ -z "$ITER" || ! "$ITER" =~ ^[0-9]+$ ]]; then # check that the directory name encodes a valid iteration number
    >&2 echo "Error: ITER=$ITER is not a valid number. Exiting."
    exit 1
fi

# Create a unique temporary working directory so the original checkpoint directory is not modified and
# multiple conversions can run simultaneously.
TMP_DIR=$(mktemp -d "${HOME}/ckpt_convert.XXXXXXXX")
>&2 echo TMP_DIR=$TMP_DIR
ln -s $(readlink -f $MEGATRON_CHECKPOINT_DIR) ${TMP_DIR}/${TARGET_ITER_DIR}
echo $ITER > "${TMP_DIR}/latest_checkpointed_iteration.txt"

echo "Converting $MEGATRON_CHECKPOINT_DIR"

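# Convert the mcore-format Megatron checkpoint into a Llama-2-style Hugging Face checkpoint saved in bfloat16.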
python ${ENV_DIR}/src/Megatron-LM/tools/checkpoint/convert.py \
--model-type GPT \
--loader mcore \
--saver llama2_hf \
--load-dir $TMP_DIR \
--save-dir $HF_CHECKPOINT_DIR \
--hf-tokenizer-path $TOKENIZER_MODEL_DIR \
--save-dtype bfloat16 \
--loader-transformer-impl "transformer_engine" \
--megatron-path ${ENV_DIR}/src/Megatron-LM

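# Copy the tokenizer files alongside the converted weights so the Hugging Face checkpoint is self-contained.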
cp ${TOKENIZER_MODEL_DIR}/* $HF_CHECKPOINT_DIR

rm -r $TMP_DIR
echo "Done"
37 changes: 37 additions & 0 deletions pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/run_convert.sh
@@ -0,0 +1,37 @@
#!/bin/bash
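# Submit one conversion job (convert.sh) per Megatron checkpoint under ${src_root} that
# does not yet have a converted counterpart under ${dest_root}.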

src_root=/home/shared/experiments/0066_v3-13b-cpt-lr/checkpoints
dest_root=/home/shared/experiments/0066_v3-13b-cpt-lr/checkpoints_hf

for src_exp_dir in ${src_root}/exp*; do
    exp_rel=$(basename ${src_exp_dir})

    for src_ckpt_dir in ${src_exp_dir}/iter_???????; do
        ckpt_rel=${exp_rel}/$(basename ${src_ckpt_dir})
        dest_ckpt_dir=${dest_root}/${ckpt_rel}

        if [ -e ${dest_ckpt_dir} ]; then
            echo "Exists: ${ckpt_rel}"
            continue
        fi

        mkdir -p ${dest_ckpt_dir}

        sbatch \
            --job-name=0066_convert \
            --partition=gpu-small \
            --priority=1 \
            --mem=200G \
            scripts/pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/convert.sh \
            ${src_ckpt_dir} \
            ${dest_ckpt_dir}
        sbatch_result=$?

        if [ ${sbatch_result} -eq 0 ]; then
            echo "Queued: ${ckpt_rel}"
        else
            echo "Error: ${ckpt_rel}"
            rmdir ${dest_ckpt_dir}
        fi
    done
done
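
For reference, a minimal sketch of how these scripts are typically invoked (the single-checkpoint paths below reuse the placeholders from the convert.sh header; adjust them for the actual experiment layout):

```bash
# Convert a single checkpoint ({partition} and {exp-id} are placeholders from the convert.sh header).
sbatch --partition {partition} \
    pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/convert.sh \
    /data/experiments/{exp-id}/checkpoints/iter_0001000 \
    /data/experiments/{exp-id}/hf_checkpoints/iter_0001000

# Or sweep every exp*/iter_* checkpoint under the roots hard-coded in run_convert.sh.
bash pretrain/scripts/v3-13b-exp4-cpt-lr-sakura/run_convert.sh
```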