Skip to content

Commit

Permalink
Disable FP8 in Mcore integration test on older GPUs (#1357)
Browse files Browse the repository at this point in the history
Debug Mcore integration test

Avoid FP8 on Ampere and older. Generate synthetic data instead of depending on external data.

Signed-off-by: Tim Moon <[email protected]>
  • Loading branch information
timmoon10 authored Dec 6, 2024
1 parent d978e80 commit d8b13cb
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 4 deletions.
2 changes: 2 additions & 0 deletions qa/L1_pytorch_mcore_integration/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Megatron-LM
vocab.json
1 change: 1 addition & 0 deletions qa/L1_pytorch_mcore_integration/merges.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#version: 0.2
22 changes: 18 additions & 4 deletions qa/L1_pytorch_mcore_integration/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,27 @@ set -e
: ${TE_PATH:=/opt/transformerengine}
: ${MCORE_PATH:=${TE_PATH}/qa/L1_pytorch_mcore_integration/Megatron-LM}

# Decide whether to enable FP8 based on the first GPU reported by nvidia-smi.
# The reported compute capability has every non-digit character removed
# (e.g. a value such as "8.9" becomes "89") so it can be compared as an integer.
first_gpu_cap=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1)
DEVICE_ARCH=${first_gpu_cap//[^0-9]/}
# WITH_FP8 is only assigned when the threshold is met; otherwise it is left
# untouched, and an empty DEVICE_ARCH evaluates as 0 in the comparison.
if (( DEVICE_ARCH >= 89 )); then
  WITH_FP8=1
fi

# Download Megatron-LM if needed
# The checkout is cloned directly into MCORE_PATH (quoted, so paths containing
# whitespace work) rather than into a fixed "Megatron-LM" directory name. This
# keeps the existence check and the clone destination in agreement even when
# MCORE_PATH is overridden to a directory with a different basename.
if [[ ! -d "${MCORE_PATH}" ]]; then
  git clone -b core_r0.9.0 https://github.com/NVIDIA/Megatron-LM.git "${MCORE_PATH}"
fi

# Create mock vocab
#######################################
# Write a synthetic vocabulary as a single-line JSON object.
# The key "<|endoftext|>" maps to 0 and each integer 1..4095 (as a string key)
# maps to itself, giving 4096 entries. No trailing newline is written.
# Arguments: $1 - path of the file to create (overwritten if it exists)
# Returns:   non-zero if the file cannot be opened for writing
#######################################
write_mock_vocab() {
  local out_file=$1
  # A single redirection truncates and opens the file once; the path is quoted
  # so that directories containing whitespace are handled correctly.
  {
    printf '%s' '{"<|endoftext|>": 0'
    seq 1 4095 | awk '{ printf(", \"%d\": %d", $1, $1) }'
    printf '%s' '}'
  } > "${out_file}"
}

VOCAB_FILE="${TE_PATH}/qa/L1_pytorch_mcore_integration/vocab.json"
write_mock_vocab "${VOCAB_FILE}"

# Megatron-LM invocation
COMMAND="
NVTE_TORCH_COMPILE=0
Expand All @@ -40,17 +54,17 @@ ${MCORE_PATH}/pretrain_gpt.py
--hidden-size 128
--num-attention-heads 8
--seq-length 128
--max-position-embeddings 2048
--max-position-embeddings 128
--micro-batch-size 1
--global-batch-size 8
--train-iters 10
--eval-iters 10
--lr 1e-4
--mock-data
--vocab-file /data/gpt3/pile-cc1-cc2-shuf/bpe/gpt2-vocab.json
--merge-file /data/gpt3/pile-cc1-cc2-shuf/bpe/gpt2-merges.txt
--vocab-file ${VOCAB_FILE}
--merge-file ${TE_PATH}/qa/L1_pytorch_mcore_integration/merges.txt
--transformer-impl transformer_engine
--fp8-format hybrid
${WITH_FP8:+--fp8-format hybrid}
"
COMMAND=$(echo "${COMMAND}" | tr '\n' ' ')

Expand Down

0 comments on commit d8b13cb

Please sign in to comment.