Skip to content

Commit

Permalink
Merge branch 'maanug/refactor-metric-check' into 'main'
Browse files Browse the repository at this point in the history
Move metrics pytest into individual jobs

See merge request ADLR/megatron-lm!1257
  • Loading branch information
jaredcasper committed Mar 25, 2024
2 parents 3a70d14 + 212ce8d commit 7d40bd3
Show file tree
Hide file tree
Showing 13 changed files with 94 additions and 33 deletions.
14 changes: 1 addition & 13 deletions jet-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ jet-trigger:
JET_WORKLOADS_FILTER: "$_JET_FILTER"


jet-functional-results:
jet-results-summary:
stage: jet
image: gitlab-master.nvidia.com:5005/dl/jet/api:latest
tags:
Expand All @@ -80,15 +80,3 @@ jet-functional-results:
- if: $JET_CUSTOM_FILTER != "" && $CI_PIPELINE_SOURCE != 'merge_request_event'
when: always
- when: never

# NOTE(review): the jet-tests.yml header above reports "1 addition & 13
# deletions" and this hunk shrinks from 15 lines to 3, so this job appears to
# be the part the commit removes. Indentation was lost in this rendered view.
# CI job that extends .jet_common and depends on jet-functional-results.
jet-compare-metrics:
extends: .jet_common
image: gitlab-master.nvidia.com:5005/dl/jet/api:latest
tags:
- os/linux
needs: [ jet-functional-results ]
before_script:
# jwt-login through the jet CLI, passing the job's $CI_JOB_JWT.
- jet secrets jwt-login jwt/nvidia/gitlab-master adlr-megatron-lm-ci $CI_JOB_JWT
script:
# Install/upgrade pytest and tensorboard without using the pip cache.
- python -m pip install -U --no-cache-dir pytest tensorboard
# Run jet_test_pipeline.py for the current pipeline ID with "--test metrics".
- python tests/functional_tests/python_test_utils/jet_test_pipeline.py ${CI_PIPELINE_ID} --test metrics
5 changes: 2 additions & 3 deletions tests/functional_tests/jet_recipes/MR-bert.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,8 @@ spec:
MBS={micro_batch_size} \
GBS={batch_size} \
CHECKPOINT_RESUME_TEST={checkpoint_resume_test} \
ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} && \
python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py {assets_dir} "" | \
tee {assets_dir}/results.json
JOB_NAME={key.split("/")[1]} \
ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'}
products:
# MCore
- {tp_size: [2], pp_size: [2]}
Expand Down
5 changes: 2 additions & 3 deletions tests/functional_tests/jet_recipes/MR-gpt.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,8 @@ spec:
MOE_GROUPED_GEMM={moe_grouped_gemm} \
CKPT_FORMAT={ckpt_format} \
CHECKPOINT_RESUME_TEST={checkpoint_resume_test} \
ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} && \
python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py {assets_dir} "" | \
tee {assets_dir}/results.json
JOB_NAME={key.split("/")[1]} \
ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'}
products:
# MCore
- {tp_size: [2], pp_size: [2]}
Expand Down
5 changes: 2 additions & 3 deletions tests/functional_tests/jet_recipes/MR-t5.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,7 @@ spec:
VP_SIZE={vp_size if vp_size is not None else '""'} \
MBS={micro_batch_size} \
GBS={batch_size} \
ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} && \
python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py {assets_dir} "" | \
tee {assets_dir}/results.json
JOB_NAME={key.split("/")[1]} \
ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'}
products:
- {use_te: [True], tp_size: [1], pp_size: [1], vp_size: [1]}
5 changes: 2 additions & 3 deletions tests/functional_tests/jet_recipes/monthly-t5.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,8 @@ spec:
MBS={micro_batch_size} \
GBS={batch_size} \
CHECKPOINT_RESUME_TEST={checkpoint_resume_test} \
ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} && \
python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py {assets_dir} "" | \
tee {assets_dir}/results.json
JOB_NAME={key.split("/")[1]} \
ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'}
products:
- { tp_size: [1,2], pp_size: [1], vp_size: [1] }
- {use_te: [True], tp_size: [2], pp_size: [1], vp_size: [1]}
Expand Down
5 changes: 2 additions & 3 deletions tests/functional_tests/jet_recipes/nightly-bert.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,8 @@ spec:
VP_SIZE={vp_size if vp_size is not None else '""'} \
MBS={micro_batch_size} \
GBS={batch_size} \
ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} && \
python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py {assets_dir} "" | \
tee {assets_dir}/results.json
JOB_NAME={key.split("/")[1]} \
ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'}
products:
- {tp_size: [1], pp_size: [4], vp_size: [2]}
- {use_mcore: [True, False], tp_size: [4], pp_size: [1]}
Expand Down
5 changes: 2 additions & 3 deletions tests/functional_tests/jet_recipes/nightly-gpt.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,8 @@ spec:
MBS={micro_batch_size} \
GBS={batch_size} \
MOE_GROUPED_GEMM={moe_grouped_gemm} \
ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} && \
python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py {assets_dir} "" | \
tee {assets_dir}/results.json
JOB_NAME={key.split("/")[1]} \
ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'}
products:
- {use_mcore: [True, False], tp_size: [4], pp_size: [1]}
- {use_mcore: [True, False], tp_size: [1], pp_size: [2,4]}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,8 @@ def collect_train_test_metrics(logs_dir, run_name):
},
"iteration_timing_avg": iteration_time_avg,
}
model_name = run_name.split('_')[0]
str_train_metrics = str(train_metrics).replace("'", "\"")
print(f"\n ----------- Store the following metrics in tests/functional_tests/test_results/${model_name}/{run_name}.json ----------")
print(f"\n ----------- Store the following metrics in tests/functional_tests/test_results/jet/{run_name}.json ----------")
print(f"\n {str_train_metrics}", flush=True)

if __name__ == '__main__':
Expand Down
8 changes: 8 additions & 0 deletions tests/functional_tests/python_test_utils/jet_test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,18 @@ def check_exitcodes(results):
exit_codes = []
log_urls = []
names = []
metrics_file_urls = []
for result in results:
exit_codes.append(result.get('l_exit_code', -1))
log_urls.append(select_asset(result, 'output_script-0.log'))
names.append(result['obj_workload']['s_key'].split('basic/')[-1])
metrics_file_urls.append(select_asset(result, 'results.json'))

metrics_table = PrettyTable()
metrics_table.add_column("Job Key", names)
metrics_table.add_column("Results Data", metrics_file_urls)
metrics_table.align["Job Key"] = 'l'
print(metrics_table)

table = PrettyTable()
table.add_column("Job Key", names)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,21 @@ echo "--------------------------------------------------------------------------

echo "$command" > $SCRIPTS_DIR/pretrain_bert_distributed_command.sh
eval $command

# Run get_test_results_from_tensorboard_logs.py on the TensorBoard directory and
# keep its output as results.json in that same directory; tee also echoes the
# output to stdout. Paths are quoted so a directory containing spaces or glob
# characters is passed as a single argument.
# NOTE(review): the exit status of this pipeline is tee's, so a failure of the
# Python script is not detected here unless `set -o pipefail` is active -- confirm.
echo "Saving test results to $TENSORBOARD_DIR"
python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py "$TENSORBOARD_DIR" "$JOB_NAME" | \
    tee "${TENSORBOARD_DIR}/results.json"

# SKIP_PYTEST=1 bypasses every pytest check below.
if [[ $SKIP_PYTEST != 1 ]]; then
    echo "-----------------------------------------------------------------------------"
    if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then
        # Checkpoint-resume jobs run the resume-checkpoint pytest module.
        echo "Running pytest 1st vs 2nd run comparison"
        export LOGS_DIR="$TENSORBOARD_DIR"
        pytest ./tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py
    else
        # All other jobs are checked against the per-job expected-metrics file
        # stored under test_results/jet/ and named after JOB_NAME.
        echo "Running pytest checks against golden values"
        export EXPECTED_METRICS_FILE="./tests/functional_tests/test_results/jet/${JOB_NAME}.json"
        export LOGS_DIR="$TENSORBOARD_DIR"
        pytest ./tests/functional_tests/python_test_utils/test_ci_pipeline.py
    fi
fi
Original file line number Diff line number Diff line change
Expand Up @@ -161,3 +161,21 @@ echo "--------------------------------------------------------------------------

echo "$command" > $SCRIPTS_DIR/pretrain_gpt3_distributed_command.sh
eval $command

# Run get_test_results_from_tensorboard_logs.py on the TensorBoard directory and
# keep its output as results.json in that same directory; tee also echoes the
# output to stdout. Paths are quoted so a directory containing spaces or glob
# characters is passed as a single argument.
# NOTE(review): the exit status of this pipeline is tee's, so a failure of the
# Python script is not detected here unless `set -o pipefail` is active -- confirm.
echo "Saving test results to $TENSORBOARD_DIR"
python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py "$TENSORBOARD_DIR" "$JOB_NAME" | \
    tee "${TENSORBOARD_DIR}/results.json"

# SKIP_PYTEST=1 bypasses every pytest check below.
if [[ $SKIP_PYTEST != 1 ]]; then
    echo "-----------------------------------------------------------------------------"
    if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then
        # Checkpoint-resume jobs run the resume-checkpoint pytest module.
        echo "Running pytest 1st vs 2nd run comparison"
        export LOGS_DIR="$TENSORBOARD_DIR"
        pytest ./tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py
    else
        # All other jobs are checked against the per-job expected-metrics file
        # stored under test_results/jet/ and named after JOB_NAME.
        echo "Running pytest checks against golden values"
        export EXPECTED_METRICS_FILE="./tests/functional_tests/test_results/jet/${JOB_NAME}.json"
        export LOGS_DIR="$TENSORBOARD_DIR"
        pytest ./tests/functional_tests/python_test_utils/test_ci_pipeline.py
    fi
fi
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,21 @@ pip install faiss-gpu

echo "$command" > $SCRIPTS_DIR/pretrain_retro_distributed_command.sh
eval $command

# Run get_test_results_from_tensorboard_logs.py on the TensorBoard directory and
# keep its output as results.json in that same directory; tee also echoes the
# output to stdout. Paths are quoted so a directory containing spaces or glob
# characters is passed as a single argument.
# NOTE(review): the exit status of this pipeline is tee's, so a failure of the
# Python script is not detected here unless `set -o pipefail` is active -- confirm.
echo "Saving test results to $TENSORBOARD_DIR"
python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py "$TENSORBOARD_DIR" "$JOB_NAME" | \
    tee "${TENSORBOARD_DIR}/results.json"

# SKIP_PYTEST=1 bypasses every pytest check below.
if [[ $SKIP_PYTEST != 1 ]]; then
    echo "-----------------------------------------------------------------------------"
    if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then
        # Checkpoint-resume jobs run the resume-checkpoint pytest module.
        echo "Running pytest 1st vs 2nd run comparison"
        export LOGS_DIR="$TENSORBOARD_DIR"
        pytest ./tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py
    else
        # All other jobs are checked against the per-job expected-metrics file
        # stored under test_results/jet/ and named after JOB_NAME.
        echo "Running pytest checks against golden values"
        export EXPECTED_METRICS_FILE="./tests/functional_tests/test_results/jet/${JOB_NAME}.json"
        export LOGS_DIR="$TENSORBOARD_DIR"
        pytest ./tests/functional_tests/python_test_utils/test_ci_pipeline.py
    fi
fi
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,21 @@ echo "--------------------------------------------------------------------------

echo "$command" > $SCRIPTS_DIR/pretrain_t5_distributed_command.sh
eval $command

# Run get_test_results_from_tensorboard_logs.py on the TensorBoard directory and
# keep its output as results.json in that same directory; tee also echoes the
# output to stdout. Paths are quoted so a directory containing spaces or glob
# characters is passed as a single argument.
# NOTE(review): the exit status of this pipeline is tee's, so a failure of the
# Python script is not detected here unless `set -o pipefail` is active -- confirm.
echo "Saving test results to $TENSORBOARD_DIR"
python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py "$TENSORBOARD_DIR" "$JOB_NAME" | \
    tee "${TENSORBOARD_DIR}/results.json"

# SKIP_PYTEST=1 bypasses every pytest check below.
if [[ $SKIP_PYTEST != 1 ]]; then
    echo "-----------------------------------------------------------------------------"
    if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then
        # Checkpoint-resume jobs run the resume-checkpoint pytest module.
        echo "Running pytest 1st vs 2nd run comparison"
        export LOGS_DIR="$TENSORBOARD_DIR"
        pytest ./tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py
    else
        # All other jobs are checked against the per-job expected-metrics file
        # stored under test_results/jet/ and named after JOB_NAME.
        echo "Running pytest checks against golden values"
        export EXPECTED_METRICS_FILE="./tests/functional_tests/test_results/jet/${JOB_NAME}.json"
        export LOGS_DIR="$TENSORBOARD_DIR"
        pytest ./tests/functional_tests/python_test_utils/test_ci_pipeline.py
    fi
fi

0 comments on commit 7d40bd3

Please sign in to comment.