From 212ce8dafefa053ebde42f2a3351efb17f5ed2a6 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Mon, 25 Mar 2024 14:19:41 -0700 Subject: [PATCH] Move metrics pytest into individual jobs --- jet-tests.yml | 14 +------------- .../functional_tests/jet_recipes/MR-bert.yaml | 5 ++--- tests/functional_tests/jet_recipes/MR-gpt.yaml | 5 ++--- tests/functional_tests/jet_recipes/MR-t5.yaml | 5 ++--- .../jet_recipes/monthly-t5.yaml | 5 ++--- .../jet_recipes/nightly-bert.yaml | 5 ++--- .../jet_recipes/nightly-gpt.yaml | 5 ++--- .../get_test_results_from_tensorboard_logs.py | 3 +-- .../python_test_utils/jet_test_pipeline.py | 8 ++++++++ .../bert/pretrain_bert_distributed_test.sh | 18 ++++++++++++++++++ .../gpt3/pretrain_gpt3_distributed_test.sh | 18 ++++++++++++++++++ .../retro/pretrain_retro_distributed_test.sh | 18 ++++++++++++++++++ .../t5/pretrain_t5_distributed_test.sh | 18 ++++++++++++++++++ 13 files changed, 94 insertions(+), 33 deletions(-) diff --git a/jet-tests.yml b/jet-tests.yml index 780fa94862..5fdaa65a6e 100644 --- a/jet-tests.yml +++ b/jet-tests.yml @@ -63,7 +63,7 @@ jet-trigger: JET_WORKLOADS_FILTER: "$_JET_FILTER" -jet-functional-results: +jet-results-summary: stage: jet image: gitlab-master.nvidia.com:5005/dl/jet/api:latest tags: @@ -80,15 +80,3 @@ jet-functional-results: - if: $JET_CUSTOM_FILTER != "" && $CI_PIPELINE_SOURCE != 'merge_request_event' when: always - when: never - -jet-compare-metrics: - extends: .jet_common - image: gitlab-master.nvidia.com:5005/dl/jet/api:latest - tags: - - os/linux - needs: [ jet-functional-results ] - before_script: - - jet secrets jwt-login jwt/nvidia/gitlab-master adlr-megatron-lm-ci $CI_JOB_JWT - script: - - python -m pip install -U --no-cache-dir pytest tensorboard - - python tests/functional_tests/python_test_utils/jet_test_pipeline.py ${CI_PIPELINE_ID} --test metrics diff --git a/tests/functional_tests/jet_recipes/MR-bert.yaml b/tests/functional_tests/jet_recipes/MR-bert.yaml index c43532d36d..e197c227f6 100644 --- a/tests/functional_tests/jet_recipes/MR-bert.yaml +++ b/tests/functional_tests/jet_recipes/MR-bert.yaml @@ -45,9 +45,8 @@ spec: MBS={micro_batch_size} \ GBS={batch_size} \ CHECKPOINT_RESUME_TEST={checkpoint_resume_test} \ - ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} && \ - python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py {assets_dir} "" | \ - tee {assets_dir}/results.json + JOB_NAME={key.split("/")[1]} \ + ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: # MCore - {tp_size: [2], pp_size: [2]} diff --git a/tests/functional_tests/jet_recipes/MR-gpt.yaml b/tests/functional_tests/jet_recipes/MR-gpt.yaml index 40db7c4364..b322a4ce3a 100644 --- a/tests/functional_tests/jet_recipes/MR-gpt.yaml +++ b/tests/functional_tests/jet_recipes/MR-gpt.yaml @@ -51,9 +51,8 @@ spec: MOE_GROUPED_GEMM={moe_grouped_gemm} \ CKPT_FORMAT={ckpt_format} \ CHECKPOINT_RESUME_TEST={checkpoint_resume_test} \ - ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} && \ - python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py {assets_dir} "" | \ - tee {assets_dir}/results.json + JOB_NAME={key.split("/")[1]} \ + ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: # MCore - {tp_size: [2], pp_size: [2]} diff --git a/tests/functional_tests/jet_recipes/MR-t5.yaml b/tests/functional_tests/jet_recipes/MR-t5.yaml index 31e00096e0..49548ad68c 100644 --- a/tests/functional_tests/jet_recipes/MR-t5.yaml +++ b/tests/functional_tests/jet_recipes/MR-t5.yaml @@ -43,8 +43,7 @@ spec: VP_SIZE={vp_size if vp_size is not None else '""'} \ MBS={micro_batch_size} \ GBS={batch_size} \ - ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} && \ - python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py {assets_dir} "" | \ - tee {assets_dir}/results.json + JOB_NAME={key.split("/")[1]} \ + ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: - {use_te: [True], tp_size: [1], pp_size: [1], vp_size: [1]} diff --git a/tests/functional_tests/jet_recipes/monthly-t5.yaml b/tests/functional_tests/jet_recipes/monthly-t5.yaml index 1b8263899f..0c5cabd17d 100644 --- a/tests/functional_tests/jet_recipes/monthly-t5.yaml +++ b/tests/functional_tests/jet_recipes/monthly-t5.yaml @@ -45,9 +45,8 @@ spec: MBS={micro_batch_size} \ GBS={batch_size} \ CHECKPOINT_RESUME_TEST={checkpoint_resume_test} \ - ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} && \ - python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py {assets_dir} "" | \ - tee {assets_dir}/results.json + JOB_NAME={key.split("/")[1]} \ + ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: - { tp_size: [1,2], pp_size: [1], vp_size: [1] } - {use_te: [True], tp_size: [2], pp_size: [1], vp_size: [1]} diff --git a/tests/functional_tests/jet_recipes/nightly-bert.yaml b/tests/functional_tests/jet_recipes/nightly-bert.yaml index e3b42128c5..84b1c8cf56 100644 --- a/tests/functional_tests/jet_recipes/nightly-bert.yaml +++ b/tests/functional_tests/jet_recipes/nightly-bert.yaml @@ -43,9 +43,8 @@ spec: VP_SIZE={vp_size if vp_size is not None else '""'} \ MBS={micro_batch_size} \ GBS={batch_size} \ - ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} && \ - python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py {assets_dir} "" | \ - tee {assets_dir}/results.json + JOB_NAME={key.split("/")[1]} \ + ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: - {tp_size: [1], pp_size: [4], vp_size: [2]} - {use_mcore: [True, False], tp_size: [4], pp_size: [1]} diff --git a/tests/functional_tests/jet_recipes/nightly-gpt.yaml b/tests/functional_tests/jet_recipes/nightly-gpt.yaml index 3e26c51acb..166636f1fd 100644 --- a/tests/functional_tests/jet_recipes/nightly-gpt.yaml +++ b/tests/functional_tests/jet_recipes/nightly-gpt.yaml @@ -47,9 +47,8 @@ spec: MBS={micro_batch_size} \ GBS={batch_size} \ MOE_GROUPED_GEMM={moe_grouped_gemm} \ - ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} && \ - python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py {assets_dir} "" | \ - tee {assets_dir}/results.json + JOB_NAME={key.split("/")[1]} \ + ADDITIONAL_PARAMS={extra_args if extra_args is not None else '""'} products: - {use_mcore: [True, False], tp_size: [4], pp_size: [1]} - {use_mcore: [True, False], tp_size: [1], pp_size: [2,4]} diff --git a/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py b/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py index cfb0772a04..5356282df7 100644 --- a/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py +++ b/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py @@ -59,9 +59,8 @@ def collect_train_test_metrics(logs_dir, run_name): }, "iteration_timing_avg": iteration_time_avg, } - model_name = run_name.split('_')[0] str_train_metrics = str(train_metrics).replace("'", "\"") - print(f"\n ----------- Store the following metrics in tests/functional_tests/test_results/${model_name}/{run_name}.json ----------") + print(f"\n ----------- Store the following metrics in tests/functional_tests/test_results/jet/{run_name}.json ----------") print(f"\n {str_train_metrics}", flush=True) if __name__ == '__main__': diff --git a/tests/functional_tests/python_test_utils/jet_test_pipeline.py b/tests/functional_tests/python_test_utils/jet_test_pipeline.py index b2c44f21cc..05f82eb33b 100644 --- a/tests/functional_tests/python_test_utils/jet_test_pipeline.py +++ b/tests/functional_tests/python_test_utils/jet_test_pipeline.py @@ -44,10 +44,18 @@ def check_exitcodes(results): exit_codes = [] log_urls = [] names = [] + metrics_file_urls = [] for result in results: exit_codes.append(result.get('l_exit_code', -1)) log_urls.append(select_asset(result, 'output_script-0.log')) names.append(result['obj_workload']['s_key'].split('basic/')[-1]) + metrics_file_urls.append(select_asset(result, 'results.json')) + + metrics_table = PrettyTable() + metrics_table.add_column("Job Key", names) + metrics_table.add_column("Results Data", metrics_file_urls) + metrics_table.align["Job Key"] = 'l' + print(metrics_table) table = PrettyTable() table.add_column("Job Key", names) diff --git a/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh b/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh index 8a3bee48b8..50cfc83cfc 100755 --- a/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh +++ b/tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh @@ -103,3 +103,21 @@ echo "-------------------------------------------------------------------------- echo "$command" > $SCRIPTS_DIR/pretrain_bert_distributed_command.sh eval $command + +echo "Saving test results to $TENSORBOARD_DIR" +python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ + tee ${TENSORBOARD_DIR}/results.json + +if [[ $SKIP_PYTEST != 1 ]]; then + echo "-----------------------------------------------------------------------------" + if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + echo "Running pytest 1st vs 2nd run comparison" + export LOGS_DIR=$TENSORBOARD_DIR + pytest ./tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py + else + echo "Running pytest checks against golden values" + export EXPECTED_METRICS_FILE="./tests/functional_tests/test_results/jet/${JOB_NAME}.json" + export LOGS_DIR=$TENSORBOARD_DIR + pytest ./tests/functional_tests/python_test_utils/test_ci_pipeline.py + fi +fi diff --git a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh index 8a240c547c..53cdc096b5 100755 --- a/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh +++ b/tests/functional_tests/test_scripts/gpt3/pretrain_gpt3_distributed_test.sh @@ -161,3 +161,21 @@ echo "-------------------------------------------------------------------------- echo "$command" > $SCRIPTS_DIR/pretrain_gpt3_distributed_command.sh eval $command + +echo "Saving test results to $TENSORBOARD_DIR" +python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ + tee ${TENSORBOARD_DIR}/results.json + +if [[ $SKIP_PYTEST != 1 ]]; then + echo "-----------------------------------------------------------------------------" + if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + echo "Running pytest 1st vs 2nd run comparison" + export LOGS_DIR=$TENSORBOARD_DIR + pytest ./tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py + else + echo "Running pytest checks against golden values" + export EXPECTED_METRICS_FILE="./tests/functional_tests/test_results/jet/${JOB_NAME}.json" + export LOGS_DIR=$TENSORBOARD_DIR + pytest ./tests/functional_tests/python_test_utils/test_ci_pipeline.py + fi +fi diff --git a/tests/functional_tests/test_scripts/retro/pretrain_retro_distributed_test.sh b/tests/functional_tests/test_scripts/retro/pretrain_retro_distributed_test.sh index 0d7203bdc6..446853fec1 100755 --- a/tests/functional_tests/test_scripts/retro/pretrain_retro_distributed_test.sh +++ b/tests/functional_tests/test_scripts/retro/pretrain_retro_distributed_test.sh @@ -148,3 +148,21 @@ pip install faiss-gpu echo "$command" > $SCRIPTS_DIR/pretrain_retro_distributed_command.sh eval $command + +echo "Saving test results to $TENSORBOARD_DIR" +python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ + tee ${TENSORBOARD_DIR}/results.json + +if [[ $SKIP_PYTEST != 1 ]]; then + echo "-----------------------------------------------------------------------------" + if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + echo "Running pytest 1st vs 2nd run comparison" + export LOGS_DIR=$TENSORBOARD_DIR + pytest ./tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py + else + echo "Running pytest checks against golden values" + export EXPECTED_METRICS_FILE="./tests/functional_tests/test_results/jet/${JOB_NAME}.json" + export LOGS_DIR=$TENSORBOARD_DIR + pytest ./tests/functional_tests/python_test_utils/test_ci_pipeline.py + fi +fi diff --git a/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh b/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh index c093b35909..86107f4cfe 100755 --- a/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh +++ b/tests/functional_tests/test_scripts/t5/pretrain_t5_distributed_test.sh @@ -129,3 +129,21 @@ echo "-------------------------------------------------------------------------- echo "$command" > $SCRIPTS_DIR/pretrain_t5_distributed_command.sh eval $command + +echo "Saving test results to $TENSORBOARD_DIR" +python3 ./tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $TENSORBOARD_DIR "$JOB_NAME" | \ + tee ${TENSORBOARD_DIR}/results.json + +if [[ $SKIP_PYTEST != 1 ]]; then + echo "-----------------------------------------------------------------------------" + if [[ $CHECKPOINT_RESUME_TEST -eq 1 ]]; then + echo "Running pytest 1st vs 2nd run comparison" + export LOGS_DIR=$TENSORBOARD_DIR + pytest ./tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py + else + echo "Running pytest checks against golden values" + export EXPECTED_METRICS_FILE="./tests/functional_tests/test_results/jet/${JOB_NAME}.json" + export LOGS_DIR=$TENSORBOARD_DIR + pytest ./tests/functional_tests/python_test_utils/test_ci_pipeline.py + fi +fi