From 4aada1bedb0b8ea11633c6784ff0f720e9f358ea Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Thu, 16 Jan 2025 05:57:44 -0800 Subject: [PATCH] ADLR/megatron-lm!2541 - ci: Add frozen checkpoints --- tests/test_utils/python_scripts/launch_jet_workload.py | 2 ++ tests/test_utils/recipes/bert.yaml | 2 +- tests/test_utils/recipes/gpt.yaml | 4 ++-- tests/test_utils/recipes/t5.yaml | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_utils/python_scripts/launch_jet_workload.py b/tests/test_utils/python_scripts/launch_jet_workload.py index 7b2be02fdc..dce2515ada 100644 --- a/tests/test_utils/python_scripts/launch_jet_workload.py +++ b/tests/test_utils/python_scripts/launch_jet_workload.py @@ -274,12 +274,14 @@ def main( logger.error(e) time.sleep((3**n_download_attempt) * 60) n_download_attempt += 1 + no_log = True except KeyError as e: logger.error(e) break no_log = True if no_log: + logger.error("Did not find any logs to download, retry.") continue concat_logs = "\n".join(logs) diff --git a/tests/test_utils/recipes/bert.yaml b/tests/test_utils/recipes/bert.yaml index 0d422e00b0..e1e0a23cf9 100644 --- a/tests/test_utils/recipes/bert.yaml +++ b/tests/test_utils/recipes/bert.yaml @@ -13,7 +13,7 @@ spec: n_repeat: artifacts: /workspace/data/bert_data: text/the_pile/bert_shard00 - # /workspace/checkpoints/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G_dev: model/mcore_bert/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G_dev/22390338 + /workspace/checkpoints/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G_dev: model/mcore_bert/bert_mr_mcore_tp2_pp2_frozen_resume_torch_dist_dgx_a100_1N8G_dev/22410107 script: |- ls cd /opt/megatron-lm diff --git a/tests/test_utils/recipes/gpt.yaml b/tests/test_utils/recipes/gpt.yaml index 117f778e16..1937133319 100644 --- a/tests/test_utils/recipes/gpt.yaml +++ b/tests/test_utils/recipes/gpt.yaml @@ -12,8 +12,8 @@ spec: platforms: dgx_a100 artifacts: /workspace/data/gpt3_data: text/the_pile/shard00 - # /workspace/checkpoints/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G_dev/22390338 - # /workspace/checkpoints/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G_dev/22390338 + /workspace/checkpoints/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp4_pp2_frozen_resume_torch_dist_reshard_8x1xNone_dgx_a100_1N8G_dev/22410107 + /workspace/checkpoints/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G_dev: model/mcore_gpt/gpt3_mr_mcore_te_tp2_pp1_frozen_resume_torch_dist_te_8experts2parallel_dist_optimizer_dgx_a100_1N8G_dev/22410107 script: |- ls cd /opt/megatron-lm diff --git a/tests/test_utils/recipes/t5.yaml b/tests/test_utils/recipes/t5.yaml index 2f81fe33ec..5aa91522c1 100644 --- a/tests/test_utils/recipes/t5.yaml +++ b/tests/test_utils/recipes/t5.yaml @@ -11,7 +11,7 @@ spec: platforms: dgx_a100 artifacts: /workspace/data/t5_data: text/the_pile/t5_shard00 - # /workspace/checkpoints/t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G_dev: model/mcore_t5/t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G_dev/22390338 + /workspace/checkpoints/t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G_dev: model/mcore_t5/t5_220m_mr_mcore_te_tp2_pp2_frozen_resume_torch_dgx_a100_1N8G_dev/22410107 script: |- ls cd /opt/megatron-lm