Skip to content

Commit

Permalink
Merge branch 'ko3n1g/ci/retry-on-missing-downloads' into 'main'
Browse files: browse the repository at this point in the history
ci: Retry on failed logs

See merge request ADLR/megatron-lm!2540
  • Loading branch information
ko3n1g committed Jan 15, 2025
2 parents 699a0ec + 4364bfb commit 004fbcb
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions tests/test_utils/python_scripts/launch_jet_workload.py
Original file line number | Diff line number | Diff line change
@@ -276,12 +276,18 @@ def main(
n_download_attempt += 1
except KeyError as e:
logger.error(e)
break
no_log = True

if no_log:
continue

concat_logs = "\n".join(logs)
if concat_logs.strip() == "":
logger.error("No logs found. Try again.")
n_attempts += 1
continue

print(f"Logs:\n{concat_logs}")

success = pipeline.get_status() == PipelineStatus.SUCCESS
@@ -298,9 +304,7 @@ def main(
"Some NCCL operations have failed or timed out." in concat_logs
or "uncorrectable ECC error encountered" in concat_logs
or "illegal memory access" in concat_logs
or "illegal instruction" in concat_logs
or "NCCL WARN [Service thread] Accept failed Resource temporarily unavailable"
in concat_logs
or "illegal instruction" in concat_logs in concat_logs
):
logger.error("Detected NCCL failure, attempt restart.")
n_attempts += 1
Expand Down

0 comments on commit 004fbcb

Please sign in to comment.