27 commits
c0a7312  fix small errors (Jan 14, 2026)
5d1b27d  cicd tests config (Jan 14, 2026)
038304c  update script (Jan 14, 2026)
5691b0c  Update script for pretrain megatron wan testing (chtruong814, Jan 21, 2026)
6c68a7e  update nightly/weekly scripts (Jan 21, 2026)
3da7a88  Fix test (chtruong814, Jan 22, 2026)
ff60785  Fix test env var eval (chtruong814, Jan 22, 2026)
f4df286  Fix test script (chtruong814, Jan 23, 2026)
73bf04e  Test (chtruong814, Jan 23, 2026)
c2d34ab  Fix dockerfile (chtruong814, Jan 24, 2026)
7baf76a  Add release stage (chtruong814, Jan 24, 2026)
d5d7f43  Fix torchrun in docker (chtruong814, Jan 24, 2026)
33e66e2  Create run_example.sh (chtruong814, Jan 25, 2026)
5932ad0  Ensure RDZV_PORT is set (chtruong814, Jan 25, 2026)
2ea9e6a  Update weekly tests (chtruong814, Jan 26, 2026)
9ea3827  Fix weekly tests (chtruong814, Jan 26, 2026)
d4a1172  Fix tests (chtruong814, Jan 26, 2026)
e6c5837  Add comment to script (chtruong814, Jan 26, 2026)
d261dfe  Merge remote-tracking branch 'origin/main' into chtruong/nightly-ci (chtruong814, Jan 31, 2026)
7f3c2d9  Refactor megatron test scripts to allow pretrain dir and iters (chtruong814, Jan 31, 2026)
af8c73b  Refactor automodel nightly tests (chtruong814, Jan 31, 2026)
4b513b8  Ensure megatron wan video test loads checkpoint from image model (chtruong814, Jan 31, 2026)
63c607b  Fix automodel overrides (chtruong814, Feb 1, 2026)
9d01e1b  Fix checkpoint load for automodel (chtruong814, Feb 1, 2026)
1db21eb  Refactor test location (chtruong814, Feb 4, 2026)
f7bf200  Update Dockerfile to include DFM source (chtruong814, Feb 4, 2026)
0ed6982  Move nightly weekly scripts to tests/scheduled (chtruong814, Feb 4, 2026)
1 change: 0 additions & 1 deletion .github/actions/test-template/action.yml
@@ -188,7 +188,6 @@ runs:
       docker exec -t nemo_container_${{ github.run_id }} bash -c '
         set -e
         source /opt/venv/bin/activate
-        uv pip install --no-deps -e .
 
         timeout $(( ${{ inputs.timeout }} * 60 ))s bash tests/${{ inputs.is_unit_test == 'true' && 'unit_tests' || 'functional_tests' }}/${{ inputs.script }}.sh || EXIT_CODE=$?
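The per-run editable install is dropped because the CI image now installs DFM at build time (see the Dockerfile.ci change below), so the package is already importable inside the container. A hypothetical spot check, not part of the PR:

    docker exec -t nemo_container_<run_id> bash -c 'source /opt/venv/bin/activate && python -c "import dfm"'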
2 changes: 2 additions & 0 deletions dfm/src/megatron/model/wan/inference/utils.py
@@ -43,6 +43,8 @@ def cache_video(tensor, save_file=None, fps=30, suffix=".mp4", nrow=8, normalize
     for _ in range(retry):
         try:
             # preprocess
+            if not tensor.is_floating_point():
+                tensor = tensor.float()
             tensor = tensor.clamp(min(value_range), max(value_range))
             tensor = torch.stack(
                 [
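The added guard converts integer-typed frames (for example uint8 video output) to float before the clamp/normalize math that follows, which assumes floating-point input. A minimal sketch of the behavior, with hypothetical values, runnable inside the container:

    python -c 'import torch; t = torch.randint(0, 256, (3, 8, 8), dtype=torch.uint8); t = t.float() if not t.is_floating_point() else t; print(t.clamp(-1.0, 1.0).dtype)'  # torch.float32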
22 changes: 15 additions & 7 deletions docker/Dockerfile.ci
@@ -28,21 +28,25 @@ ENV UV_LINK_MODE=copy
 
 # Create virtual environment
 RUN uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
+# Fix torchrun shebang to use the venv
+RUN sed -i '1s|#!/usr/bin/python|#!/usr/bin/env python|' /usr/local/bin/torchrun
 
-# Copy dependency files and source code (needed for dynamic version resolution)
+# Copy minimal DFM files for dependency installation
 COPY pyproject.toml uv.lock ./
-COPY dfm ./dfm
+COPY dfm/__init__.py ./dfm/
+COPY dfm/package_info.py ./dfm/
 
-# Copy 3rdparty dependencies with minimal files for metadata resolution
-# Copy Automodel
-COPY 3rdparty/Automodel ./3rdparty/Automodel
+# Copy minimal Automodel files for dependency installation
+COPY 3rdparty/Automodel/pyproject.toml ./3rdparty/Automodel/
+COPY 3rdparty/Automodel/nemo_automodel/__init__.py ./3rdparty/Automodel/nemo_automodel/
+COPY 3rdparty/Automodel/nemo_automodel/package_info.py ./3rdparty/Automodel/nemo_automodel/
 
-# Copy minimal Megatron-Bridge files for metadata (prevents full source build)
+# Copy minimal Megatron-Bridge files for dependency installation
 COPY 3rdparty/Megatron-Bridge/pyproject.toml ./3rdparty/Megatron-Bridge/
 COPY 3rdparty/Megatron-Bridge/src/megatron/bridge/__init__.py ./3rdparty/Megatron-Bridge/src/megatron/bridge/
 COPY 3rdparty/Megatron-Bridge/src/megatron/bridge/package_info.py ./3rdparty/Megatron-Bridge/src/megatron/bridge/
 
-# Copy minimal Megatron-LM files for metadata (prevents full source build)
+# Copy minimal Megatron-LM files for dependency installation
 COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/pyproject.toml ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/
 COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/__init__.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/
 COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/package_info.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/
@@ -56,3 +60,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     uv sync --locked --only-group build && \
     uv sync --link-mode copy --locked --all-extras --all-groups --no-install-project && \
     uv cache prune
+
+# Copy all source code and install DFM
+COPY . .
+RUN uv sync --link-mode copy --locked --all-extras --all-groups
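This layering keeps dependency installation cacheable: only lockfile and package-metadata changes invalidate the expensive uv sync layers, while ordinary source edits rebuild just the final COPY and install step. A local build from the repo root (image tag hypothetical) would be:

    docker build -f docker/Dockerfile.ci -t dfm-ci:dev .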

This file was deleted.

This file was deleted.

153 changes: 153 additions & 0 deletions tests/run_sbatch_test.sh
@@ -0,0 +1,153 @@
#!/bin/bash

# Generic sbatch script for running example scripts
# Usage: sbatch --account=<account> --nodes=<nodes> --time=<time> [--partition=<partition>] tests/run_sbatch_test.sh --example-script <path> --container <container> --checkpoint-base-dir <dir> --dataset-base-dir <dir> [--mount <mount>]
#
# NUM_NODES and TIME should be parsed from the example script and passed to sbatch via --nodes and --time
#
# Optional environment variables (export before running sbatch):
# WANDB_API_KEY - Weights & Biases API key for experiment tracking
# HF_TOKEN - Hugging Face token for accessing gated models/datasets
# HF_HOME - Hugging Face cache directory

#SBATCH --partition=batch
#SBATCH --job-name=dfm-example

# Default values
MOUNT="/lustre:/lustre"

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --example-script)
            EXAMPLE_SCRIPT="$2"
            shift 2
            ;;
        --container)
            CONTAINER="$2"
            shift 2
            ;;
        --checkpoint-base-dir)
            CHECKPOINT_BASE_DIR="$2"
            shift 2
            ;;
        --dataset-base-dir)
            DATASET_BASE_DIR="$2"
            shift 2
            ;;
        --mount)
            MOUNT="$2"
            shift 2
            ;;
        -h|--help)
            echo "Usage: sbatch --account=<account> --nodes=<nodes> --time=<time> $0 --example-script <path> --container <container> --checkpoint-base-dir <dir> --dataset-base-dir <dir> [--mount <mount>]"
            echo ""
            echo "Required sbatch arguments:"
            echo "  --account              Slurm account to use"
            echo "  --nodes                Number of nodes (parse NUM_NODES from example script)"
            echo "  --time                 Job timeout (parse TIME from example script)"
            echo ""
            echo "Required script arguments:"
            echo "  --example-script       Path to the torchrun script to execute"
            echo "  --container            Container image to use"
            echo "  --checkpoint-base-dir  Base directory for checkpoints"
            echo "  --dataset-base-dir     Base directory for datasets"
            echo ""
            echo "Optional arguments:"
            echo "  --mount                Slurm mount to use (default: /lustre:/lustre)"
            echo ""
            echo "Optional environment variables (set before running sbatch):"
            echo "  WANDB_API_KEY          Weights & Biases API key for experiment tracking"
            echo "  HF_TOKEN               Hugging Face token for accessing gated models/datasets"
            echo "  HF_HOME                Hugging Face cache directory"
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            exit 1
            ;;
    esac
done

# Validate required arguments
if [[ -z "$EXAMPLE_SCRIPT" ]]; then
echo "Error: --example-script is required"
exit 1
fi

if [[ -z "$CONTAINER" ]]; then
echo "Error: --container is required"
exit 1
fi

if [[ -z "$CHECKPOINT_BASE_DIR" ]]; then
echo "Error: --checkpoint-base-dir is required"
exit 1
fi

if [[ -z "$DATASET_BASE_DIR" ]]; then
echo "Error: --dataset-base-dir is required"
exit 1
fi

# Validate example script exists
if [[ ! -f "$EXAMPLE_SCRIPT" ]]; then
echo "Error: Example script not found: $EXAMPLE_SCRIPT"
exit 1
fi

# Get the example script name for checkpoint directory
SCRIPT_NAME=$(basename "$EXAMPLE_SCRIPT" .sh)
CHECKPOINT_DIR=${CHECKPOINT_BASE_DIR}/${SCRIPT_NAME}

# compute rendezvous/master addresses and ports (avoid port collision)
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
RDZV_PORT=${RDZV_PORT:-29500}
MASTER_PORT=${MASTER_PORT:-29600}
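# NOTE: RDZV_PORT (torchrun rendezvous) and MASTER_PORT (process group) use
# distinct defaults so both services can bind on the head node without clashing.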

# create checkpoint directory
mkdir -p ${CHECKPOINT_DIR}
# create barrier directory for synchronization
BARRIER_DIR=${CHECKPOINT_DIR}/setup_barrier
rm -rf ${BARRIER_DIR}
mkdir -p ${BARRIER_DIR}
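# (recreating the barrier dir ensures stale node_*.ready files from a
# previous run cannot release the barrier early)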

# Read the torchrun command from the example script
TORCHRUN_CMD=$(cat "${EXAMPLE_SCRIPT}")
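# (the example script is expected to be a self-contained torchrun invocation
# that reads MASTER_ADDR/MASTER_PORT/RDZV_PORT and the *_DIR variables
# exported below)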

cmd="
# Synchronization barrier to ensure all nodes have finished installation before starting torchrun
echo \"Node \${SLURM_NODEID} finished setup. Waiting for others...\"
touch \"${BARRIER_DIR}/node_\${SLURM_NODEID}.ready\"
while [ \$(ls -1 \"${BARRIER_DIR}\"/node_*.ready | wc -l) -lt ${SLURM_JOB_NUM_NODES} ]; do
    sleep 5
done
echo \"All nodes ready. Starting training...\"

# Set environment variables for distributed training
export MASTER_ADDR=${MASTER_ADDR}
export MASTER_PORT=${MASTER_PORT}
export RDZV_PORT=${RDZV_PORT}
export CHECKPOINT_DIR=${CHECKPOINT_DIR}
export CHECKPOINT_BASE_DIR=${CHECKPOINT_BASE_DIR}
export DATASET_BASE_DIR=${DATASET_BASE_DIR}

# Optional environment variables (pass through if set)
${WANDB_API_KEY:+export WANDB_API_KEY=${WANDB_API_KEY}}
${HF_TOKEN:+export HF_TOKEN=${HF_TOKEN}}
${HF_HOME:+export HF_HOME=${HF_HOME}}

${TORCHRUN_CMD}
"

export CUDA_DEVICE_MAX_CONNECTIONS=1

echo "Running training script: ${EXAMPLE_SCRIPT}"
echo "CHECKPOINT_DIR: ${CHECKPOINT_DIR}"

srun --mpi=pmix \
    --container-image="${CONTAINER}" --container-mounts="${MOUNT}" \
    --no-container-mount-home \
    --ntasks-per-node=1 \
    -N ${SLURM_JOB_NUM_NODES} \
    bash -c "${cmd}"