27 commits
c0a7312  fix small errors (Jan 14, 2026)
5d1b27d  cicd tests config (Jan 14, 2026)
038304c  update script (Jan 14, 2026)
5691b0c  Update script for pretrain megatron wan testing (chtruong814, Jan 21, 2026)
6c68a7e  update nightly/weekly scripts (Jan 21, 2026)
3da7a88  Fix test (chtruong814, Jan 22, 2026)
ff60785  Fix test env var eval (chtruong814, Jan 22, 2026)
f4df286  Fix test script (chtruong814, Jan 23, 2026)
73bf04e  Test (chtruong814, Jan 23, 2026)
c2d34ab  Fix dockerfile (chtruong814, Jan 24, 2026)
7baf76a  Add release stage (chtruong814, Jan 24, 2026)
d5d7f43  Fix torchrun in docker (chtruong814, Jan 24, 2026)
33e66e2  Create run_example.sh (chtruong814, Jan 25, 2026)
5932ad0  Ensure RDZV_PORT is set (chtruong814, Jan 25, 2026)
2ea9e6a  Update weekly tests (chtruong814, Jan 26, 2026)
9ea3827  Fix weekly tests (chtruong814, Jan 26, 2026)
d4a1172  Fix tests (chtruong814, Jan 26, 2026)
e6c5837  Add comment to script (chtruong814, Jan 26, 2026)
d261dfe  Merge remote-tracking branch 'origin/main' into chtruong/nightly-ci (chtruong814, Jan 31, 2026)
7f3c2d9  Refactor megatron test scripts to allow pretrain dir and iters (chtruong814, Jan 31, 2026)
af8c73b  Refactor automodel nightly tests (chtruong814, Jan 31, 2026)
4b513b8  Ensure megatron wan video test loads checkpoint from image model (chtruong814, Jan 31, 2026)
63c607b  Fix automodel overrides (chtruong814, Feb 1, 2026)
9d01e1b  Fix checkpoint load for automodel (chtruong814, Feb 1, 2026)
1db21eb  Refactor test location (chtruong814, Feb 4, 2026)
f7bf200  Update Dockerfile to include DFM source (chtruong814, Feb 4, 2026)
0ed6982  Move nightly weekly scripts to tests/scheduled (chtruong814, Feb 4, 2026)
1 change: 0 additions & 1 deletion .github/actions/test-template/action.yml
@@ -188,7 +188,6 @@ runs:
       docker exec -t nemo_container_${{ github.run_id }} bash -c '
         set -e
         source /opt/venv/bin/activate
-        uv pip install --no-deps -e .
 
         timeout $(( ${{ inputs.timeout }} * 60 ))s bash tests/${{ inputs.is_unit_test == 'true' && 'unit_tests' || 'functional_tests' }}/${{ inputs.script }}.sh || EXIT_CODE=$?
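The per-run editable install is dropped because the CI image now installs DFM at build time (see the Dockerfile.ci change below), so the package is already importable inside the container. A hypothetical spot check, not part of the PR:

    docker exec -t nemo_container_<run_id> bash -c 'source /opt/venv/bin/activate && python -c "import dfm"'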
2 changes: 2 additions & 0 deletions dfm/src/megatron/model/wan/inference/utils.py
@@ -43,6 +43,8 @@ def cache_video(tensor, save_file=None, fps=30, suffix=".mp4", nrow=8, normalize
     for _ in range(retry):
         try:
             # preprocess
+            if not tensor.is_floating_point():
+                tensor = tensor.float()
             tensor = tensor.clamp(min(value_range), max(value_range))
             tensor = torch.stack(
                 [
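The added guard converts integer-typed frames (for example uint8 video output) to float before the clamp/normalize math that follows, which assumes floating-point input. A minimal sketch of the behavior, with hypothetical values, runnable inside the container:

    python -c 'import torch; t = torch.randint(0, 256, (3, 8, 8), dtype=torch.uint8); t = t.float() if not t.is_floating_point() else t; print(t.clamp(-1.0, 1.0).dtype)'  # torch.float32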
22 changes: 15 additions & 7 deletions docker/Dockerfile.ci
@@ -28,21 +28,25 @@ ENV UV_LINK_MODE=copy
 
 # Create virtual environment
 RUN uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
+# Fix torchrun shebang to use the venv
+RUN sed -i '1s|#!/usr/bin/python|#!/usr/bin/env python|' /usr/local/bin/torchrun
 
-# Copy dependency files and source code (needed for dynamic version resolution)
+# Copy minimal DFM files for dependency installation
 COPY pyproject.toml uv.lock ./
-COPY dfm ./dfm
+COPY dfm/__init__.py ./dfm/
+COPY dfm/package_info.py ./dfm/
 
-# Copy 3rdparty dependencies with minimal files for metadata resolution
-# Copy Automodel
-COPY 3rdparty/Automodel ./3rdparty/Automodel
+# Copy minimal Automodel files for dependency installation
+COPY 3rdparty/Automodel/pyproject.toml ./3rdparty/Automodel/
+COPY 3rdparty/Automodel/nemo_automodel/__init__.py ./3rdparty/Automodel/nemo_automodel/
+COPY 3rdparty/Automodel/nemo_automodel/package_info.py ./3rdparty/Automodel/nemo_automodel/
 
-# Copy minimal Megatron-Bridge files for metadata (prevents full source build)
+# Copy minimal Megatron-Bridge files for dependency installation
 COPY 3rdparty/Megatron-Bridge/pyproject.toml ./3rdparty/Megatron-Bridge/
 COPY 3rdparty/Megatron-Bridge/src/megatron/bridge/__init__.py ./3rdparty/Megatron-Bridge/src/megatron/bridge/
 COPY 3rdparty/Megatron-Bridge/src/megatron/bridge/package_info.py ./3rdparty/Megatron-Bridge/src/megatron/bridge/
 
-# Copy minimal Megatron-LM files for metadata (prevents full source build)
+# Copy minimal Megatron-LM files for dependency installation
 COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/pyproject.toml ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/
 COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/__init__.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/
 COPY 3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/package_info.py ./3rdparty/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/
@@ -56,3 +60,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     uv sync --locked --only-group build && \
     uv sync --link-mode copy --locked --all-extras --all-groups --no-install-project && \
     uv cache prune
+
+# Copy all source code and install DFM
+COPY . .
+RUN uv sync --link-mode copy --locked --all-extras --all-groups
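This layering keeps dependency installation cacheable: only lockfile and package-metadata changes invalidate the expensive uv sync layers, while ordinary source edits rebuild just the final COPY and install step. A local build from the repo root (image tag hypothetical) would be:

    docker build -f docker/Dockerfile.ci -t dfm-ci:dev .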

This file was deleted.

This file was deleted.

153 changes: 153 additions & 0 deletions tests/run_sbatch_test.sh
@@ -0,0 +1,153 @@
#!/bin/bash

# Generic sbatch script for running example scripts
# Usage: sbatch --account=<account> --nodes=<nodes> --time=<time> [--partition=<partition>] tests/run_sbatch_test.sh --example-script <path> --container <container> --checkpoint-base-dir <dir> --dataset-base-dir <dir> [--mount <mount>]
#
# NUM_NODES and TIME should be parsed from the example script and passed to sbatch via --nodes and --time
#
# Optional environment variables (export before running sbatch):
# WANDB_API_KEY - Weights & Biases API key for experiment tracking
# HF_TOKEN - Hugging Face token for accessing gated models/datasets
# HF_HOME - Hugging Face cache directory

#SBATCH --partition=batch
#SBATCH --job-name=dfm-example

# Default values
MOUNT="/lustre:/lustre"

# Parse command line arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --example-script)
            EXAMPLE_SCRIPT="$2"
            shift 2
            ;;
        --container)
            CONTAINER="$2"
            shift 2
            ;;
        --checkpoint-base-dir)
            CHECKPOINT_BASE_DIR="$2"
            shift 2
            ;;
        --dataset-base-dir)
            DATASET_BASE_DIR="$2"
            shift 2
            ;;
        --mount)
            MOUNT="$2"
            shift 2
            ;;
        -h|--help)
            echo "Usage: sbatch --account=<account> --nodes=<nodes> --time=<time> $0 --example-script <path> --container <container> --checkpoint-base-dir <dir> --dataset-base-dir <dir> [--mount <mount>]"
            echo ""
            echo "Required sbatch arguments:"
            echo "  --account              Slurm account to use"
            echo "  --nodes                Number of nodes (parse NUM_NODES from example script)"
            echo "  --time                 Job timeout (parse TIME from example script)"
            echo ""
            echo "Required script arguments:"
            echo "  --example-script       Path to the torchrun script to execute"
            echo "  --container            Container image to use"
            echo "  --checkpoint-base-dir  Base directory for checkpoints"
            echo "  --dataset-base-dir     Base directory for datasets"
            echo ""
            echo "Optional arguments:"
            echo "  --mount                Slurm mount to use (default: /lustre:/lustre)"
            echo ""
            echo "Optional environment variables (set before running sbatch):"
            echo "  WANDB_API_KEY          Weights & Biases API key for experiment tracking"
            echo "  HF_TOKEN               Hugging Face token for accessing gated models/datasets"
            echo "  HF_HOME                Hugging Face cache directory"
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            exit 1
            ;;
    esac
done

# Validate required arguments
if [[ -z "$EXAMPLE_SCRIPT" ]]; then
echo "Error: --example-script is required"
exit 1
fi

if [[ -z "$CONTAINER" ]]; then
echo "Error: --container is required"
exit 1
fi

if [[ -z "$CHECKPOINT_BASE_DIR" ]]; then
echo "Error: --checkpoint-base-dir is required"
exit 1
fi

if [[ -z "$DATASET_BASE_DIR" ]]; then
echo "Error: --dataset-base-dir is required"
exit 1
fi

# Validate example script exists
if [[ ! -f "$EXAMPLE_SCRIPT" ]]; then
echo "Error: Example script not found: $EXAMPLE_SCRIPT"
exit 1
fi

# Get the example script name for checkpoint directory
SCRIPT_NAME=$(basename "$EXAMPLE_SCRIPT" .sh)
CHECKPOINT_DIR=${CHECKPOINT_BASE_DIR}/${SCRIPT_NAME}

# compute rendezvous/master addresses and ports (avoid port collision)
MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
RDZV_PORT=${RDZV_PORT:-29500}
MASTER_PORT=${MASTER_PORT:-29600}
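# NOTE: RDZV_PORT (torchrun rendezvous) and MASTER_PORT (process group) use
# distinct defaults so both services can bind on the head node without clashing.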

# create checkpoint directory
mkdir -p ${CHECKPOINT_DIR}
# create barrier directory for synchronization
BARRIER_DIR=${CHECKPOINT_DIR}/setup_barrier
rm -rf ${BARRIER_DIR}
mkdir -p ${BARRIER_DIR}
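# (recreating the barrier dir ensures stale node_*.ready files from a
# previous run cannot release the barrier early)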

# Read the torchrun command from the example script
TORCHRUN_CMD=$(cat "${EXAMPLE_SCRIPT}")
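# (the example script is expected to be a self-contained torchrun invocation
# that reads MASTER_ADDR/MASTER_PORT/RDZV_PORT and the *_DIR variables
# exported below)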

cmd="
# Synchronization barrier to ensure all nodes have finished installation before starting torchrun
echo \"Node \${SLURM_NODEID} finished setup. Waiting for others...\"
touch \"${BARRIER_DIR}/node_\${SLURM_NODEID}.ready\"
while [ \$(ls -1 \"${BARRIER_DIR}\"/node_*.ready | wc -l) -lt ${SLURM_JOB_NUM_NODES} ]; do
    sleep 5
done
echo \"All nodes ready. Starting training...\"

# Set environment variables for distributed training
export MASTER_ADDR=${MASTER_ADDR}
export MASTER_PORT=${MASTER_PORT}
export RDZV_PORT=${RDZV_PORT}
export CHECKPOINT_DIR=${CHECKPOINT_DIR}
export CHECKPOINT_BASE_DIR=${CHECKPOINT_BASE_DIR}
export DATASET_BASE_DIR=${DATASET_BASE_DIR}

# Optional environment variables (pass through if set)
${WANDB_API_KEY:+export WANDB_API_KEY=${WANDB_API_KEY}}
${HF_TOKEN:+export HF_TOKEN=${HF_TOKEN}}
${HF_HOME:+export HF_HOME=${HF_HOME}}

${TORCHRUN_CMD}
"

export CUDA_DEVICE_MAX_CONNECTIONS=1

echo "Running training script: ${EXAMPLE_SCRIPT}"
echo "CHECKPOINT_DIR: ${CHECKPOINT_DIR}"

srun --mpi=pmix \
    --container-image="${CONTAINER}" --container-mounts="${MOUNT}" \
    --no-container-mount-home \
    --ntasks-per-node=1 \
    -N ${SLURM_JOB_NUM_NODES} \
    bash -c "${cmd}"