Implement ciflow/rocm on Torchtitan (#2114)

akashveramd · huydhn · web-flow · commit 9bc50ea83498 · 2025-12-13T00:34:35.000-08:00
In this PR, I implemented ciflow/rocm on Torchtitan. The changes are
part of integration_test_8gpu_features.yaml. The workflow still runs on
pull_request (without any PR label) for CUDA only. However, on push to
main, on the cron schedule, or when the ciflow/8gpu label is added to a
PR, the workflow runs for both CUDA & ROCm.

---------

Co-authored-by: Huy Do <huydhn@gmail.com>
diff --git a/.github/labeler.yml b/.github/labeler.yml
@@ -0,0 +1,6 @@
+"ciflow/8gpu":
+  - .ci/docker/**
+  - .github/workflows/**
+  - scripts/**
+  - tests/**
+  - torchtitan/**
diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml
@@ -0,0 +1,3 @@
+ciflow_push_tags:
+  - ciflow/8gpu
+labeler_config: labeler.yml
diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml
@@ -3,6 +3,8 @@ name: 8 GPU Feature Tests
 on:
   push:
     branches: [ main ]
+    tags:
+      - ciflow/8gpu/*
     paths-ignore:
       - 'torchtitan/experiments/**'
   pull_request:
@@ -27,33 +29,7 @@ permissions:
 jobs:
   # Step 1: Dynamically compute the matrix based on conditions
   set-matrix:
-    runs-on: ubuntu-latest
-    outputs:
-      matrix: ${{ steps.set.outputs.matrix }}
-    steps:
-      - id: set
-        run: |
-          # Decide which matrix entries to include based on event type
-          if [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]] || [[ "${{ github.event_name }}" == "schedule" ]]; then
-          # Include both CUDA and ROCm
-          echo '{"include":[
-            {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"},
-            {"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"}
-            ]}' > matrix.json
-          else
-          # Include only CUDA
-          echo '{"include":[
-            {"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"}
-            ]}' > matrix.json
-          fi
-
-          # Export matrix to job outputs
-          {
-            echo 'matrix<<EOF'
-            cat matrix.json
-            echo 'EOF'
-          } >> $GITHUB_OUTPUT
-
+    uses: ./.github/workflows/set-matrix.yaml
 
   # Step 2: Use the dynamic matrix in the build-test job
   build-test:
diff --git a/.github/workflows/set-matrix.yaml b/.github/workflows/set-matrix.yaml
@@ -0,0 +1,85 @@
+name: Set Matrix
+
+# Reusable workflow that exposes a build/test matrix to its caller. The matrix
+# is chosen from the event that triggered the caller workflow.
+on:
+  workflow_call:
+    outputs:
+      matrix:
+        description: dynamically set matrix
+        value: ${{ jobs.set.outputs.matrix }}
+
+jobs:
+  set:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set.outputs.matrix }}
+    env:
+      # Event flags evaluated by GitHub Actions before the step runs; the step
+      # below compares each of them against the string "true".
+      IS_MAIN_PUSH: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+      IS_SCHEDULE: ${{ github.event_name == 'schedule' }}
+      IS_8GPU_TAG: ${{ startsWith(github.ref, 'refs/tags/ciflow/8gpu/') }}
+      # Match the ciflow/8gpu label specifically so that adding an unrelated
+      # label to a PR does not switch the matrix to ROCm-only.
+      TRIGGERED_8GPU_LABEL: ${{ github.event_name == 'pull_request' && github.event.action == 'labeled' && github.event.label.name == 'ciflow/8gpu' }}
+
+    steps:
+      - id: set
+        run: |
+          # Define ROCm matrix
+          ROCM_MATRIX='{
+            "name": "rocm",
+            "runner": "linux.rocm.gpu.gfx942.8",
+            "gpu-arch-type": "rocm",
+            "gpu-arch-version": "7.0",
+            "docker-image": "torchtitan-rocm-ubuntu-22.04-clang12",
+            "index-url": "https://download.pytorch.org/whl/nightly/rocm7.0"
+          }'
+
+          # Define CUDA matrix
+          CUDA_MATRIX='{
+            "name": "cuda",
+            "runner": "linux.g5.48xlarge.nvidia.gpu",
+            "gpu-arch-type": "cuda",
+            "gpu-arch-version": "12.6",
+            "docker-image": "torchtitan-ubuntu-20.04-clang12",
+            "index-url": "https://download.pytorch.org/whl/nightly/cu126"
+          }'
+
+          # Use default value as 'false' for unset environment variables
+          IS_MAIN_PUSH="${IS_MAIN_PUSH:-false}"
+          IS_SCHEDULE="${IS_SCHEDULE:-false}"
+          IS_8GPU_TAG="${IS_8GPU_TAG:-false}"
+          TRIGGERED_8GPU_LABEL="${TRIGGERED_8GPU_LABEL:-false}"
+
+          # Decide which matrix entries to include based on event type.
+          # The heredoc bodies and their JSON terminators sit at the script's
+          # base indent on purpose: a plain <<JSON terminator must start at
+          # column 0 once YAML strips the block scalar's indentation.
+
+          # ROCm only: ciflow/8gpu tag push, or ciflow/8gpu label added to a PR
+          if [[ "$IS_8GPU_TAG" == "true" || "$TRIGGERED_8GPU_LABEL" == "true" ]]; then
+            cat > matrix.json <<JSON
+          {"include": [$ROCM_MATRIX]}
+          JSON
+
+          # CUDA and ROCm: push to main or cron schedule
+          elif [[ "$IS_MAIN_PUSH" == "true" || "$IS_SCHEDULE" == "true" ]]; then
+            cat > matrix.json <<JSON
+          {"include": [$CUDA_MATRIX,$ROCM_MATRIX]}
+          JSON
+
+          # CUDA only: default, including PR events without the label trigger
+          else
+            cat > matrix.json <<JSON
+          {"include": [$CUDA_MATRIX]}
+          JSON
+          fi
+
+          # Export matrix to job outputs
+          {
+            echo 'matrix<<EOF'
+            cat matrix.json
+            echo 'EOF'
+          } >> "$GITHUB_OUTPUT"

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+ciflow_push_tags:`
	`2`	`+  - ciflow/8gpu`
	`3`	`+labeler_config: labeler.yml`