# Workflow file for the run "test: time aware scheduling e2e test" (#3371).

# Copyright 2025 NVIDIA CORPORATION
# SPDX-License-Identifier: Apache-2.0
# Pull-request pipeline: triggered when a PR is opened, reopened or updated,
# and when the merge queue requests checks.
name: KAI Scheduler - Pull Request
on:
  pull_request:
    types: [opened, reopened, synchronize]
  merge_group:
    types: [checks_requested]
# At most one active run per key; a newer run cancels the one in progress.
# Key selection: the merge-queue ref for merge_group events, otherwise the PR
# head branch, otherwise the run id (which is unique, so nothing is cancelled).
# The workflow name is prefixed because concurrency groups are shared by every
# workflow in the repository; without it, another workflow keyed on the same
# branch name would cancel (or be cancelled by) this one.
concurrency:
  group: ${{ github.workflow }}-${{ github.event_name == 'merge_group' && github.ref || github.head_ref || github.run_id }}
  cancel-in-progress: true
jobs:
# Gate job: classifies the changed files so the downstream jobs can decide
# whether to run. Only the `code` filter result is exported as a job output.
check-build-and-test-required:
  name: Check if build and test are required
  runs-on: ubuntu-latest
  outputs:
    code: ${{ steps.filter.outputs.code }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v6

    - name: Check changed files
      id: filter
      uses: dorny/paths-filter@v3
      with:
        # With "every", a file counts for a filter only when it satisfies all
        # of that filter's patterns, which is what makes the negated patterns
        # under `code` act as exclusions.
        predicate-quantifier: 'every'
        filters: |
          docs:
            - '**/*.md'
            - 'docs/**'
          code:
            - '**'
            - '!**/*.md'
            - '!docs/**'
# Runs `make validate` and `make test`, then publishes the coverage profile as
# the `code-coverage` artifact. Skipped unless the gate job reported code
# changes.
validate-and-test:
  name: Validate & Test
  runs-on: ubuntu-latest
  needs:
    - check-build-and-test-required
  if: needs.check-build-and-test-required.outputs.code == 'true'
  steps:
    - name: Checkout code
      uses: actions/checkout@v6

    # Cache entry is keyed on the go.sum hash; the restore key allows falling
    # back to the most recent entry for the same OS.
    - name: Cache Go build cache (Docker-mounted)
      uses: actions/cache@v4
      with:
        key: go-docker-${{ runner.os }}-${{ hashFiles('go.sum') }}
        restore-keys: |
          go-docker-${{ runner.os }}-
        path: |
          ~/.cache/go-build-docker-gocache
          ~/.cache/go-build-docker-gopath

    - name: Set up Go
      uses: actions/setup-go@v5
      with:
        go-version: "1.24.4"
        cache: true

    - name: Run validation
      run: make validate

    - name: Run tests
      run: make test

    - name: Archive code coverage results
      uses: actions/upload-artifact@v4
      with:
        name: code-coverage
        path: coverage/coverage.out
# Builds a coverage report from the `code-coverage` artifact and uploads the
# report text and the PR number as artifacts. The reporter's own PR comment is
# disabled (`skip-comment: true`). Not run for merge-queue events.
code-coverage-report:
  name: Code Coverage Report
  runs-on: ubuntu-latest
  needs: [ validate-and-test, check-build-and-test-required ]
  if: github.event_name != 'merge_group' && needs.check-build-and-test-required.outputs.code == 'true'
  steps:
    - uses: fgrosse/go-coverage-report@8c1d1a09864211d258937b1b1a5b849f7e4f2682
      id: coverage_reporter
      with:
        coverage-artifact-name: "code-coverage"
        coverage-file-name: "coverage.out"
        root-package: "github.com/NVIDIA/KAI-scheduler"
        github-baseline-workflow-ref: update-coverage-badge.yaml
        skip-comment: true
    # The report body is handed to the shell through an environment variable
    # so its content is never spliced into the script text.
    - name: Save coverage report to file
      env:
        REPORT_BODY: ${{ steps.coverage_reporter.outputs.coverage_report }}
      run: echo "$REPORT_BODY" > coverage-report.txt
    - name: Upload coverage report
      uses: actions/upload-artifact@v4
      with:
        name: coverage-report-for-comment
        path: coverage-report.txt
    # Same pattern as the report body above: pass the value via `env` rather
    # than interpolating an expression directly into the script.
    - name: Save PR number
      env:
        PR_NUMBER: ${{ github.event.number }}
      run: echo "$PR_NUMBER" > pr_number.txt
    - name: Upload PR number
      uses: actions/upload-artifact@v4
      with:
        name: pr-number-for-comment
        path: pr_number.txt
# Builds the container images and the Helm chart for this commit and stores
# both under /mnt/images, which is saved to the Actions cache under
# `images-<sha>`. Exposes the computed version as the `package_version`
# job output.
build:
  needs: [ check-build-and-test-required ]
  if: needs.check-build-and-test-required.outputs.code == 'true'
  name: Build
  runs-on: ubuntu-latest
  outputs:
    package_version: ${{ steps.package_version.outputs.PACKAGE_VERSION }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v6
    # Version is 0.0.0-<short sha> with leading zeros removed from the sha
    # (presumably so an all-numeric sha stays a valid SemVer pre-release
    # identifier - confirm). It is written both to the job environment, for
    # later steps, and to the step outputs, for dependent jobs.
    - name: Extract package version
      id: package_version
      run: |
        GIT_REV=$(git rev-parse --short HEAD | sed 's/^0*//')
        PACKAGE_VERSION=0.0.0-$GIT_REV
        echo "PACKAGE_VERSION=$PACKAGE_VERSION" >> "$GITHUB_ENV"
        echo "PACKAGE_VERSION=$PACKAGE_VERSION" >> "$GITHUB_OUTPUT"
        echo "$PACKAGE_VERSION"
    - name: Cache Go build cache (Docker-mounted)
      uses: actions/cache@v4
      with:
        path: |
          ~/.cache/go-build-docker-gocache
          ~/.cache/go-build-docker-gopath
        key: go-docker-${{ runner.os }}-${{ hashFiles('go.sum') }}
        restore-keys: |
          go-docker-${{ runner.os }}-
    - name: Set up Go
      uses: actions/setup-go@v5
      with:
        go-version: '1.24.4'
        cache: true
    # Restarts the Docker daemon with its data root relocated to /mnt and
    # prints the resulting root dir as a sanity check.
    - name: Move Docker Data to /mnt
      run: |
        sudo systemctl stop docker
        sudo mkdir -p /mnt/docker-data
        echo '{"data-root": "/mnt/docker-data"}' | sudo tee /etc/docker/daemon.json
        sudo systemctl start docker
        docker info | grep "Docker Root Dir"
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3
    - name: Create image cache directory
      run: |
        sudo mkdir -p /mnt/images
        sudo chown -R "$USER:$USER" /mnt/images
    - name: Cache for docker images and helm chart
      uses: actions/cache@v4
      with:
        path: /mnt/images
        key: images-${{ github.sha }}
    - name: Build docker images
      run: |
        # The default run shell is `bash -e` without pipefail, so a failing
        # `docker save` would be hidden behind a successful `gzip` and a
        # truncated archive would be cached. Fail the step instead.
        set -o pipefail
        make build DOCKER_BUILDX_ADDITIONAL_ARGS="--load --cache-from type=gha --cache-to type=gha,mode=max" VERSION="$PACKAGE_VERSION"
        # The image list is intentionally left unquoted: it must split into
        # one argument per image.
        docker save $(docker images --format '{{.Repository}}:{{.Tag}}' | grep "$PACKAGE_VERSION") | gzip > /mnt/images/docker_images.tgz
    - name: Build helm chart
      run: |
        helm package ./deployments/kai-scheduler -d ./charts --app-version "$PACKAGE_VERSION" --version "$PACKAGE_VERSION"
        cp "charts/kai-scheduler-$PACKAGE_VERSION.tgz" /mnt/images/
# Counterpart of the build/test jobs: runs only when the gate job did NOT
# report code changes, and just logs why nothing else ran.
skip-build-and-test-message:
  name: Skip Build and Test Message
  runs-on: ubuntu-latest
  needs:
    - check-build-and-test-required
  if: needs.check-build-and-test-required.outputs.code != 'true'
  steps:
    - name: Skip message
      run: echo "Skipping build and test since only documentation files (.md or docs/) were changed."
# End-to-end suite: restores the images and chart produced by the `build` job,
# brings up a kind cluster with a local registry, a fake GPU operator and the
# Prometheus operator, installs KAI-scheduler from the packaged chart, runs
# the ginkgo suites, and finally verifies that the release uninstalls cleanly.
e2e-tests:
  name: Run E2E Tests
  needs: [ build, check-build-and-test-required ]
  if: needs.check-build-and-test-required.outputs.code == 'true'
  runs-on: ubuntu-latest
  # Declaring any permission sets every unlisted scope to "none", so the read
  # access used by checkout is listed explicitly next to the write access that
  # `gh cache delete` needs.
  permissions:
    actions: write
    contents: read
  steps:
    - name: Checkout code
      uses: actions/checkout@v6
    # Restarts the Docker daemon with its data root relocated to /mnt and
    # prints the resulting root dir as a sanity check.
    - name: Move Docker Data to /mnt
      run: |
        sudo systemctl stop docker
        sudo mkdir -p /mnt/docker-data
        echo '{"data-root": "/mnt/docker-data"}' | sudo tee /etc/docker/daemon.json
        sudo systemctl start docker
        docker info | grep "Docker Root Dir"
    - name: Create images directory
      run: |
        sudo mkdir -p /mnt/images
        sudo chown -R "$USER:$USER" /mnt/images
    # Every later step depends on the archive and chart stored in this cache
    # entry, so a miss fails here with a clear message instead of surfacing
    # later as a missing file in `docker load`.
    - name: Cache restore
      uses: actions/cache/restore@v4
      with:
        path: /mnt/images
        key: images-${{ github.sha }}
        fail-on-cache-miss: true
    # Best effort: the entry is single-use, so it is removed once restored;
    # a failure to delete it does not fail the job.
    - name: Delete restored cache
      continue-on-error: true
      env:
        GH_TOKEN: ${{ github.token }}
      run: |
        gh cache delete "images-${{ github.sha }}" --repo ${{ github.repository }}
    # NOTE(review): the action ref below looks mangled by e-mail obfuscation
    # ("[email protected]" in place of "kind-action@<version>"); restore the
    # pinned ref from the repository before using this file.
    - name: Create k8s Kind Cluster
      uses: helm/[email protected]
      with:
        cluster_name: kind
        version: v0.30.0
        config: ./hack/e2e-kind-config.yaml
    - name: Deploy image registry
      run: |
        kubectl apply -f ./hack/local_registry.yaml
        kubectl wait --for=condition=available --timeout=60s deployment/registry -n kube-registry
    # Loads the saved images, retags each one that carries this build's
    # version as localhost:5000/<name>:<tag>, and pushes it through a
    # background port-forward to the in-cluster registry.
    - name: Load docker images to registry
      env:
        PACKAGE_VERSION: ${{ needs.build.outputs.package_version }}
      run: |
        kubectl port-forward -n kube-registry deploy/registry 5000:5000 &
        docker load < /mnt/images/docker_images.tgz
        for image in $(docker images --format '{{.Repository}}:{{.Tag}}' | grep "$PACKAGE_VERSION"); do
          new_image=$(echo "$image" | sed -E 's|.*/([^/]+:[^/]+)$|localhost:5000/\1|')
          docker tag "$image" "$new_image"
          docker push "$new_image"
        done
    - name: Deploy fake gpu operator
      run: |
        helm upgrade -i gpu-operator oci://ghcr.io/run-ai/fake-gpu-operator/fake-gpu-operator --namespace gpu-operator --create-namespace \
          --version 0.0.62 --values ./hack/fake-gpu-operator-values.yaml --wait
    # Installs the kube-prometheus-stack chart with alertmanager, grafana and
    # the prometheus instance all disabled.
    - name: Deploy Prometheus Operator
      run: |
        helm repo add prometheus-community https://prometheus-community.github.io/helm-charts --force-update
        helm repo update prometheus-community
        helm install prometheus prometheus-community/kube-prometheus-stack --namespace monitoring --create-namespace \
          --set "alertmanager.enabled=false" \
          --set "grafana.enabled=false" \
          --set "prometheus.enabled=false" \
          --wait
    - name: install KAI-scheduler
      env:
        PACKAGE_VERSION: ${{ needs.build.outputs.package_version }}
      run: |
        helm upgrade -i kai-scheduler "/mnt/images/kai-scheduler-$PACKAGE_VERSION.tgz" -n kai-scheduler --create-namespace \
          --set "global.gpuSharing=true" --set "global.registry=localhost:30100" --debug --wait
    - name: Set up Go
      uses: actions/setup-go@v5
      with:
        go-version: '1.24.4'
        cache: true
    # NOTE(review): the module path below looks mangled by e-mail obfuscation
    # ("[email protected]" in place of "ginkgo@<version>"); restore the pinned
    # version from the repository before using this file.
    - name: Install ginkgo
      run: |
        go install github.com/onsi/ginkgo/v2/[email protected]
    # Removes the local image copies and the restored archive directory before
    # the suites run.
    - name: Delete images from disk
      env:
        PACKAGE_VERSION: ${{ needs.build.outputs.package_version }}
      run: |
        docker images --format '{{.Repository}}:{{.Tag}}' | grep "$PACKAGE_VERSION" | xargs docker rmi -f
        sudo rm -rf /mnt/images
    - name: Run e2e tests
      run: |
        ginkgo -r --keep-going --randomize-all --randomize-suites --trace -vv --label-filter '!autoscale && !scale' ./test/e2e/suites
    # Uninstalls the release, then polls every 5 seconds (12 attempts) until
    # no pod in the namespace is in a non-Terminating state, ignoring pods
    # whose line matches an entry in EXCLUDED_PODS. Fails the job on timeout.
    - name: Uninstall KAI-scheduler
      run: |
        helm uninstall kai-scheduler -n kai-scheduler
        echo "Waiting up to 60 seconds for pods to terminate..."
        EXCLUDED_PODS="prometheus"
        for i in {1..12}; do
          EXCLUDE_PATTERN=$(echo "$EXCLUDED_PODS" | tr ' ' '|')
          NON_TERM=$(kubectl get pods -n kai-scheduler --no-headers 2>/dev/null | grep -v Terminating | grep -vE "$EXCLUDE_PATTERN" | wc -l)
          if [ "$NON_TERM" -eq 0 ]; then
            echo "Only Terminating pods remain or no pods left (excluding: $EXCLUDED_PODS). Safe to proceed."
            exit 0
          fi
          echo "Found $NON_TERM non-terminating pods (excluding: $EXCLUDED_PODS)... waiting (attempt $i/12)"
          sleep 5
        done
        echo "Pods did not terminate within 60 seconds. Uninstall incomplete."
        exit 1