test: time aware scheduling e2e test #3371
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2025 NVIDIA CORPORATION
# SPDX-License-Identifier: Apache-2.0

# Pull-request / merge-queue pipeline for KAI Scheduler.
name: KAI Scheduler - Pull Request

on:
  pull_request:
    types: [opened, reopened, synchronize]
  merge_group:
    types: [checks_requested]

# A newer run for the same change cancels the one still in progress.
# Concurrency groups are shared by every workflow in the repository, so the
# group is namespaced with the workflow name. Pull requests are keyed by PR
# number rather than head branch name, because branch names are not unique
# across forks. Merge-queue runs are keyed by their queue ref; run_id is the
# last-resort fallback.
concurrency:
  group: ${{ github.workflow }}-${{ github.event_name == 'merge_group' && github.ref || github.event.pull_request.number || github.run_id }}
  cancel-in-progress: true

jobs:
# ---- job: check-build-and-test-required ----
  # Gatekeeper job. Publishes the `code` output of the paths filter so that
  # downstream jobs can compare it against 'true' in their `if:` conditions.
  check-build-and-test-required:
    name: Check if build and test are required
    runs-on: ubuntu-latest
    outputs:
      code: ${{ steps.filter.outputs.code }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Check changed files
        id: filter
        uses: dorny/paths-filter@v3
        with:
          # With 'every', a changed file matches a filter only if it satisfies
          # all of that filter's patterns; this is what turns the negated
          # patterns under `code` into exclusions.
          # NOTE(review): under 'every' the `docs` filter also requires both
          # of its patterns to match; its output is not read by any job
          # visible in this file - confirm that is intended.
          predicate-quantifier: 'every'
          filters: |
            docs:
              - '**/*.md'
              - 'docs/**'
            code:
              - '**'
              - '!**/*.md'
              - '!docs/**'
# ---- job: validate-and-test ----
  # Runs `make validate` and `make test`, then publishes the coverage profile
  # as the `code-coverage` artifact. Skipped unless the gatekeeper job
  # reported code changes.
  validate-and-test:
    name: Validate & Test
    needs: [check-build-and-test-required]
    if: needs.check-build-and-test-required.outputs.code == 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Cache Go build cache (Docker-mounted)
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/go-build-docker-gocache
            ~/.cache/go-build-docker-gopath
          key: go-docker-${{ runner.os }}-${{ hashFiles('go.sum') }}
          # Fall back to the newest cache for this OS when go.sum changed.
          restore-keys: |
            go-docker-${{ runner.os }}-

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.24.4'
          cache: true

      - name: Run validation
        run: make validate

      - name: Run tests
        run: make test

      - name: Archive code coverage results
        uses: actions/upload-artifact@v4
        with:
          name: code-coverage
          path: coverage/coverage.out
          # Fail here, with a clear message, if the tests produced no coverage
          # profile; the default only warns and the problem would surface
          # later in whichever job downloads the artifact.
          if-no-files-found: error
# ---- job: code-coverage-report ----
  # Builds a coverage report from the `code-coverage` artifact and stores the
  # report text and the PR number as artifacts. Not run for merge-queue
  # events or for documentation-only changes.
  # NOTE(review): `skip-comment: true` plus the "*-for-comment" artifact names
  # suggest a separate workflow posts the PR comment - confirm.
  code-coverage-report:
    name: Code Coverage Report
    runs-on: ubuntu-latest
    needs: [validate-and-test, check-build-and-test-required]
    if: github.event_name != 'merge_group' && needs.check-build-and-test-required.outputs.code == 'true'
    steps:
      - uses: fgrosse/go-coverage-report@8c1d1a09864211d258937b1b1a5b849f7e4f2682
        id: coverage_reporter
        with:
          coverage-artifact-name: "code-coverage"
          coverage-file-name: "coverage.out"
          root-package: "github.com/NVIDIA/KAI-scheduler"
          github-baseline-workflow-ref: update-coverage-badge.yaml
          skip-comment: true

      - name: Save coverage report to file
        env:
          REPORT_BODY: ${{ steps.coverage_reporter.outputs.coverage_report }}
        run: echo "$REPORT_BODY" > coverage-report.txt

      - name: Upload coverage report
        uses: actions/upload-artifact@v4
        with:
          name: coverage-report-for-comment
          path: coverage-report.txt

      # The PR number is handed to the script through the environment, the
      # same way REPORT_BODY is above, instead of being expanded into the
      # script text by the expression engine.
      - name: Save PR number
        env:
          PR_NUMBER: ${{ github.event.number }}
        run: echo "$PR_NUMBER" > pr_number.txt

      - name: Upload PR number
        uses: actions/upload-artifact@v4
        with:
          name: pr-number-for-comment
          path: pr_number.txt
# ---- job: build ----
  # Builds the docker images and the helm chart for this commit and stores
  # both under /mnt/images, which is cached under a key derived from the
  # commit SHA. Exposes the computed version as the `package_version` output.
  build:
    name: Build
    needs: [check-build-and-test-required]
    if: needs.check-build-and-test-required.outputs.code == 'true'
    runs-on: ubuntu-latest
    outputs:
      package_version: ${{ steps.package_version.outputs.PACKAGE_VERSION }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      - name: Extract package version
        id: package_version
        # The version is 0.0.0-<short SHA>, with leading zeros removed from
        # the SHA.
        # NOTE(review): presumably the zeros are stripped so the suffix stays
        # a valid SemVer pre-release identifier - confirm.
        run: |
          GIT_REV=$(git rev-parse --short HEAD | sed 's/^0*//')
          PACKAGE_VERSION="0.0.0-$GIT_REV"
          echo "PACKAGE_VERSION=$PACKAGE_VERSION" >> "$GITHUB_ENV"
          echo "PACKAGE_VERSION=$PACKAGE_VERSION" >> "$GITHUB_OUTPUT"
          echo "$PACKAGE_VERSION"

      - name: Cache Go build cache (Docker-mounted)
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/go-build-docker-gocache
            ~/.cache/go-build-docker-gopath
          key: go-docker-${{ runner.os }}-${{ hashFiles('go.sum') }}
          restore-keys: |
            go-docker-${{ runner.os }}-

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.24.4'
          cache: true

      # Point the Docker daemon's data-root at /mnt/docker-data.
      - name: Move Docker Data to /mnt
        run: |
          sudo systemctl stop docker
          sudo mkdir -p /mnt/docker-data
          echo '{"data-root": "/mnt/docker-data"}' | sudo tee /etc/docker/daemon.json
          sudo systemctl start docker
          docker info | grep "Docker Root Dir"

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Create image cache directory
        run: |
          sudo mkdir -p /mnt/images
          sudo chown -R "$USER:$USER" /mnt/images

      - name: Cache for docker images and helm chart
        uses: actions/cache@v4
        with:
          path: /mnt/images
          key: images-${{ github.sha }}

      # NOTE(review): the `type=gha` buildx cache needs the Actions runtime
      # URL/token in the environment, which plain `run` steps may not have -
      # confirm the layer cache is actually being used.
      - name: Build docker images
        run: |
          # The default shell has no pipefail; without it a failing
          # `docker save` is hidden by gzip's success and a broken archive
          # would be cached.
          set -o pipefail
          make build DOCKER_BUILDX_ADDITIONAL_ARGS="--load --cache-from type=gha --cache-to type=gha,mode=max" VERSION="$PACKAGE_VERSION"
          # The image list is left unquoted on purpose: each image name must
          # become its own argument. grep -F matches the version literally.
          docker save $(docker images --format '{{.Repository}}:{{.Tag}}' | grep -F "$PACKAGE_VERSION") | gzip > /mnt/images/docker_images.tgz

      - name: Build helm chart
        run: |
          helm package ./deployments/kai-scheduler -d ./charts --app-version "$PACKAGE_VERSION" --version "$PACKAGE_VERSION"
          cp "charts/kai-scheduler-$PACKAGE_VERSION.tgz" /mnt/images/
# ---- job: skip-build-and-test-message ----
  # Informational job: runs only when the gatekeeper job's `code` output is
  # not 'true', and just prints why the heavy jobs were skipped.
  skip-build-and-test-message:
    name: Skip Build and Test Message
    runs-on: ubuntu-latest
    needs:
      - check-build-and-test-required
    if: ${{ needs.check-build-and-test-required.outputs.code != 'true' }}
    steps:
      - name: Skip message
        run: |
          echo "Skipping build and test since only documentation files (.md or docs/) were changed."
# ---- job: e2e-tests ----
  # End-to-end run: restores the images and chart from the cache, creates a
  # kind cluster with an in-cluster registry, installs the fake GPU operator,
  # the Prometheus operator and KAI-scheduler, runs the ginkgo suites and
  # finally checks that uninstalling leaves no running pods behind.
  e2e-tests:
    name: Run E2E Tests
    needs: [build, check-build-and-test-required]
    if: needs.check-build-and-test-required.outputs.code == 'true'
    runs-on: ubuntu-latest
    permissions:
      # NOTE(review): presumably required by the `gh cache delete` step.
      actions: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v6

      # Point the Docker daemon's data-root at /mnt/docker-data.
      - name: Move Docker Data to /mnt
        run: |
          sudo systemctl stop docker
          sudo mkdir -p /mnt/docker-data
          echo '{"data-root": "/mnt/docker-data"}' | sudo tee /etc/docker/daemon.json
          sudo systemctl start docker
          docker info | grep "Docker Root Dir"

      - name: Create images directory
        run: |
          sudo mkdir -p /mnt/images
          sudo chown -R "$USER:$USER" /mnt/images

      - name: Cache restore
        uses: actions/cache/restore@v4
        with:
          path: /mnt/images
          key: images-${{ github.sha }}
          # Every later step depends on these files. Without this flag a miss
          # (for example re-running this job after the cache was deleted by
          # the next step) only shows up later as a confusing `docker load`
          # error.
          fail-on-cache-miss: true

      # Best effort: the cache entry is dropped once it has been restored.
      - name: Delete restored cache
        continue-on-error: true
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          gh cache delete "images-${{ github.sha }}" --repo ${{ github.repository }}

      # NOTE(review): "[email protected]" looks like an e-mail-obfuscation
      # artifact that replaced the original "<action>@<version>" reference;
      # restore the real pinned version before running this workflow.
      - name: Create k8s Kind Cluster
        uses: helm/[email protected]
        with:
          cluster_name: kind
          version: v0.30.0
          config: ./hack/e2e-kind-config.yaml

      - name: Deploy image registry
        run: |
          kubectl apply -f ./hack/local_registry.yaml
          kubectl wait --for=condition=available --timeout=60s deployment/registry -n kube-registry

      - name: Load docker images to registry
        env:
          PACKAGE_VERSION: ${{ needs.build.outputs.package_version }}
        run: |
          # Expose the in-cluster registry on localhost:5000 for `docker push`.
          kubectl port-forward -n kube-registry deploy/registry 5000:5000 &
          docker load < /mnt/images/docker_images.tgz
          # Retag every image of this build as localhost:5000/<name>:<tag>
          # and push it; grep -F matches the version literally.
          for image in $(docker images --format '{{.Repository}}:{{.Tag}}' | grep -F "$PACKAGE_VERSION"); do
            new_image=$(echo "$image" | sed -E 's|.*/([^/]+:[^/]+)$|localhost:5000/\1|')
            docker tag "$image" "$new_image"
            docker push "$new_image"
          done

      - name: Deploy fake gpu operator
        run: |
          helm upgrade -i gpu-operator oci://ghcr.io/run-ai/fake-gpu-operator/fake-gpu-operator --namespace gpu-operator --create-namespace \
            --version 0.0.62 --values ./hack/fake-gpu-operator-values.yaml --wait

      # Installs kube-prometheus-stack with alertmanager, grafana and
      # prometheus disabled.
      - name: Deploy Prometheus Operator
        run: |
          helm repo add prometheus-community https://prometheus-community.github.io/helm-charts --force-update
          helm repo update prometheus-community
          helm install prometheus prometheus-community/kube-prometheus-stack --namespace monitoring --create-namespace \
            --set "alertmanager.enabled=false" \
            --set "grafana.enabled=false" \
            --set "prometheus.enabled=false" \
            --wait

      - name: install KAI-scheduler
        env:
          PACKAGE_VERSION: ${{ needs.build.outputs.package_version }}
        run: |
          helm upgrade -i kai-scheduler "/mnt/images/kai-scheduler-$PACKAGE_VERSION.tgz" -n kai-scheduler --create-namespace \
            --set "global.gpuSharing=true" --set "global.registry=localhost:30100" --debug --wait

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.24.4'
          cache: true

      # NOTE(review): "[email protected]" below looks like an e-mail-obfuscation
      # artifact that replaced the original "ginkgo@<version>" suffix;
      # restore the real version before running this workflow.
      - name: Install ginkgo
        run: |
          go install github.com/onsi/ginkgo/v2/[email protected]

      # Remove the local image copies and the restored archive from the
      # runner's disk.
      - name: Delete images from disk
        env:
          PACKAGE_VERSION: ${{ needs.build.outputs.package_version }}
        run: |
          # -r: do not invoke `docker rmi` at all when nothing matched.
          docker images --format '{{.Repository}}:{{.Tag}}' | grep -F "$PACKAGE_VERSION" | xargs -r docker rmi -f
          sudo rm -rf /mnt/images

      - name: Run e2e tests
        run: |
          ginkgo -r --keep-going --randomize-all --randomize-suites --trace -vv --label-filter '!autoscale && !scale' ./test/e2e/suites

      # Uninstall, then poll every 5 seconds (12 attempts) until the only
      # pods left in the namespace are Terminating or match EXCLUDED_PODS.
      - name: Uninstall KAI-scheduler
        run: |
          helm uninstall kai-scheduler -n kai-scheduler
          echo "Waiting up to 60 seconds for pods to terminate..."
          EXCLUDED_PODS="prometheus"
          # Space-separated names become one alternation regex; this does not
          # change between attempts, so it is computed once.
          EXCLUDE_PATTERN=$(echo "$EXCLUDED_PODS" | tr ' ' '|')
          for i in {1..12}; do
            NON_TERM=$(kubectl get pods -n kai-scheduler --no-headers 2>/dev/null | grep -v Terminating | grep -vE "$EXCLUDE_PATTERN" | wc -l)
            if [ "$NON_TERM" -eq 0 ]; then
              echo "Only Terminating pods remain or no pods left (excluding: $EXCLUDED_PODS). Safe to proceed."
              exit 0
            fi
            echo "Found $NON_TERM non-terminating pods (excluding: $EXCLUDED_PODS)... waiting (attempt $i/12)"
            sleep 5
          done
          echo "Pods did not terminate within 60 seconds. Uninstall incomplete."
          exit 1