# .github/workflows/e2e-nvidia-t4-x1.yml

# SPDX-License-Identifier: Apache-2.0

name: E2E (NVIDIA Tesla T4 x1)

# Triggers: pushes to main/release-* and pull requests targeting those
# branches, limited to changes in Python sources, pyproject.toml,
# requirements files, or this workflow file.
on:
  push:
    branches:
      - main
      - release-*
    # Path filters are glob patterns (not regexes or "files=" conditions).
    # Patterns beginning with "*" are quoted because a leading "*" would
    # otherwise be read as a YAML alias.
    paths:
      - '**.py'
      - 'pyproject.toml'
      - 'requirements**.txt'
      - '.github/workflows/e2e-nvidia-t4-x1.yml'  # This workflow
  pull_request_target:
    types:
      - opened
      - synchronize
      - reopened
    branches:
      - main
      - release-*
    # Same path filters as the push trigger above.
    paths:
      - '**.py'
      - 'pyproject.toml'
      - 'requirements**.txt'
      - '.github/workflows/e2e-nvidia-t4-x1.yml'  # This workflow

# One active run per workflow and pull request (or per ref when the event
# carries no pull request number); runs still in progress in the same group
# are cancelled.
concurrency:
  cancel-in-progress: true
  group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}"

jobs:
  # Starts the EC2 instance used as the self-hosted runner for the "e2e" job
  # and publishes its runner label and instance id as job outputs.
  start-runner:
    name: Start external EC2 runner
    runs-on: ubuntu-latest
    outputs:
      label: "${{ steps.start-ec2-runner.outputs.label }}"
      ec2-instance-id: "${{ steps.start-ec2-runner.outputs.ec2-instance-id }}"
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-region: "${{ secrets.AWS_REGION }}"
          aws-access-key-id: "${{ secrets.AWS_ACCESS_KEY_ID }}"
          aws-secret-access-key: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"

      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
        with:
          mode: start
          github-token: "${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}"
          # Instance placement and identity
          ec2-image-id: "ami-00c51d9c1374eda97"
          ec2-instance-type: "g4dn.2xlarge"
          subnet-id: "subnet-02d230cffd9385bd4"
          security-group-id: "sg-06300447c4a5fbef3"
          iam-role-name: "instructlab-ci-runner"
          # JSON list of tags; records the repository, ref and PR number
          # that triggered this run.
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-small-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
            ]

  # Runs the end-to-end test on the runner started by "start-runner":
  # checks out instructlab and sdg, switches sdg to the PR head for
  # pull_request_target events, installs both into one virtualenv, and runs
  # the instructlab basic workflow test script.
  e2e:
    name: E2E Test
    needs: start-runner
    runs-on: ${{ needs.start-runner.outputs.label }}

    # It is important that this job has no write permissions and has
    # no access to any secrets. This part (e2e) is where we are running
    # untrusted code from PRs.
    permissions: {}

    # No step-security/harden-runner since this is a self-hosted runner
    steps:
      # for debugging
      - name: Print environment state
        run: |
          echo "Current Working Directory: $PWD"
          echo "Files in Local Directory:"
          ls -l

      - name: Checkout instructlab/instructlab
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
        with:
          repository: "instructlab/instructlab"
          path: "instructlab"
          fetch-depth: 0

      - name: Checkout instructlab/sdg
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
        with:
          repository: "instructlab/sdg"
          path: "sdg"
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      # for debugging
      - name: Print environment state
        run: |
          echo "Current Working Directory: $PWD"
          echo "Files in Local Directory:"
          ls -l

      # Only for PR events: replace the sdg checkout with the PR head.
      - name: Fetch and checkout PR
        id: fetch_pr
        if: github.event_name == 'pull_request_target'
        working-directory: ./sdg
        run: |
          git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-${{ github.event.pull_request.number }}
          git checkout pr-${{ github.event.pull_request.number }}

      - name: Install system packages
        run: |
          cat /etc/os-release
          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

      - name: Install instructlab
        working-directory: ./instructlab
        run: |
          export PATH="/home/ec2-user/.local/bin:/usr/local/cuda/bin:$PATH"
          python3.11 -m venv --upgrade-deps venv
          . venv/bin/activate
          nvidia-smi
          python3.11 -m pip cache remove llama_cpp_python

          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install .

          # https://github.com/instructlab/instructlab/issues/1821
          # install with Torch and build dependencies installed
          python3.11 -m pip install packaging wheel setuptools-scm
          # Quoted so the shell never treats "[cuda]" as a glob bracket
          # expression.
          python3.11 -m pip install '.[cuda]'

      # Installs the sdg checkout into the venv created by the previous step.
      - name: Update instructlab-sdg library
        working-directory: ./sdg
        run: |
          . ../instructlab/venv/bin/activate
          pip install .
          # Quoted for the same reason as in the instructlab install step.
          pip install '.[cuda]'

      - name: Run e2e test
        working-directory: ./instructlab
        run: |
          . venv/bin/activate
          ./scripts/basic-workflow-tests.sh -m

  # Stops the EC2 runner that "start-runner" launched. The always()
  # condition makes this job run whatever the outcome of the jobs it needs.
  stop-runner:
    name: Stop external EC2 runner
    runs-on: ubuntu-latest
    needs: [start-runner, e2e]
    if: always()
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
        with:
          aws-region: "${{ secrets.AWS_REGION }}"
          aws-access-key-id: "${{ secrets.AWS_ACCESS_KEY_ID }}"
          aws-secret-access-key: "${{ secrets.AWS_SECRET_ACCESS_KEY }}"

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
        with:
          mode: stop
          github-token: "${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}"
          # Both values come from the outputs of the start-runner job.
          label: "${{ needs.start-runner.outputs.label }}"
          ec2-instance-id: "${{ needs.start-runner.outputs.ec2-instance-id }}"

  # Marker job that depends on "start-runner" and "e2e" only.
  # "stop-runner" is deliberately not listed in `needs`, so a failed EC2
  # cleanup does not block PRs.
  e2e-workflow-complete:
    runs-on: ubuntu-latest
    needs:
      - start-runner
      - e2e
    steps:
      - name: E2E Workflow Complete
        run: echo "E2E Workflow Complete"