Skip to content

Merge pull request #805 from javipolo/not-blacklist-amdgpu #187

Merge pull request #805 from javipolo/not-blacklist-amdgpu

Merge pull request #805 from javipolo/not-blacklist-amdgpu #187

name: Training Bootc image builds
on:
push:
branches: [ main ]
paths:
- 'training/**'
- '.github/workflows/training_bootc.yaml'
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}
cancel-in-progress: false
env:
REGISTRY: quay.io
REGISTRY_ORG: ai-lab
REGION: us-east-1
jobs:
start-runner:
name: Start self-hosted EC2 runner
if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
runs-on: ubuntu-latest
outputs:
label: ${{ steps.start-ec2-runner.outputs.label }}
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.REGION }}
- name: Start EC2 runner
id: start-ec2-runner
uses: machulav/ec2-github-runner@v2
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-0154957ba4ce98784
ec2-instance-type: m7i.12xlarge
subnet-id: subnet-0b1e1d94240813658
security-group-id: sg-055105753f5e8bd83
nvidia-bootc-builder-image:
if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
strategy:
matrix:
include:
- image_name: nvidia-builder
context: training/nvidia-bootc
arch: amd64
runs-on: ${{ needs.start-runner.outputs.label }}
needs: start-runner
permissions:
contents: read
packages: write
steps:
- uses: actions/[email protected]
- name: mkdir root/.docker directory
run: |
mkdir -p ~/.docker
- name: Login to Container Registry
run: podman login -u ${{ secrets.REGISTRY_USER }} -p ${{ secrets.REGISTRY_PASSWORD }} ${{ env.REGISTRY }}
- name: generate a ssh key - USER SHOULD INJECT THEIR OWN AND REBUILD IF THEY USE THIS IMAGE
run: |
ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N ""
- name: Build Image
id: build_image
run: make driver-toolkit ARCH=${{ matrix.arch }}
working-directory: ${{ matrix.context }}
- name: tag image as nvidia-builder
run: podman tag ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}/driver-toolkit:latest ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}/${{ matrix.image_name}}:latest
- name: Push image
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
uses: redhat-actions/[email protected]
with:
registry: ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}
image: driver-toolkit
tags: latest
- name: push the nvidia-builder image
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
uses: redhat-actions/[email protected]
with:
image: ${{ matrix.image_name}}
tags: latest
registry: ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}
- name: Publish Job Results to Slack
id: slack
if: always()
uses: slackapi/[email protected]
with:
payload: |
{
"text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
nvidia-bootc-image:
strategy:
matrix:
include:
- image_name: nvidia-bootc
driver_version: "550.54.15"
context: training/nvidia-bootc
arch: amd64
runs-on: ${{ needs.start-runner.outputs.label }}
if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
needs: [ nvidia-bootc-builder-image, start-runner ]
steps:
- uses: actions/[email protected]
- name: mkdir root/.docker directory
run: |
mkdir -p ~/.docker
- name: generate a ssh key - USER SHOULD INJECT THEIR OWN AND REBUILD IF THEY USE THIS IMAGE and overwrite the existing one
run: |
ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N "" <<<y
- name: Login to Container Registry
run: podman login -u ${{ secrets.REGISTRY_USER }} -p ${{ secrets.REGISTRY_PASSWORD }} ${{ env.REGISTRY }}
- name: generate the local OCI assets
run: |
cd training
make -j vllm
make -j deepspeed
make -j instruct-nvidia
- name: Build Image
id: build_image
run: make bootc DRIVER_VERSION=${{ matrix.driver_version }} ARCH=${{ matrix.arch }} SSH_PUBKEY=~/.ssh/id_rsa.pub
working-directory: ${{ matrix.context }}
- name: Push image
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
uses: redhat-actions/[email protected]
with:
image: ${{ matrix.image_name }}
tags: latest
registry: ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}
- name: Publish Job Results to Slack
id: slack
if: always()
uses: slackapi/[email protected]
with:
payload: |
{
"text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
bootc-images:
strategy:
matrix:
include:
- image_name: intel-bootc
context: training/intel-bootc
arch: amd64
gpu: intel
- image_name: amd-bootc
context: training/amd-bootc
arch: amd64
gpu: amd
if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
runs-on: ${{ needs.start-runner.outputs.label }}
needs: [ start-runner, nvidia-bootc-builder-image ]
continue-on-error: true
steps:
- uses: actions/[email protected]
- name: mkdir root/.docker directory
run: |
mkdir -p ~/.docker
- name: Login to Container Registry
run: podman login -u ${{ secrets.REGISTRY_USER }} -p ${{ secrets.REGISTRY_PASSWORD }} ${{ env.REGISTRY }}
- name: generate the local OCI assets
run: |
cd training
make -j vllm
make -j deepspeed
make -j instruct-${{ matrix.gpu}}
- name: Build Image
id: build_image
run: make bootc ARCH=${{ matrix.arch }} INSTRUCTLAB_IMAGE=${{env.REGISTRY}}/${{env.REGISTRY_ORG}}/instruct-${{ matrix.gpu }}:latest
working-directory: ${{ matrix.context }}
- name: Push image
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
uses: redhat-actions/[email protected]
with:
image: ${{ matrix.image_name }}
tags: latest
registry: ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}
- name: Publish Job Results to Slack
id: slack
if: always()
uses: slackapi/[email protected]
with:
payload: |
{
"text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
stop-runner:
name: Stop self-hosted EC2 runner
needs:
- start-runner
- bootc-images
- nvidia-bootc-image
runs-on: ubuntu-latest
if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}
- name: Stop EC2 runner
uses: machulav/ec2-github-runner@v2
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}