# E2E (NVIDIA L40S x4) Release Branch - workflow run #26

# SPDX-License-Identifier: Apache-2.0
name: E2E (NVIDIA L40S x4) Release Branch

# Runs on a fixed schedule and can also be started manually.
on:
  schedule:
    - cron: '0 19 * * 1,3'  # Runs at 7PM UTC every Mon/Wed
  workflow_dispatch: {}

# Workflow-wide environment, inherited by every job and step unless overridden.
env:
  # Scratch directory; the e2e job creates it (mkdir -p) before using it.
  TMPDIR: /home/tmp
  # Release branches checked out by the e2e job (sdg and ilab respectively).
  LATEST_SDG_RELEASE_BRANCH: release-v0.8
  LATEST_ILAB_RELEASE_BRANCH: release-v0.26
jobs:
  # Launches the self-hosted GPU runner on EC2 and exports its runner label,
  # instance id and region so the later jobs can target it and shut it down.
  start-large-ec2-runner:
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
    steps:
      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8  # v5.0.0
        with:
          repository: instructlab/ci-actions
          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of
          # overwriting the current WORKDIR contents
          path: ci-actions
          ref: release-v0.1
          # Only the one action directory used below is materialised.
          sparse-checkout: |
            actions/launch-ec2-runner-with-fallback

      - name: Launch EC2 Runner with Fallback
        id: launch-ec2-instance-with-fallback
        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
        env:
          # Overrides the workflow-wide TMPDIR (/home/tmp), which this job
          # never creates.
          TMPDIR: "/tmp"
        with:
          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          # JSON list of candidate regions, each with its subnets, AMI and
          # security group.
          regions_config: >
            [
              {
                "region": "us-east-2",
                "subnets": {
                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
              },
              {
                "region": "us-east-1",
                "subnets": {
                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
                },
                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
              }
            ]
          try_spot_instance_first: false
          ec2_instance_type: g6e.12xlarge
          # NOTE(review): this workflow is only triggered by schedule and
          # workflow_dispatch, so `github.event.number` presumably expands to
          # an empty string and the GitHubPR tag carries no value - confirm
          # whether the tag is still wanted here.
          aws_resource_tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
            ]

  # Runs the ilab e2e suite on the EC2 runner started above, with the sdg
  # release branch installed on top of the ilab release branch.
  e2e-large-test:
    needs:
      - start-large-ec2-runner
    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}
    # NOTE(review): no step below obviously interacts with pull requests -
    # confirm this permission is still required.
    permissions:
      pull-requests: write
    steps:
      - name: Install Packages
        run: |
          cat /etc/os-release
          mkdir -p /home/tmp
          sudo dnf install -y gcc gcc-c++ make git-core python3.11 python3.11-devel

      - name: Checkout instructlab/instructlab
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8  # v5.0.0
        with:
          repository: "instructlab/instructlab"
          ref: ${{ env.LATEST_ILAB_RELEASE_BRANCH }}
          path: "instructlab"
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      # No `repository` input is given, so actions/checkout uses the
      # repository this workflow runs in.
      - name: Checkout instructlab/sdg
        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8  # v5.0.0
        with:
          path: "sdg"
          ref: ${{ env.LATEST_SDG_RELEASE_BRANCH }}
          # https://github.com/actions/checkout/issues/249
          fetch-depth: 0

      - name: Install ilab
        working-directory: ./instructlab
        run: |
          PYTHON="python3.11" ./scripts/install-ilab-with-cuda.sh

      - name: Update instructlab-sdg library
        working-directory: ./sdg
        run: |
          . ../instructlab/venv/bin/activate
          # Patch out our own pin from the ilab repo constraints file
          ilab_constraints=../instructlab/constraints-dev.txt
          sed -i '/instructlab-sdg==/d' "$ilab_constraints"
          # Since we reuse the virtual environment prepared using ilab
          # constraints, we should stick to the same constraints when
          # installing latest sdg.
          #
          # FIXME: this is not ideal; a proper fix would require decoupling the
          # two repos in CI: either by removing the job completely and relying
          # on "sdk" (no ilab) test runs; or by preparing a separate
          # constraints file that would consider both the requirements files
          # for the sdg library AND for the ilab - so that they are
          # consistent.
          pip install -c "$ilab_constraints" .

      - name: Check disk before tests
        run: |
          df -h
          df -i

      - name: Run e2e test
        working-directory: ./instructlab
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          . venv/bin/activate
          ./scripts/e2e-ci.sh -l -p

      # always(): report disk usage even when the test step failed.
      - name: Check disk after tests
        if: ${{ always() }}
        run: |
          df -h
          df -i

      - name: Send Discord notification for failure
        if: failure()
        uses: sarisia/actions-status-discord@11a0bfe3b50977e38aa2bd4a4ebd296415e83c19  # v1.15.4
        with:
          webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }}
          status: ${{ job.status }}
          title: "e2e-nvidia-l40s-x4"
          description: |
            Job in **${{ github.repository }}** running on branch `${{ env.LATEST_SDG_RELEASE_BRANCH }}` completed **with failures** ❌
            Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.
          color: 0xCB2431  # Red color for failure

      - name: Send Discord notification for success
        if: success()
        uses: sarisia/actions-status-discord@11a0bfe3b50977e38aa2bd4a4ebd296415e83c19  # v1.15.4
        with:
          webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }}
          status: ${{ job.status }}
          title: "e2e-nvidia-l40s-x4"
          description: |
            Job in **${{ github.repository }}** running on branch `${{ env.LATEST_SDG_RELEASE_BRANCH }}` completed **successfully** ✅
            Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details.
          color: 0x28A745  # Green color for success

  # Stops the EC2 runner. always() makes this job run even when an earlier
  # job failed or was cancelled.
  stop-large-ec2-runner:
    needs:
      - start-large-ec2-runner
      - e2e-large-test
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a  # v4.3.1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          # Region the start job reported for the launched instance.
          aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b  # v2.4.1
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-large-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}