Skip to content

Merge pull request #697 from instructlab/dependabot/github_actions/py… #375

Merge pull request #697 from instructlab/dependabot/github_actions/py…

Merge pull request #697 from instructlab/dependabot/github_actions/py… #375

# SPDX-License-Identifier: Apache-2.0
name: Functional GPU (NVIDIA Tesla T4 x1)
on:
workflow_dispatch: {}
# run against every merge commit to 'main' and release branches
push:
branches:
- main
- release-*
# only run on PRs that touch certain regex paths
pull_request_target:
branches:
- main
- release-*
paths:
# note this should match the merging criteria in 'mergify.yml'
- "**.py"
- "pyproject.toml"
- "requirements**.txt"
- "constraints-dev.txt"
- "tox.ini"
- ".github/workflows/functional-gpu-nvidia-t4-x1.yml" # This workflow
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
env:
LC_ALL: en_US.UTF-8
defaults:
run:
shell: bash
permissions:
contents: read
jobs:
start-small-ec2-runner:
runs-on: ubuntu-latest
outputs:
label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
steps:
- name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
repository: instructlab/ci-actions
# clone the "ci-actions" repo to a local directory called "ci-actions", instead of
# overwriting the current WORKDIR contents
path: ci-actions
ref: release-v0.1
sparse-checkout: |
actions/launch-ec2-runner-with-fallback
- name: Launch EC2 Runner with Fallback
id: launch-ec2-instance-with-fallback
uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
env:
TMPDIR: "/tmp"
with:
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
regions_config: >
[
{
"region": "us-east-2",
"subnets": {
"us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
"us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
"us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
},
"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
},
{
"region": "us-east-1",
"subnets": {
"us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
"us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
"us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
"us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
"us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
"us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
},
"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
}
]
try_spot_instance_first: false
ec2_instance_type: g4dn.2xlarge
aws_resource_tags: >
[
{"Key": "Name", "Value": "instructlab-ci-github-small-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
{"Key": "GitHubRef", "Value": "${{ github.ref }}"},
{"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
]
functional-gpu-small-test:
needs:
- start-small-ec2-runner
runs-on: ${{ needs.start-small-ec2-runner.outputs.label }}
# It is important that this job has no write permissions and has
# no access to any secrets. This part is where we are running
# untrusted code from PRs.
permissions: {}
steps:
- name: Install Packages
run: |
cat /etc/os-release
sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
- name: Checkout instructlab/sdg
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
# https://github.com/actions/checkout/issues/249
fetch-depth: 0
- name: Fetch and checkout PR
if: github.event_name == 'pull_request_target'
run: |
git fetch origin pull/${{ github.event.pull_request.number }}/merge:pr-merge-${{ github.event.pull_request.number }}
git checkout pr-merge-${{ github.event.pull_request.number }}
git log -1 --format="%H %s"
- name: Install instructlab/sdg
run: |
export PATH="/home/ec2-user/.local/bin:/usr/local/cuda/bin:$PATH"
python3.11 -m venv --upgrade-deps venv
. venv/bin/activate
nvidia-smi
python3.11 -m pip install tox tox-gh>=1.2 -c constraints-dev.txt
python3.11 -m pip cache remove llama_cpp_python
CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install -r requirements-dev.txt -c constraints-dev.txt
- name: Check disk before tests
run: |
df -h
df -i
- name: Run functional gpu tests with tox
run: |
. venv/bin/activate
tox -e functional-gpu
- name: Check disk after tests
if: ${{ always() }}
run: |
df -h
df -i
stop-small-ec2-runner:
needs:
- start-small-ec2-runner
- functional-gpu-small-test
runs-on: ubuntu-latest
if: ${{ always() }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4.3.1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ needs.start-small-ec2-runner.outputs.ec2-instance-region }}
- name: Stop EC2 runner
uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-small-ec2-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-small-ec2-runner.outputs.ec2-instance-id }}
functional-gpu-small-workflow-complete:
# we don't want to block PRs on failed EC2 cleanup
# so not requiring "stop-small-ec2-runner" as well
needs: ["start-small-ec2-runner", "functional-gpu-small-test"]
runs-on: ubuntu-latest
steps:
- name: Functional GPU Workflow Complete
run: echo "Functional GPU Workflow Complete"