Skip to content

Commit

Permalink
Merge pull request #260 from nathan-weinberg/aws-runner
Browse files Browse the repository at this point in the history
ci: move E2E runner from github to AWS
  • Loading branch information
nathan-weinberg authored Aug 30, 2024
2 parents 75e3825 + 4ee8804 commit e0e67a0
Show file tree
Hide file tree
Showing 3 changed files with 167 additions and 105 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/e2e-nvidia-a10g-x4.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
iam-role-name: instructlab-ci-runner
aws-resource-tags: >
[
{"Key": "Name", "Value": "instructlab-ci-github-runner"},
{"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
Expand Down
166 changes: 166 additions & 0 deletions .github/workflows/e2e-nvidia-t4-x1.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
# SPDX-License-Identifier: Apache-2.0

name: E2E (NVIDIA Tesla T4 x1)

on:
push:
branches:
- main
- release-*
pull_request_target:
types:
- opened
- synchronize
- reopened
branches:
- main
- release-*

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
start-runner:
name: Start external EC2 runner
runs-on: ubuntu-latest
outputs:
label: ${{ steps.start-ec2-runner.outputs.label }}
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}
- name: Start EC2 runner
id: start-ec2-runner
uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ami-00c51d9c1374eda97
ec2-instance-type: g4dn.2xlarge
subnet-id: subnet-02d230cffd9385bd4
security-group-id: sg-06300447c4a5fbef3
iam-role-name: instructlab-ci-runner
aws-resource-tags: >
[
{"Key": "Name", "Value": "instructlab-ci-github-small-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
e2e:
name: E2E Test
needs: start-runner
runs-on: ${{ needs.start-runner.outputs.label }}

# It is important that this job has no write permissions and has
# no access to any secrets. This part (e2e) is where we are running
# untrusted code from PRs.
permissions: {}

# No step-security/harden-runner since this is a self-hosted runner
steps:
# for debugging
- name: Print environment state
run: |
echo "Current Working Directory: $PWD"
echo "Files in Local Directory:"
ls -l
- name: Checkout instructlab/instructlab
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
repository: "instructlab/instructlab"
path: "instructlab"
fetch-depth: 0

- name: Checkout instructlab/sdg
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
repository: "instructlab/sdg"
path: "sdg"
# https://github.com/actions/checkout/issues/249
fetch-depth: 0

# for debugging
- name: Print environment state
run: |
echo "Current Working Directory: $PWD"
echo "Files in Local Directory:"
ls -l
- name: Fetch and checkout PR
id: fetch_pr
if: github.event_name == 'pull_request_target'
working-directory: ./sdg
run: |
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-${{ github.event.pull_request.number }}
git checkout pr-${{ github.event.pull_request.number }}
- name: Install system packages
run: |
cat /etc/os-release
sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
- name: Install instructlab
working-directory: ./instructlab
run: |
export PATH="/home/ec2-user/.local/bin:/usr/local/cuda/bin:$PATH"
python3.11 -m venv --upgrade-deps venv
. venv/bin/activate
nvidia-smi
python3.11 -m pip cache remove llama_cpp_python
CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install .
# https://github.com/instructlab/instructlab/issues/1821
# install with Torch and build dependencies installed
python3.11 -m pip install packaging wheel setuptools-scm
python3.11 -m pip install .[cuda]
- name: Update instructlab-sdg library
working-directory: ./sdg
run: |
. ../instructlab/venv/bin/activate
pip install .
pip install .[cuda]
- name: Run e2e test
working-directory: ./instructlab
run: |
. venv/bin/activate
./scripts/basic-workflow-tests.sh -m
stop-runner:
name: Stop external EC2 runner
needs:
- start-runner
- e2e
runs-on: ubuntu-latest
if: ${{ always() }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}
- name: Stop EC2 runner
uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}

e2e-workflow-complete:
# we don't want to block PRs on failed EC2 cleanup
# so not requiring "stop-runner" as well
needs: ["start-runner", "e2e"]
runs-on: ubuntu-latest
steps:
- name: E2E Workflow Complete
run: echo "E2E Workflow Complete"
104 changes: 0 additions & 104 deletions .github/workflows/e2e.yml

This file was deleted.

0 comments on commit e0e67a0

Please sign in to comment.