diff --git a/.github/mergify.yml b/.github/mergify.yml
index f33a7cc6..e5c69650 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -27,6 +27,21 @@ pull_request_rules:
           - -files~=^\.github/(actions|workflows)/.*\.ya?ml$
           - -files~=^\.github/workflows/actionlint\.
 
+      # e2e medium workflow (check is required only when the PR touches a file that triggers it)
+      - or:
+        - and:
+          - check-success~=e2e-medium-workflow-complete
+          - or:
+            - files~=\.py$
+            - files=pyproject.toml
+            - files~=^requirements.*\.txt$
+            - files=.github/workflows/e2e-nvidia-a10g-x1.yml
+        - and:
+          - -files~=\.py$
+          - -files=pyproject.toml
+          - -files~=^requirements.*\.txt$
+          - -files=.github/workflows/e2e-nvidia-a10g-x1.yml
+
       # e2e small workflow
       - or:
         - and:
diff --git a/.github/workflows/e2e-nvidia-a10g-x1.yml b/.github/workflows/e2e-nvidia-a10g-x1.yml
new file mode 100644
index 00000000..d5b845a7
--- /dev/null
+++ b/.github/workflows/e2e-nvidia-a10g-x1.yml
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: E2E (NVIDIA A10G x1)
+
+on:
+  # run against every merge commit to 'main' and release branches
+  push:
+    branches:
+      - main
+      - release-*
+  # only run on PRs that touch files matching these glob patterns
+  pull_request_target:
+    branches:
+      - main
+      - release-*
+    paths:
+      # note this should match the merging criteria in 'mergify.yml'
+      - '**.py'
+      - 'pyproject.toml'
+      - 'requirements**.txt'
+      - '.github/workflows/e2e-nvidia-a10g-x1.yml' # This workflow
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.number || github.ref }}
+  cancel-in-progress: true
+
+env:
+  LC_ALL: en_US.UTF-8
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  contents: read
+
+jobs:
+  start-medium-ec2-runner:
+    runs-on: ubuntu-latest
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }}
+
+      - name: Start EC2 runner
+        id: start-ec2-runner
+        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          ec2-image-id: ami-01a89eee1adde309c
+          ec2-instance-type: g5.4xlarge
+          subnet-id: subnet-02d230cffd9385bd4
+          security-group-id: sg-06300447c4a5fbef3
+          iam-role-name: instructlab-ci-runner
+          aws-resource-tags: >
+            [
+              {"Key": "Name", "Value": "instructlab-ci-github-medium-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
+              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
+              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
+            ]
+
+  e2e-medium-test:
+    needs:
+      - start-medium-ec2-runner
+    runs-on: ${{ needs.start-medium-ec2-runner.outputs.label }}
+
+    # It is important that this job has no write permissions and has
+    # no access to any secrets. This part (e2e) is where we are running
+    # untrusted code from PRs.
+    permissions: {}
+
+    steps:
+      - name: Install Packages
+        run: |
+          cat /etc/os-release
+          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
+
+      - name: Checkout instructlab/instructlab
+        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        with:
+          repository: "instructlab/instructlab"
+          path: "instructlab"
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      - name: Checkout instructlab/sdg
+        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        with:
+          repository: "instructlab/sdg"
+          path: "sdg"
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      - name: Fetch and checkout PR
+        if: ${{ github.event_name == 'pull_request_target' }}
+        working-directory: ./sdg
+        run: |
+          git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-${{ github.event.pull_request.number }}
+          git checkout pr-${{ github.event.pull_request.number }}
+
+      - name: Install ilab
+        working-directory: ./instructlab
+        run: |
+          export CUDA_HOME="/usr/local/cuda"
+          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
+          export PATH="$PATH:$CUDA_HOME/bin"
+          python3.11 -m venv --upgrade-deps venv
+          . venv/bin/activate
+          nvidia-smi
+          python3.11 -m pip cache remove llama_cpp_python
+
+          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install .
+
+          # https://github.com/instructlab/instructlab/issues/1821
+          # install with Torch and build dependencies installed
+          python3.11 -m pip install packaging wheel setuptools-scm
+          python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt
+
+      - name: Update instructlab-sdg library
+        working-directory: ./sdg
+        run: |
+          . ../instructlab/venv/bin/activate
+          pip install .
+
+      - name: Check disk
+        run: |
+          df -h
+
+      - name: Run e2e test
+        working-directory: ./instructlab
+        run: |
+          . venv/bin/activate
+          ./scripts/e2e-ci.sh -m
+
+  stop-medium-ec2-runner:
+    needs:
+      - start-medium-ec2-runner
+      - e2e-medium-test
+    runs-on: ubuntu-latest
+    if: ${{ always() }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }}
+
+      - name: Stop EC2 runner
+        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-medium-ec2-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}
+
+  e2e-medium-workflow-complete:
+    # we don't want to block PRs on failed EC2 cleanup
+    # so not requiring "stop-medium-ec2-runner" as well
+    needs: ["start-medium-ec2-runner", "e2e-medium-test"]
+    runs-on: ubuntu-latest
+    steps:
+      - name: E2E Workflow Complete
+        run: echo "E2E Workflow Complete"
diff --git a/.github/workflows/e2e-nvidia-a10g-x4.yml b/.github/workflows/e2e-nvidia-a10g-x4.yml
deleted file mode 100644
index d7aa977d..00000000
--- a/.github/workflows/e2e-nvidia-a10g-x4.yml
+++ /dev/null
@@ -1,181 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-name: E2E (NVIDIA A10G x4 - full pipeline)
-
-on:
-  workflow_dispatch:
-    inputs:
-      pr_or_branch:
-        description: 'pull request number or branch name'
-        required: true
-        default: 'main'
-
-jobs:
-  start-runner:
-    name: Start external EC2 runner
-    runs-on: ubuntu-latest
-    outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
-        with:
-          mode: start
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ami-00c51d9c1374eda97
-          ec2-instance-type: g5.12xlarge
-          subnet-id: subnet-02d230cffd9385bd4
-          security-group-id: sg-06300447c4a5fbef3
-          iam-role-name: instructlab-ci-runner
-          aws-resource-tags: >
-            [
-              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
-              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
-              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
-              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
-            ]
-
-  e2e:
-    name: E2E Test
-    needs: start-runner
-    runs-on: ${{ needs.start-runner.outputs.label }}
-
-    permissions:
-      pull-requests: write
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
-        with:
-          fetch-depth: 0
-
-      - name: Determine if pr_or_branch is a PR number
-        id: check_pr
-        run: |
-          if [[ "${{ github.event.inputs.pr_or_branch }}" =~ ^[0-9]+$ ]]; then
-            echo "is_pr=true" >> "$GITHUB_OUTPUT"
-          else
-            echo "is_pr=false" >> "$GITHUB_OUTPUT"
-          fi
-
-      - name: Check if gh cli is installed
-        id: gh_cli
-        run: |
-          if command -v gh &> /dev/null ; then
-            echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
-          else
-            echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
-          fi
-
-      - name: Install gh CLI
-        if: steps.gh_cli.outputs.gh_cli_installed == 'false'
-        run: |
-          sudo dnf install 'dnf-command(config-manager)' -y
-          sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
-          sudo dnf install gh --repo gh-cli -y
-
-      - name: test gh CLI
-        run: |
-          gh --version
-
-      - name: set default repo
-        run: |
-          gh repo set-default ${{ github.server_url }}/${{ github.repository }}
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Add comment to PR
-        if: steps.check_pr.outputs.is_pr == 'true'
-        run: |
-          gh pr comment "${{ github.event.inputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Fetch and checkout PR
-        if: steps.check_pr.outputs.is_pr == 'true'
-        run: |
-          gh pr checkout ${{ github.event.inputs.pr_or_branch }}
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Checkout branch
-        if: steps.check_pr.outputs.is_pr == 'false'
-        run: |
-          git checkout ${{ github.event.inputs.pr_or_branch }}
-
-      - name: Install Packages
-        run: |
-          cat /etc/os-release
-          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
-
-      - name: Install ilab
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          export PATH="/home/ec2-user/.local/bin:/usr/local/cuda/bin:$PATH"
-          python3.11 -m venv --upgrade-deps venv
-          . venv/bin/activate
-          git clone https://github.com/instructlab/instructlab
-          cd instructlab
-          sed 's/\[.*\]//' requirements.txt > constraints.txt
-          python3.11 -m pip cache remove llama_cpp_python
-          CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3.11 -m pip install --force-reinstall --no-binary llama_cpp_python -c constraints.txt llama_cpp_python
-          python3.11 -m pip install bitsandbytes
-          python3.11 -m pip install .
-
-      - name: Install sdg
-        run: |
-          . venv/bin/activate
-          python3.11 -m pip install .
-
-      - name: Run e2e test
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: |
-          . venv/bin/activate
-          cd instructlab
-          SKIP_TRAIN=1 ./scripts/e2e-custom.sh -mFM
-
-      - name: Add comment to PR if the workflow failed
-        if: failure() && steps.check_pr.outputs.is_pr == 'true'
-        run: |
-          gh pr comment "${{ github.event.inputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Add comment to PR if the workflow succeeded
-        if: success() && steps.check_pr.outputs.is_pr == 'true'
-        run: |
-          gh pr comment "${{ github.event.inputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-  stop-runner:
-    name: Stop external EC2 runner
-    needs:
-      - start-runner
-      - e2e
-    runs-on: ubuntu-latest
-    if: ${{ always() }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ secrets.AWS_REGION }}
-      - name: Stop EC2 runner
-        uses: machulav/ec2-github-runner@v2
-        with:
-          mode: stop
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
diff --git a/README.md b/README.md
index 176cf603..adc9c5ca 100644
--- a/README.md
+++ b/README.md
@@ -6,5 +6,6 @@
 ![License](https://img.shields.io/github/license/instructlab/sdg)
 
 ![`e2e-nvidia-t4-x1.yaml` on `main`](https://github.com/instructlab/sdg/actions/workflows/e2e-nvidia-t4-x1.yml/badge.svg?branch=main)
+![`e2e-nvidia-a10g-x1.yaml` on `main`](https://github.com/instructlab/sdg/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main)
 
 Python library for Synthetic Data Generation