Skip to content

Commit 769ed77

Browse files
ko3n1g authored and ksivaman committed
ci: Build and attach bdist wheels to release page (#2138)
* ci: Build and attach bdist wheels to release page Signed-off-by: oliver könig <[email protected]> * free up space Signed-off-by: oliver könig <[email protected]> * cleanup Signed-off-by: oliver könig <[email protected]> * test Signed-off-by: oliver könig <[email protected]> * test Signed-off-by: oliver könig <[email protected]> * test Signed-off-by: oliver könig <[email protected]> * fix Signed-off-by: oliver könig <[email protected]> * test Signed-off-by: oliver könig <[email protected]> * fix Signed-off-by: oliver könig <[email protected]> * fix Signed-off-by: oliver könig <[email protected]> * fix Signed-off-by: oliver könig <[email protected]> * fix Signed-off-by: oliver könig <[email protected]> * c28619d8999a147d5e09c1199f84ff6af6ad5794 Signed-off-by: oliver könig <[email protected]> * c28619d8999a147d5e09c1199f84ff6af6ad5794 Signed-off-by: oliver könig <[email protected]> * Reduce months to check from 7 to 5 Signed-off-by: oliver könig <[email protected]> * Update .github/scripts/check_for_ngc_images.sh Signed-off-by: Kirthi Shankar Sivamani <[email protected]> * Update .github/actions/build-pytorch-wheel/build.sh Signed-off-by: Kirthi Shankar Sivamani <[email protected]> --------- Signed-off-by: oliver könig <[email protected]> Signed-off-by: Kirthi Shankar Sivamani <[email protected]> Co-authored-by: Kirthi Shankar Sivamani <[email protected]>
1 parent 6b815f8 commit 769ed77

File tree

8 files changed

+548
-24
lines changed

8 files changed

+548
-24
lines changed
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# See LICENSE for license information.
4+
# Base OS for the wheel-build image.
FROM ubuntu:22.04

# Build-time only: keeps apt from prompting during the RUN steps of this
# stage without leaving the setting in containers started from the image.
ARG DEBIAN_FRONTEND=noninteractive

# Toolchain versions; each can be overridden with --build-arg.
ARG PYTHON_VERSION=3.12
ARG TORCH_VERSION=2.9.1
ARG CUDA_VERSION=12.9.1
ARG CUDNN_MAJOR_VERSION=9
# Architecture path segment of the NVIDIA apt repository URL.
ARG AARCH=x86_64

ENV CUDA_HOME=/usr/local/cuda
# /opt/venv/bin is searched first, the CUDA binaries last.
ENV PATH=/opt/venv/bin:$PATH:$CUDA_HOME/bin
# Nothing earlier in this file defines LD_LIBRARY_PATH, so appending
# ":$LD_LIBRARY_PATH" would only add a trailing ':' (an empty entry, which the
# dynamic loader resolves to the current working directory).
ENV LD_LIBRARY_PATH=$CUDA_HOME/lib64
ENV TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.6;9.0"
ENV PYTHONUNBUFFERED=1
21+
# Install Python
# Installs the requested interpreter from the deadsnakes PPA and creates the
# virtualenv at /opt/venv. The apt package lists are removed in the same layer
# so they do not end up in the image.
# NOTE(review): --no-install-recommends is deliberately NOT added here; the
# recommended packages may be what provides the compiler toolchain used by
# later builds - confirm before slimming this step.
RUN apt-get update && \
    apt-get install -y software-properties-common wget && \
    add-apt-repository ppa:deadsnakes/ppa -y && \
    apt-get install -y \
        "python${PYTHON_VERSION}-dev" \
        "python${PYTHON_VERSION}-venv" \
        python3-pip && \
    "python${PYTHON_VERSION}" -m venv /opt/venv && \
    rm -rf /var/lib/apt/lists/*
28+
29+
# Install cuda-toolkit
# Splits CUDA_VERSION (e.g. 12.9.1) into major/minor, registers NVIDIA's apt
# repository through the cuda-keyring package, then installs the toolkit,
# cuDNN, NCCL and cmake.
# `rm -f` replaces `rm ... || true`: inside an `&&` chain, `|| true` also
# swallows a failure of any command that precedes it.
# The apt package lists are removed in the same layer to keep them out of the
# image.
RUN CUDA_MAJOR_VERSION=$(echo "$CUDA_VERSION" | awk -F. '{print $1}') && \
    CUDA_MINOR_VERSION=$(echo "$CUDA_VERSION" | awk -F. '{print $2}') && \
    rm -f /etc/apt/sources.list.d/cuda*.list /etc/apt/sources.list.d/nvidia-cuda.list && \
    wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${AARCH}/cuda-keyring_1.1-1_all.deb" && \
    dpkg -i cuda-keyring_1.1-1_all.deb && \
    rm cuda-keyring_1.1-1_all.deb && \
    apt-get update && \
    apt-get install -y \
        "cuda-toolkit-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}" \
        "cudnn-cuda-${CUDA_MAJOR_VERSION}" \
        "libcudnn${CUDNN_MAJOR_VERSION}-cuda-${CUDA_MAJOR_VERSION}" \
        libnccl2 \
        libnccl-dev \
        cmake && \
    rm -rf /var/lib/apt/lists/*
40+
# Install PyTorch
# Derives the wheel index suffix (cuNNN) from TORCH_VERSION and CUDA_VERSION:
# the lookup tables give, per torch minor version, the suffix used when the
# CUDA version is below 12.0 (minv) and the one used otherwise (maxv).
# The variables are assigned first and exported afterwards: `export VAR=$(cmd)`
# returns the status of `export`, which would hide a failing lookup (for
# example a torch version missing from the tables) and let pip run against a
# malformed index URL.
RUN MATRIX_CUDA_VERSION=$(echo "$CUDA_VERSION" | awk -F. '{print $1 $2}') && \
    MATRIX_TORCH_VERSION=$(echo "$TORCH_VERSION" | awk -F. '{print $1 "." $2}') && \
    export MATRIX_CUDA_VERSION MATRIX_TORCH_VERSION && \
    TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
        minv = {'2.5': 118, '2.6': 118, '2.7': 118, '2.8': 126, '2.9': 126}[env['MATRIX_TORCH_VERSION']]; \
        maxv = {'2.5': 124, '2.6': 126, '2.7': 128, '2.8': 129, '2.9': 130}[env['MATRIX_TORCH_VERSION']]; \
        print(minv if int(env['MATRIX_CUDA_VERSION']) < 120 else maxv)" \
    ) && \
    pip install --no-cache-dir "torch==${TORCH_VERSION}" --index-url "https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}"
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# See LICENSE for license information.
4+
name: Build PyTorch Wheel
description: Builds a PyTorch wheel for TransformerEngine

# Values supplied by the calling workflow.
inputs:
  # Git ref that is checked out and built.
  release-version:
    description: "The release version to use for the build"
    required: true
  python-version:
    description: "The Python version to use for the build"
    required: true
  cuda-version:
    description: "The CUDA version to use for the build"
    required: true
  # Passed to the image build as CUDNN_MAJOR_VERSION.
  cudnn-version:
    description: "The cuDNN version to use for the build"
    required: true
  torch-version:
    description: "The PyTorch version to use for the build"
    required: true
  cxx11_abi:
    description: "Enable torch flag C++11 ABI (TRUE/FALSE)"
    required: true
  # When left empty the build image is built from the Dockerfile in this
  # action; otherwise the given image is pulled and used as-is.
  base-image:
    description: "The base image to use for the build"
    required: false
  aarch:
    description: "The architecture to use for the build"
    required: true

# Values exposed to later steps of the calling workflow.
outputs:
  wheel_name:
    description: "The name of the built wheel"
    value: ${{ steps.build_wheel.outputs.wheel_name }}
37+
runs:
  using: 'composite'
  steps:
    # Docker's data directory is moved aside, the build-space action mounts
    # its volume at /var/lib/docker/, and the contents are moved back onto it.
    - name: Move /var/lib/docker/
      shell: bash -euxo pipefail {0}
      run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker"

    - name: Maximize build space
      uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794
      with:
        root-reserve-mb: 5120
        temp-reserve-mb: 32
        swap-size-mb: 10240
        remove-dotnet: 'true'
        remove-android: 'true'
        remove-haskell: 'true'
        remove-codeql: 'true'
        build-mount-path: '/var/lib/docker/'

    - name: Restore /var/lib/docker/
      shell: bash -euxo pipefail {0}
      run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker"

    # Sources of the release that is being built.
    - name: Checkout
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.release-version }}
        submodules: recursive

    # Second checkout (no explicit ref) into build-tools/; the Dockerfile and
    # build.sh used below are taken from there.
    - name: Checkout build tools
      uses: actions/checkout@v4
      with:
        path: build-tools
        submodules: recursive

    # Produces the local image tag `transformer-engine-build`, either built
    # from the action's Dockerfile or re-tagged from the given base image.
    - name: Build image
      shell: bash -euxo pipefail {0}
      env:
        BASE_IMAGE: ${{ inputs.base-image }}
      run: |
        if [[ "${BASE_IMAGE}" == "" ]]; then
          docker build \
            -t transformer-engine-build \
            -f build-tools/.github/actions/build-pytorch-wheel/Dockerfile \
            --build-arg PYTHON_VERSION=${{ inputs.python-version }} \
            --build-arg TORCH_VERSION=${{ inputs.torch-version }} \
            --build-arg CUDA_VERSION=${{ inputs.cuda-version }} \
            --build-arg CUDNN_MAJOR_VERSION=${{ inputs.cudnn-version }} \
            --build-arg AARCH=${{ inputs.aarch }} \
            .
        else
          docker pull ${BASE_IMAGE}
          docker tag ${BASE_IMAGE} transformer-engine-build
        fi

    - name: Build wheel
      shell: bash -euxo pipefail {0}
      id: build_wheel
      env:
        CXX11_ABI: ${{ inputs.cxx11_abi }}
      run: |
        echo ::group::Build wheel

        # build.sh prints the exit status of the wheel build as the last line
        # of its output; `tail -n 1` keeps only that line.
        # $GITHUB_OUTPUT is mounted so build.sh can write the wheel_name output.
        EXIT_CODE=$(docker run \
          --rm \
          --shm-size=64g \
          --workdir /workspace/transformer_engine/pytorch \
          --volume $(pwd):/workspace \
          --volume $GITHUB_OUTPUT:$GITHUB_OUTPUT \
          -e PIP_CONSTRAINT= \
          -e CXX11_ABI=$CXX11_ABI \
          -e GITHUB_OUTPUT=$GITHUB_OUTPUT \
          transformer-engine-build bash /workspace/build-tools/.github/actions/build-pytorch-wheel/build.sh | tail -n 1)

        # Close the log group before exiting; placed after `exit` it never ran.
        echo ::endgroup::

        # Propagate the build's exit status (including a timeout) to the job.
        exit $EXIT_CODE

    - name: Log Built Wheels
      shell: bash -euxo pipefail {0}
      run: |
        ls transformer_engine/pytorch/dist
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
#!/bin/bash

# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

# Builds a bdist wheel with `python setup.py bdist_wheel` from the current
# working directory, renames it to the name reported by
# setup.get_wheel_url(), and appends `wheel_name=<name>` to $GITHUB_OUTPUT.
#
# Environment read:  CXX11_ABI, GITHUB_OUTPUT
# Output contract:   the LAST line printed on stdout is the exit status of the
#                    wheel build (0 on success, timeout's status on timeout).

set -eoxu pipefail

export NVTE_PYTORCH_FORCE_BUILD=TRUE
export NVTE_NO_LOCAL_VERSION=1
export NVTE_PYTORCH_FORCE_CXX11_ABI=$CXX11_ABI
export PIP_CONSTRAINT=

pip install wheel packaging nvidia-mathdx ninja pybind11

# 5h timeout since GH allows max 6h and we want some buffer
EXIT_CODE=0
timeout 5h python setup.py bdist_wheel --dist-dir=dist || EXIT_CODE=$?

if [ "$EXIT_CODE" -eq 0 ]; then
    wheel_name=$(python -c "import setup; print(setup.get_wheel_url()[1])" | tail -n 1)
    # Iterate the glob directly instead of parsing `ls` output. If nothing
    # matches, the literal pattern is passed to mv, which fails and aborts the
    # script (set -e), just as the failing `ls` did before.
    for built_wheel in dist/*whl; do
        # mv refuses to move a file onto itself, so leave a wheel that already
        # carries the final name untouched.
        if [ "$built_wheel" != "dist/${wheel_name}" ]; then
            mv "$built_wheel" "dist/${wheel_name}"
        fi
    done
    echo "wheel_name=${wheel_name}" | tee -a "$GITHUB_OUTPUT"
fi

# Must stay the final line of output: the caller reads it with `tail -n 1`.
echo "$EXIT_CODE"
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
#!/bin/bash

# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

# Probes the registry for the monthly NGC PyTorch images
# (${BASE_IMAGE}:YY.MM${TAG_SUFFIX}) of the current month and the preceding
# MONTHS_TO_CHECK-1 months, and writes the images that exist to
# ngc_images.json as a JSON array of strings ("[]" when none exist).

# Configuration
BASE_IMAGE="nvcr.io/nvidia/pytorch"
TAG_SUFFIX="-py3"
MONTHS_TO_CHECK=5 # Check current month and previous 4 months (total 5)

# Initialize an array to store existing tags
EXISTING_TAGS=()

echo "Checking for existence of the last ${MONTHS_TO_CHECK} NGC PyTorch images: ${BASE_IMAGE}:YY.MM${TAG_SUFFIX}"
echo "---------------------------------------------------------------------"

# First day of the current month; computed once because it does not change
# between iterations. Anchoring on day 01 keeps the month arithmetic below
# independent of how many days each month has.
MONTH_START=$(date +%Y-%m-01)

# Loop through the last N months
for i in $(seq 0 $((MONTHS_TO_CHECK - 1))); do
    # Calculate target month and year (requires GNU date for -d)
    TARGET_DATE=$(date -d "${MONTH_START} -${i} months" +%y.%m)

    # Construct the full image tag and the tag-only string
    IMAGE_TAG="${TARGET_DATE}${TAG_SUFFIX}"
    FULL_IMAGE="${BASE_IMAGE}:${IMAGE_TAG}"

    echo "Checking: ${FULL_IMAGE}"

    # Use 'docker manifest inspect' to check for image existence without pulling.
    if docker manifest inspect "${FULL_IMAGE}" > /dev/null 2>&1; then
        echo "✅ EXISTS: Found."
        # Record the full image reference
        EXISTING_TAGS+=("nvcr.io/nvidia/pytorch:${IMAGE_TAG}")
    else
        echo "❌ MISSING: Not found."
    fi
done

echo "---------------------------------------------------------------------"

## JSON Output Generation
# This uses the collected array to build a JSON string.

if command -v jq &> /dev/null; then
    # Feed one tag per line to jq, split on newlines and drop empty elements.
    # Dropping empties (instead of only the last element) makes an empty tag
    # list serialize as [] rather than [""].
    JSON_STRING=$(printf "%s\n" "${EXISTING_TAGS[@]}" | jq -R -s 'split("\n") | map(select(length > 0))')

    echo "Generated JSON String of Existing Tags:"
    echo "${JSON_STRING}"
else
    echo "WARNING: 'jq' is not installed. Falling back to a compact JSON array built in bash."
    echo "Found Tags: ${EXISTING_TAGS[*]}"

    # The tags are built above from the fixed image name, digits, '.' and the
    # fixed suffix, so they need no JSON escaping.
    JSON_STRING="["
    separator=""
    for tag in "${EXISTING_TAGS[@]}"; do
        JSON_STRING+="${separator}\"${tag}\""
        separator=","
    done
    JSON_STRING+="]"
fi

echo "---"
echo "Check complete."

echo "${JSON_STRING}" > ngc_images.json

0 commit comments

Comments
 (0)