Skip to content

Commit

Permalink
[ci] Add AMDGPU related ci (#6743)
Browse files Browse the repository at this point in the history
Issue: #6434

### Brief Summary
Add the logic of docker for AMDGPU ci
1. The LLVM used, which contains only the AMDGPU and X86 targets, comes
from the docker image
2. Using AMDGPU in docker requires that `/dev/kfd` and the directory
`/dev/dri` be mounted. By default, the `dev` user has no permission to
access these character devices.
3. `TI_WITH_CUDA=OFF`
4. `TI_RUN_RELEASE_TESTS=OFF`
5. Currently only the CPU-related tests are run
  • Loading branch information
galeselee authored Dec 6, 2022
1 parent f37718f commit 7be40a5
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 15 deletions.
23 changes: 23 additions & 0 deletions .github/workflows/scripts/common-utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,29 @@ function ci-docker-run-gpu {
$@
}

# Runs a command in the CI docker container with AMD GPU access.
#
# Probes X displays :0 through :9 with `xset` and uses the first one that
# answers. The container is given the /dev/kfd and /dev/dri devices, the
# `video` group, the host X11 socket directory, and GPU_TEST=ON /
# AMDGPU_TEST=ON in its environment.
#
# Arguments: forwarded verbatim to ci-docker-run after the options above
#            (extra docker options, image, command).
# Outputs:   "No display!" on stderr when no X display answers.
# Exits:     1 (terminating the calling shell) when no display is found.
function ci-docker-run-amdgpu {
  local display=""
  local i

  for i in {0..9}; do
    if xset -display ":$i" -q >/dev/null 2>&1; then
      display=":$i"
      break
    fi
  done

  # Track success explicitly: `$?` after the loop is 0 both when `break`
  # ran and when every probe failed, so it cannot tell the two cases apart.
  if [ -z "$display" ]; then
    echo "No display!" >&2
    exit 1
  fi

  # "$@" is quoted so arguments containing whitespace reach docker intact.
  ci-docker-run \
    --device=/dev/kfd \
    --device=/dev/dri \
    --group-add=video \
    -e DISPLAY="$display" \
    -e GPU_TEST=ON \
    -e AMDGPU_TEST=ON \
    -v /tmp/.X11-unix:/tmp/.X11-unix \
    "$@"
}

function setup-android-ndk-env {
export ANDROID_NDK_ROOT=${ANDROID_NDK_ROOT:-/android-sdk/ndk-bundle}
export ANDROID_CMAKE_ARGS="-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DANDROID_NATIVE_API_LEVEL=29 -DANDROID_ABI=arm64-v8a"
Expand Down
27 changes: 17 additions & 10 deletions .github/workflows/scripts/unix-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,26 @@ set -ex
[[ "$IN_DOCKER" == "true" ]] && cd taichi

if [[ $OSTYPE == "linux-"* ]]; then
if [ ! -d ~/taichi-llvm-15 ]; then
pushd ~
if [ -f /etc/centos-release ] ; then
# FIXME: prebuilt llvm15 on ubuntu didn't work on manylinux image of centos. Once that's fixed, remove this hack.
wget https://github.com/ailzhang/torchhub_example/releases/download/0.3/taichi-llvm-15-linux.zip
else
wget https://github.com/taichi-dev/taichi_assets/releases/download/llvm15/taichi-llvm-15-linux.zip
if [ ! -z "$AMDGPU_TEST" ]; then
sudo ln -s /usr/bin/clang++-10 /usr/bin/clang++
sudo ln -s /usr/bin/clang-10 /usr/bin/clang
sudo ln -s /usr/bin/ld.lld-10 /usr/bin/ld.lld
export LLVM_DIR="/taichi-llvm-15.0.0-linux"
else
if [ ! -d ~/taichi-llvm-15 ]; then
pushd ~
if [ -f /etc/centos-release ] ; then
# FIXME: prebuilt llvm15 on ubuntu didn't work on manylinux image of centos. Once that's fixed, remove this hack.
wget https://github.com/ailzhang/torchhub_example/releases/download/0.3/taichi-llvm-15-linux.zip
else
wget https://github.com/taichi-dev/taichi_assets/releases/download/llvm15/taichi-llvm-15-linux.zip
fi
unzip taichi-llvm-15-linux.zip && rm taichi-llvm-15-linux.zip
popd
fi
unzip taichi-llvm-15-linux.zip && rm taichi-llvm-15-linux.zip
popd
export LLVM_DIR="$HOME/taichi-llvm-15"
fi

export LLVM_DIR="$HOME/taichi-llvm-15"
elif [ "$(uname -s):$(uname -m)" == "Darwin:arm64" ]; then
# The following commands are done manually to save time.
if [ ! -d ~/taichi-llvm-15-m1 ]; then
Expand Down
8 changes: 8 additions & 0 deletions .github/workflows/scripts/unix_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ setup_python

[[ "$IN_DOCKER" == "true" ]] && cd taichi

# AMDGPU runs only: make /dev/kfd and every entry under /dev/dri readable
# and writable by all users, since the container user cannot access these
# device nodes by default.
if [[ -n "$AMDGPU_TEST" ]]; then
  sudo chmod 666 /dev/kfd
  sudo chmod 666 /dev/dri/*
fi

python3 -m pip install dist/*.whl
if [ -z "$GPU_TEST" ]; then
python3 -m pip install -r requirements_test.txt
Expand Down Expand Up @@ -103,6 +108,9 @@ if [ -z "$GPU_TEST" ]; then
fi
python3 tests/run_tests.py -vr2 -t4 -k "not paddle" -a "$TI_WANTED_ARCHS"
fi
elif [ ! -z "$AMDGPU_TEST" ]; then
run-it cpu $(nproc)
# run-it amdgpu 4
else
run-it cuda 8
run-it cpu $(nproc)
Expand Down
68 changes: 68 additions & 0 deletions .github/workflows/testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,74 @@ jobs:
path: taichi-release-tests/bad-compare/*
retention-days: 7

# Builds Taichi and runs its tests inside the AMDGPU docker image on a
# self-hosted runner labelled `amdgpu`.
build_and_test_amdgpu_linux:
name: Build and Test (AMDGPU)
needs: check_files
# 120 minutes for runs triggered by the '0 18 * * *' schedule, 90 otherwise.
timeout-minutes: ${{ github.event.schedule != '0 18 * * *' && 90 || 120 }}

runs-on: [self-hosted, amdgpu]


steps:
- uses: actions/checkout@v3
with:
submodules: 'recursive'
fetch-depth: '0'

# Exports CI_DOCKER_RUN_EXTRA_ARGS (a bind mount of the checkout at
# /home/dev/taichi) to later steps through $GITHUB_ENV.
# NOTE(review): presumably consumed by ci-docker-run — confirm in common-utils.sh.
- name: Prepare Environment
run: |
. .github/workflows/scripts/common-utils.sh
prepare-build-cache
echo CI_DOCKER_RUN_EXTRA_ARGS="-v $(pwd):/home/dev/taichi" >> $GITHUB_ENV
# Exits 0 without building when check_files reports run_job == false;
# otherwise runs unix-build.sh in the AMDGPU image with Vulkan and CUDA
# disabled and tests enabled.
- name: Build & Install
run: |
[[ ${{needs.check_files.outputs.run_job}} == false ]] && exit 0
. .github/workflows/scripts/common-utils.sh
ci-docker-run-amdgpu --name taichi-build \
registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.3 \
/home/dev/taichi/.github/workflows/scripts/unix-build.sh
env:
PY: py38
PROJECT_NAME: taichi
TAICHI_CMAKE_ARGS: >-
-DTI_WITH_VULKAN:BOOL=OFF
-DTI_BUILD_TESTS:BOOL=ON
-DTI_WITH_CUDA:BOOL=OFF
# Same run_job guard as the build step; runs unix_test.sh in the same image.
# The step id `test` is referenced by the two upload steps below.
- name: Test
id: test
run: |
[[ ${{needs.check_files.outputs.run_job}} == false ]] && exit 0
. .github/workflows/scripts/common-utils.sh
ci-docker-run-amdgpu --name taichi-test \
registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.3 \
/home/dev/taichi/.github/workflows/scripts/unix_test.sh
env:
PY: py38
TI_WANTED_ARCHS: 'cpu,amdgpu'
TI_DEVICE_MEMORY_GB: '1'
TI_RUN_RELEASE_TESTS: '0'

# Uploads dist/* as the `broken-wheel` artifact only when the Test step
# itself failed.
- name: Save wheel if test failed
if: failure() && steps.test.conclusion == 'failure'
uses: actions/upload-artifact@v3
with:
name: broken-wheel
path: dist/*
retention-days: 7

# Uploads taichi-release-tests/bad-compare/* as the `bad-captures` artifact
# only when the Test step itself failed.
- name: Save Bad Captures
if: failure() && steps.test.conclusion == 'failure'
uses: actions/upload-artifact@v3
with:
name: bad-captures
path: taichi-release-tests/bad-compare/*
retention-days: 7


build_and_test_windows:
name: Build and Test Windows
Expand Down
6 changes: 2 additions & 4 deletions ci/Dockerfile.ubuntu.18.04.amdgpu
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ RUN apt-get update && \
# Install LLVM 15
WORKDIR /
# Make sure this URL gets updated each time there is a new prebuilt bin release
RUN wget https://github.com/GaleSeLee/assets/releases/download/v0.0.1/taichi-llvm-15.0.0-linux.zip
RUN wget https://github.com/GaleSeLee/assets/releases/download/v0.0.2/taichi-llvm-15.0.0-linux.zip
RUN unzip taichi-llvm-15.0.0-linux.zip && \
rm taichi-llvm-15.0.0-linux.zip
ENV PATH="/taichi-llvm-15.0.0-linux/bin:$PATH"
Expand All @@ -57,9 +57,7 @@ ENV CXX="clang++-10"

# Create non-root user for running the container
RUN useradd -m -s /bin/bash dev && \
usermod -a -G video dev && \
chmod 666 /dev/kfd && \
chmod 666 /dev/dri/*
usermod -a -G video dev
WORKDIR /home/dev
USER dev

Expand Down
5 changes: 4 additions & 1 deletion ci/Dockerfile.ubuntu.20.04.amdgpu
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# clang++-10 -> clang++ etc
# assets llvm v0.0.1 -> v0.0.2
# apt install lld-10
# Taichi Dockerfile for development
FROM rocm/dev-ubuntu-20.04:5.2

Expand Down Expand Up @@ -38,7 +41,7 @@ RUN apt-get update && \
# Install LLVM 15
WORKDIR /
# Make sure this URL gets updated each time there is a new prebuilt bin release
RUN wget https://github.com/GaleSeLee/assets/releases/download/v0.0.1/taichi-llvm-15.0.0-linux.zip
RUN wget https://github.com/GaleSeLee/assets/releases/download/v0.0.2/taichi-llvm-15.0.0-linux.zip
RUN unzip taichi-llvm-15.0.0-linux.zip && \
rm taichi-llvm-15.0.0-linux.zip
ENV PATH="/taichi-llvm-15.0.0-linux/bin:$PATH"
Expand Down

0 comments on commit 7be40a5

Please sign in to comment.