From 7be40a5f9e62e610aac03f29eda4d4148f9eb168 Mon Sep 17 00:00:00 2001
From: Zeyu Li
Date: Wed, 7 Dec 2022 01:59:00 +0800
Subject: [PATCH] [ci] Add AMDGPU related ci (#6743)

Issue: https://github.com/taichi-dev/taichi/issues/6434

### Brief Summary
Add the Docker logic for the AMDGPU CI.
1. The LLVM build used, which only contains the AMDGPU and X86 targets, comes from the Docker image.
2. Using AMDGPU in Docker requires that `/dev/kfd` and the directory `/dev/dri` be mounted. The `dev` user has no permission to access these character devices by default.
3. `TI_WITH_CUDA=OFF`
4. `TI_RUN_RELEASE_TESTS=OFF`
5. Currently only the CPU-related tests are run.
---
Review notes (not part of the commit message):
- ci-docker-run-amdgpu records the detected display in a variable. Testing
  $? after the probe loop never failed, because both `break` and an `if`
  with no branch taken return 0.
- ci-docker-run-amdgpu forwards its arguments as "$@" so they are not re-split.
- unix-build.sh uses `ln -sf` so the step can be re-run in the same container.
- Dockerfile.ubuntu.20.04.amdgpu: the added notes use `#`; `//` is not a
  Dockerfile comment marker.
- `[ ! -z ... ]` is written as `[ -n ... ]`.
- The `index` blob ids below are those of the original commit.
- Indentation and blank context lines were reconstructed from the hunk line
  counts; verify against the target tree before applying.

 .github/workflows/scripts/common-utils.sh | 29 ++++++++++
 .github/workflows/scripts/unix-build.sh   | 28 ++++++----
 .github/workflows/scripts/unix_test.sh    |  9 +++
 .github/workflows/testing.yml             | 68 +++++++++++++++++++++++
 ci/Dockerfile.ubuntu.18.04.amdgpu         |  6 +-
 ci/Dockerfile.ubuntu.20.04.amdgpu         |  5 +-
 6 files changed, 130 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/scripts/common-utils.sh b/.github/workflows/scripts/common-utils.sh
index a0da18266ff3d..aa81fd9043d76 100644
--- a/.github/workflows/scripts/common-utils.sh
+++ b/.github/workflows/scripts/common-utils.sh
@@ -155,6 +155,35 @@ function ci-docker-run-gpu {
         $@
 }
 
+# Run a CI container with the AMD GPU devices and the first usable X display.
+# Probes displays :0 to :9 with xset and exits with status 1 when none answers.
+# All arguments are forwarded to ci-docker-run.
+function ci-docker-run-amdgpu {
+    local i display=""
+    for i in {0..9}; do
+        if xset -display ":$i" -q >/dev/null 2>&1; then
+            display=":$i"
+            break
+        fi
+    done
+
+    # $? cannot be used here: the loop returns 0 whether or not a display matched.
+    if [ -z "$display" ]; then
+        echo "No display!" >&2
+        exit 1
+    fi
+
+    ci-docker-run \
+        --device=/dev/kfd \
+        --device=/dev/dri \
+        --group-add=video \
+        -e DISPLAY="$display" \
+        -e GPU_TEST=ON \
+        -e AMDGPU_TEST=ON \
+        -v /tmp/.X11-unix:/tmp/.X11-unix \
+        "$@"
+}
+
 function setup-android-ndk-env {
     export ANDROID_NDK_ROOT=${ANDROID_NDK_ROOT:-/android-sdk/ndk-bundle}
     export ANDROID_CMAKE_ARGS="-DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DANDROID_NATIVE_API_LEVEL=29 -DANDROID_ABI=arm64-v8a"
diff --git a/.github/workflows/scripts/unix-build.sh b/.github/workflows/scripts/unix-build.sh
index 75ddafb970530..31cd8546c3279 100755
--- a/.github/workflows/scripts/unix-build.sh
+++ b/.github/workflows/scripts/unix-build.sh
@@ -7,19 +7,27 @@ set -ex
 [[ "$IN_DOCKER" == "true" ]] && cd taichi
 
 if [[ $OSTYPE == "linux-"* ]]; then
-    if [ ! -d ~/taichi-llvm-15 ]; then
-        pushd ~
-        if [ -f /etc/centos-release ] ; then
-            # FIXIME: prebuilt llvm15 on ubuntu didn't work on manylinux image of centos. Once that's fixed, remove this hack.
-            wget https://github.com/ailzhang/torchhub_example/releases/download/0.3/taichi-llvm-15-linux.zip
-        else
-            wget https://github.com/taichi-dev/taichi_assets/releases/download/llvm15/taichi-llvm-15-linux.zip
+    if [ -n "$AMDGPU_TEST" ]; then
+        # -f keeps this re-runnable: under `set -e` a plain `ln -s` aborts when the link already exists.
+        sudo ln -sf /usr/bin/clang++-10 /usr/bin/clang++
+        sudo ln -sf /usr/bin/clang-10 /usr/bin/clang
+        sudo ln -sf /usr/bin/ld.lld-10 /usr/bin/ld.lld
+        export LLVM_DIR="/taichi-llvm-15.0.0-linux"
+    else
+        if [ ! -d ~/taichi-llvm-15 ]; then
+            pushd ~
+            if [ -f /etc/centos-release ] ; then
+                # FIXME: prebuilt llvm15 on ubuntu didn't work on manylinux image of centos. Once that's fixed, remove this hack.
+                wget https://github.com/ailzhang/torchhub_example/releases/download/0.3/taichi-llvm-15-linux.zip
+            else
+                wget https://github.com/taichi-dev/taichi_assets/releases/download/llvm15/taichi-llvm-15-linux.zip
+            fi
+            unzip taichi-llvm-15-linux.zip && rm taichi-llvm-15-linux.zip
+            popd
         fi
-        unzip taichi-llvm-15-linux.zip && rm taichi-llvm-15-linux.zip
-        popd
+        export LLVM_DIR="$HOME/taichi-llvm-15"
     fi
 
-    export LLVM_DIR="$HOME/taichi-llvm-15"
 elif [ "$(uname -s):$(uname -m)" == "Darwin:arm64" ]; then
     # The following commands are done manually to save time.
     if [ ! -d ~/taichi-llvm-15-m1 ]; then
diff --git a/.github/workflows/scripts/unix_test.sh b/.github/workflows/scripts/unix_test.sh
index 64d9b1d36a0ea..a2f41fd2d9d34 100755
--- a/.github/workflows/scripts/unix_test.sh
+++ b/.github/workflows/scripts/unix_test.sh
@@ -14,6 +14,12 @@ setup_python
 
 [[ "$IN_DOCKER" == "true" ]] && cd taichi
 
+if [ -n "$AMDGPU_TEST" ]; then
+    # The container user has no access to the GPU character devices by default.
+    sudo chmod 666 /dev/kfd
+    sudo chmod 666 /dev/dri/*
+fi
+
 python3 -m pip install dist/*.whl
 if [ -z "$GPU_TEST" ]; then
     python3 -m pip install -r requirements_test.txt
@@ -103,6 +109,9 @@ if [ -z "$GPU_TEST" ]; then
         fi
         python3 tests/run_tests.py -vr2 -t4 -k "not paddle" -a "$TI_WANTED_ARCHS"
     fi
+elif [ -n "$AMDGPU_TEST" ]; then
+    run-it cpu $(nproc)
+    # run-it amdgpu 4
 else
     run-it cuda 8
     run-it cpu $(nproc)
diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
index 848ec4603649d..a107f7696d96e 100644
--- a/.github/workflows/testing.yml
+++ b/.github/workflows/testing.yml
@@ -268,6 +268,74 @@ jobs:
           path: taichi-release-tests/bad-compare/*
           retention-days: 7
 
+  build_and_test_amdgpu_linux:
+    name: Build and Test (AMDGPU)
+    needs: check_files
+    timeout-minutes: ${{ github.event.schedule != '0 18 * * *' && 90 || 120 }}
+
+    runs-on: [self-hosted, amdgpu]
+
+
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: 'recursive'
+          fetch-depth: '0'
+
+      - name: Prepare Environment
+        run: |
+          . .github/workflows/scripts/common-utils.sh
+          prepare-build-cache
+          echo CI_DOCKER_RUN_EXTRA_ARGS="-v $(pwd):/home/dev/taichi" >> $GITHUB_ENV
+
+      - name: Build & Install
+        run: |
+          [[ ${{needs.check_files.outputs.run_job}} == false ]] && exit 0
+          . .github/workflows/scripts/common-utils.sh
+
+          ci-docker-run-amdgpu --name taichi-build \
+            registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.3 \
+            /home/dev/taichi/.github/workflows/scripts/unix-build.sh
+
+        env:
+          PY: py38
+          PROJECT_NAME: taichi
+          TAICHI_CMAKE_ARGS: >-
+            -DTI_WITH_VULKAN:BOOL=OFF
+            -DTI_BUILD_TESTS:BOOL=ON
+            -DTI_WITH_CUDA:BOOL=OFF
+
+      - name: Test
+        id: test
+        run: |
+          [[ ${{needs.check_files.outputs.run_job}} == false ]] && exit 0
+          . .github/workflows/scripts/common-utils.sh
+
+          ci-docker-run-amdgpu --name taichi-test \
+            registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.3 \
+            /home/dev/taichi/.github/workflows/scripts/unix_test.sh
+        env:
+          PY: py38
+          TI_WANTED_ARCHS: 'cpu,amdgpu'
+          TI_DEVICE_MEMORY_GB: '1'
+          TI_RUN_RELEASE_TESTS: '0'
+
+      - name: Save wheel if test failed
+        if: failure() && steps.test.conclusion == 'failure'
+        uses: actions/upload-artifact@v3
+        with:
+          name: broken-wheel
+          path: dist/*
+          retention-days: 7
+
+      - name: Save Bad Captures
+        if: failure() && steps.test.conclusion == 'failure'
+        uses: actions/upload-artifact@v3
+        with:
+          name: bad-captures
+          path: taichi-release-tests/bad-compare/*
+          retention-days: 7
+
   build_and_test_windows:
     name: Build and Test Windows
 
diff --git a/ci/Dockerfile.ubuntu.18.04.amdgpu b/ci/Dockerfile.ubuntu.18.04.amdgpu
index 64aa4d29e53a6..c3eaf51a2371d 100644
--- a/ci/Dockerfile.ubuntu.18.04.amdgpu
+++ b/ci/Dockerfile.ubuntu.18.04.amdgpu
@@ -47,7 +47,7 @@ RUN apt-get update && \
 # Install LLVM 15
 WORKDIR /
 # Make sure this URL gets updated each time there is a new prebuilt bin release
-RUN wget https://github.com/GaleSeLee/assets/releases/download/v0.0.1/taichi-llvm-15.0.0-linux.zip
+RUN wget https://github.com/GaleSeLee/assets/releases/download/v0.0.2/taichi-llvm-15.0.0-linux.zip
 RUN unzip taichi-llvm-15.0.0-linux.zip && \
     rm taichi-llvm-15.0.0-linux.zip
 ENV PATH="/taichi-llvm-15.0.0-linux/bin:$PATH"
@@ -57,9 +57,7 @@ ENV CXX="clang++-10"
 
 # Create non-root user for running the container
 RUN useradd -m -s /bin/bash dev && \
-    usermod -a -G video dev && \
-    chmod 666 /dev/kfd && \
-    chmod 666 /dev/dri/*
+    usermod -a -G video dev
 WORKDIR /home/dev
 USER dev
 
diff --git a/ci/Dockerfile.ubuntu.20.04.amdgpu b/ci/Dockerfile.ubuntu.20.04.amdgpu
index 586e7ab9add8b..f8d58dc117107 100644
--- a/ci/Dockerfile.ubuntu.20.04.amdgpu
+++ b/ci/Dockerfile.ubuntu.20.04.amdgpu
@@ -1,3 +1,6 @@
+# clang++-10 -> clang++ etc
+# assets llvm v0.0.1 -> v0.0.2
+# apt install lld-10
 # Taichi Dockerfile for development
 FROM rocm/dev-ubuntu-20.04:5.2
 
@@ -38,7 +41,7 @@ RUN apt-get update && \
 # Install LLVM 15
 WORKDIR /
 # Make sure this URL gets updated each time there is a new prebuilt bin release
-RUN wget https://github.com/GaleSeLee/assets/releases/download/v0.0.1/taichi-llvm-15.0.0-linux.zip
+RUN wget https://github.com/GaleSeLee/assets/releases/download/v0.0.2/taichi-llvm-15.0.0-linux.zip
 RUN unzip taichi-llvm-15.0.0-linux.zip && \
     rm taichi-llvm-15.0.0-linux.zip
 ENV PATH="/taichi-llvm-15.0.0-linux/bin:$PATH"