diff --git a/.github/workflows/ci-extended.yml b/.github/workflows/ci-extended.yml
index bfbba99ed1ca..8ca646cfc2eb 100644
--- a/.github/workflows/ci-extended.yml
+++ b/.github/workflows/ci-extended.yml
@@ -21,6 +21,8 @@ env:
   CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build
   MACHINE_CFG: cmake/machinecfg/CI.cmake
   OMPI_MCA_mpi_common_cuda_event_max: 1000
+  # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
+  OMPI_MCA_btl_vader_single_copy_mechanism: none
 
 jobs:
   perf-and-regression:
@@ -121,3 +123,59 @@ jobs:
             example/advection/ascent_render_57.png
           retention-days: 3
 
+  perf-and-regression-amdgpu:
+    strategy:
+      matrix:
+        parallel: ['serial', 'mpi']
+    runs-on: [self-hosted, navi1030]
+    container:
+      image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5
+      # Map to local user id on CI machine to allow writing to build cache and
+      # forward device handles to access AMD GPU within container
+      options: --user 1000 -w /home/ci --device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined
+    env:
+      CMAKE_GENERATOR: Ninja
+      CMAKE_BUILD_PARALLEL_LEVEL: 8 # num threads for build
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: 'true'
+
+      - name: Setup cache for gold standard
+        uses: actions/cache@v3
+        with:
+          path: tst/regression/gold_standard/
+          key: gold-standard
+
+      - name: Configure
+        run: |
+          cmake -B build \
+            -DMACHINE_CFG=${PWD}/cmake/machinecfg/GitHubActions.cmake \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DMACHINE_VARIANT=hip-${{ matrix.parallel }} \
+            -DCMAKE_CXX_COMPILER=hipcc
+
+      - name: Build
+        run: cmake --build build
+
+      # run performance "unit" tests (none use MPI)
+      - name: Performance tests
+        if: ${{ matrix.parallel == 'serial' }}
+        run: |
+          cd build
+          ctest -L performance -LE perf-reg
+
+      # run regression tests
+      - name: Regression tests
+        run: |
+          cd build
+          ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: log-and-convergence-${{ matrix.parallel }}
+          path: |
+            build/CMakeFiles/CMakeOutput.log
+            build/tst/regression/outputs/advection_convergence*/advection-errors.dat
+            build/tst/regression/outputs/advection_convergence*/advection-errors.png
+          retention-days: 3
diff --git a/.github/workflows/ci-short.yml b/.github/workflows/ci-short.yml
index adbb56287f6e..ecb4052411ee 100644
--- a/.github/workflows/ci-short.yml
+++ b/.github/workflows/ci-short.yml
@@ -13,6 +13,8 @@ env:
   CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build
   MACHINE_CFG: cmake/machinecfg/CI.cmake
   OMPI_MCA_mpi_common_cuda_event_max: 1000
+  # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
+  OMPI_MCA_btl_vader_single_copy_mechanism: none
 
 jobs:
   style:
@@ -130,3 +132,44 @@ jobs:
             build/profile.txt
           retention-days: 3
 
+  integration-amdgpu:
+    runs-on: [self-hosted, navi1030]
+    container:
+      image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5
+      # Map to local user id on CI machine to allow writing to build cache and
+      # forward device handles to access AMD GPU within container
+      options: --user 1000 -w /home/ci --device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined
+    env:
+      CMAKE_GENERATOR: Ninja
+      CMAKE_BUILD_PARALLEL_LEVEL: 8 # num threads for build
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: 'true'
+      - name: Configure
+        run: |
+          cmake -B build \
+            -DMACHINE_CFG=${PWD}/cmake/machinecfg/GitHubActions.cmake \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DMACHINE_VARIANT=hip-mpi \
+            -DCMAKE_CXX_COMPILER=hipcc
+      # Test example with "variables" and output
+      - name: advection
+        run: |
+          cmake --build build -t advection-example
+          cd build
+          ctest -R regression_mpi_test:output_hdf5
+      # Test example with swarms
+      - name: particle-leapfrog
+        run: |
+          cmake --build build -t particle-leapfrog
+          cd build
+          ctest -R regression_mpi_test:particle_leapfrog
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: configure-log-integration-amdgpu
+          path: |
+            build/CMakeFiles/CMakeOutput.log
+          retention-days: 3
+
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 976bf37d3454..3e391c6d746d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -57,6 +57,7 @@
 - [[PR 1031]](https://github.com/parthenon-hpc-lab/parthenon/pull/1031) Fix bug in non-cell centered AMR
 
 ### Infrastructure (changes irrelevant to downstream codes)
+- [[PR 1117]](https://github.com/parthenon-hpc-lab/parthenon/pull/1117) Enable CI pipelines on AMD GPUs with ROCM/HIP
 - [[PR 1114]](https://github.com/parthenon-hpc-lab/parthenon/pull/1114) Enable sanitizers for extended CI host build
 - [[PR 1123]](https://github.com/parthenon-hpc-lab/parthenon/pull/1123) Default initialize ProResInfo.dir
 - [[PR 1121]](https://github.com/parthenon-hpc-lab/parthenon/pull/1121) Default initialize BndInfo.dir
diff --git a/cmake/TestSetup.cmake b/cmake/TestSetup.cmake
index dd2f8b05ec4b..005756d3ade5 100644
--- a/cmake/TestSetup.cmake
+++ b/cmake/TestSetup.cmake
@@ -152,7 +152,7 @@ function(setup_test_parallel nproc dir arg extra_labels)
   list(APPEND labels "${extra_labels}")
 
   if(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_HIP)
-    set(PARTHENON_KOKKOS_TEST_ARGS "--kokkos-num-devices=${NUM_GPU_DEVICES_PER_NODE}")
+    set(PARTHENON_KOKKOS_TEST_ARGS "--kokkos-map-device-id-by=mpi_rank")
     list(APPEND labels "cuda")
   endif()
   if (Kokkos_ENABLE_OPENMP)
diff --git a/cmake/machinecfg/GitHubActions.cmake b/cmake/machinecfg/GitHubActions.cmake
index fc91643a1a74..663dcb38d682 100644
--- a/cmake/machinecfg/GitHubActions.cmake
+++ b/cmake/machinecfg/GitHubActions.cmake
@@ -29,9 +29,10 @@ if (${MACHINE_VARIANT} MATCHES "cuda")
     set(MACHINE_CXX_FLAGS "${MACHINE_CXX_FLAGS} -Wno-unknown-cuda-version")
   endif()
 elseif (${MACHINE_VARIANT} MATCHES "hip")
-  # using an arbitrary arch as GitHub Action runners don't have GPUs
-  set(Kokkos_ARCH_VEGA908 ON CACHE BOOL "GPU architecture")
+  # using an arch that matches Hamilton at Hamburg Obs
+  set(Kokkos_ARCH_NAVI1030 ON CACHE BOOL "GPU architecture")
   set(Kokkos_ENABLE_HIP ON CACHE BOOL "Enable HIP")
+  set(Kokkos_ARCH_ZEN3 ON CACHE BOOL "CPU architecture")
 else()
   set(MACHINE_CXX_FLAGS "${MACHINE_CXX_FLAGS} -fopenmp-simd")
 endif()
diff --git a/scripts/docker/Dockerfile.hip-rocm b/scripts/docker/Dockerfile.hip-rocm
index f586ade42104..5d9d5c765b6a 100644
--- a/scripts/docker/Dockerfile.hip-rocm
+++ b/scripts/docker/Dockerfile.hip-rocm
@@ -20,3 +20,6 @@ RUN cd /tmp && \
 ENV LDFLAGS="-lopen-pal"
 
 RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10
+
+# uid 1000 maps to the one running the container on the CI host
+RUN useradd --create-home --shell /bin/bash -u 1000 -G render ci
diff --git a/tst/regression/utils/test_case.py b/tst/regression/utils/test_case.py
index 12302568fbd5..7b09d00358d5 100644
--- a/tst/regression/utils/test_case.py
+++ b/tst/regression/utils/test_case.py
@@ -89,9 +89,7 @@ def __init__(self, run_test_path, **kwargs):
         try:
             parthenon_path = os.path.realpath(__file__)
             idx = parthenon_path.rindex("/parthenon/")
-            self.parameters.parthenon_path = os.path.join(
-                parthenon_path[:idx], "parthenon"
-            )
+            self.parameters.parthenon_path = parthenon_path[: idx + len("/parthenon")]
         except ValueError:
             baseDir = os.path.dirname(__file__)
             self.parameters.parthenon_path = os.path.abspath(baseDir + "/../../../")