From 2a60142ac70071e919617156f814eda3d209775f Mon Sep 17 00:00:00 2001 From: Zeyu Li Date: Fri, 30 Dec 2022 20:52:31 +0800 Subject: [PATCH] [amdgpu] Part0 add render hardware interface (#6464) Issue: https://github.com/taichi-dev/taichi/issues/6434 ### Brief Summary This patch adds the AMDGPU render hardware interface (RHI) in four parts (`driver`, `context`, `device`, and `caching_allocator`). The code is similar to `cuda/rhi`. However, there are still some differences between `amdgpu/rhi` and `cuda/rhi`: #### context 1. The method of obtaining the hardware version 2. `Context::launch` #### driver 1. ROCm/HIP internal functions #### cmake The current CMake build system is sufficient to support the unit tests in https://github.com/taichi-dev/taichi/pull/6597 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .github/workflows/testing.yml | 5 +- CMakeLists.txt | 4 + cmake/TaichiCore.cmake | 25 ++++ taichi/rhi/amdgpu/CMakeLists.txt | 21 +++ .../rhi/amdgpu/amdgpu_caching_allocator.cpp | 40 ++++++ taichi/rhi/amdgpu/amdgpu_caching_allocator.h | 28 ++++ taichi/rhi/amdgpu/amdgpu_context.cpp | 93 ++++++++++++ taichi/rhi/amdgpu/amdgpu_context.h | 99 +++++++++++++ taichi/rhi/amdgpu/amdgpu_device.cpp | 134 ++++++++++++++++++ taichi/rhi/amdgpu/amdgpu_device.h | 134 ++++++++++++++++++ taichi/rhi/amdgpu/amdgpu_driver.cpp | 82 +++++++++++ taichi/rhi/amdgpu/amdgpu_driver.h | 119 ++++++++++++++++ .../rhi/amdgpu/amdgpu_driver_functions.inc.h | 127 +++++++++++++++++ taichi/runtime/llvm/CMakeLists.txt | 5 + taichi/runtime/llvm/llvm_runtime_executor.cpp | 8 +- 15 files changed, 921 insertions(+), 3 deletions(-) create mode 100644 taichi/rhi/amdgpu/CMakeLists.txt create mode 100644 taichi/rhi/amdgpu/amdgpu_caching_allocator.cpp create mode 100644 taichi/rhi/amdgpu/amdgpu_caching_allocator.h create mode 100644 taichi/rhi/amdgpu/amdgpu_context.cpp create mode 100644 taichi/rhi/amdgpu/amdgpu_context.h create mode 100644 taichi/rhi/amdgpu/amdgpu_device.cpp create mode 100644 
taichi/rhi/amdgpu/amdgpu_device.h create mode 100644 taichi/rhi/amdgpu/amdgpu_driver.cpp create mode 100644 taichi/rhi/amdgpu/amdgpu_driver.h create mode 100644 taichi/rhi/amdgpu/amdgpu_driver_functions.inc.h diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 9500bf2544bbf..90a482932247c 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -291,7 +291,7 @@ jobs: . .github/workflows/scripts/common-utils.sh ci-docker-run-amdgpu --name taichi-build \ - registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.3 \ + registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.5 \ /home/dev/taichi/.github/workflows/scripts/build.py env: @@ -302,6 +302,7 @@ jobs: -DTI_WITH_VULKAN:BOOL=OFF -DTI_WITH_OPENGL:BOOL=OFF -DTI_BUILD_TESTS:BOOL=ON + -DTI_WITH_AMDGPU:BOOL=ON - name: Test id: test @@ -310,7 +311,7 @@ jobs: . .github/workflows/scripts/common-utils.sh ci-docker-run-amdgpu --name taichi-test \ - registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.3 \ + registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.5 \ /home/dev/taichi/.github/workflows/scripts/unix_test.sh env: PY: '3.8' diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a220b8570364..822c604f4baa3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,6 +181,10 @@ if (TI_WITH_CUDA) set(CUDA_ARCH "cuda") endif() +if (TI_WITH_AMDGPU) + set(AMDGPU_ARCH "amdgpu") +endif() + if (TI_WITH_DX12) set(DX12_ARCH "dx12") endif() diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 241d8df91922a..5b715d4bd3def 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -3,6 +3,7 @@ option(TI_WITH_LLVM "Build with LLVM backends" ON) option(TI_WITH_METAL "Build with the Metal backend" ON) option(TI_WITH_CUDA "Build with the CUDA backend" ON) option(TI_WITH_CUDA_TOOLKIT "Build with the CUDA toolkit" OFF) +option(TI_WITH_AMDGPU "Build with the AMDGPU backend" OFF) option(TI_WITH_OPENGL "Build with the OpenGL backend" ON) 
option(TI_WITH_CC "Build with the C backend" ON) option(TI_WITH_VULKAN "Build with the Vulkan backend" OFF) @@ -34,6 +35,10 @@ if(ANDROID) set(TI_WITH_DX12 OFF) endif() +if (TI_WITH_AMDGPU AND TI_WITH_CUDA) + message(WARNING "Compiling CUDA and AMDGPU backends simultaneously") +endif() + if(UNIX AND NOT APPLE) # Handy helper for Linux # https://stackoverflow.com/a/32259072/12003165 @@ -53,6 +58,10 @@ if (APPLE) set(TI_WITH_CC OFF) message(WARNING "C backend not supported on OS X. Setting TI_WITH_CC to OFF.") endif() + if (TI_WITH_AMDGPU) + set(TI_WITH_AMDGPU OFF) + message(WARNING "AMDGPU backend not supported on OS X. Setting TI_WITH_AMDGPU to OFF.") + endif() endif() if (WIN32) @@ -60,6 +69,10 @@ if (WIN32) set(TI_WITH_CC OFF) message(WARNING "C backend not supported on Windows. Setting TI_WITH_CC to OFF.") endif() + if (TI_WITH_AMDGPU) + set(TI_WITH_AMDGPU OFF) + message(WARNING "AMDGPU backend not supported on Windows. Setting TI_WITH_AMDGPU to OFF.") + endif() endif() if(TI_WITH_VULKAN) @@ -108,6 +121,12 @@ if (TI_WITH_CUDA) list(APPEND TAICHI_CORE_SOURCE ${TAICHI_CUDA_RUNTIME_SOURCE}) endif() +if (TI_WITH_AMDGPU) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_AMDGPU") +# file(GLOB TAICHI_AMDGPU_RUNTIME_SOURCE "taichi/runtime/amdgpu/runtime.cpp") + list(APPEND TAICHI_CORE_SOURCE ${TAICHI_AMDGPU_RUNTIME_SOURCE}) +endif() + if (TI_WITH_DX12) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_DX12") endif() @@ -215,6 +234,12 @@ if(TI_WITH_LLVM) target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE cuda_rhi) endif() + if (TI_WITH_AMDGPU) + llvm_map_components_to_libnames(llvm_amdgpu_libs AMDGPU) + add_subdirectory(taichi/rhi/amdgpu) + target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE amdgpu_rhi) + endif() + if (TI_WITH_DX12) llvm_map_components_to_libnames(llvm_directx_libs DirectX) diff --git a/taichi/rhi/amdgpu/CMakeLists.txt b/taichi/rhi/amdgpu/CMakeLists.txt new file mode 100644 index 0000000000000..8c6e42417bb13 --- /dev/null +++ 
b/taichi/rhi/amdgpu/CMakeLists.txt @@ -0,0 +1,21 @@ +# ./taichi/rhi/amdgpu/CMakeLists.txt + +set(AMDGPU_RHI amdgpu_rhi) +add_library(${AMDGPU_RHI}) +target_sources(${AMDGPU_RHI} + PRIVATE + amdgpu_device.cpp + amdgpu_caching_allocator.cpp + amdgpu_context.cpp + amdgpu_driver.cpp + ) + +target_include_directories(${AMDGPU_RHI} + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/external/eigen + ${PROJECT_SOURCE_DIR}/external/spdlog/include + ${LLVM_INCLUDE_DIRS} + ) + +target_link_libraries(${AMDGPU_RHI} PRIVATE interop_rhi) diff --git a/taichi/rhi/amdgpu/amdgpu_caching_allocator.cpp b/taichi/rhi/amdgpu/amdgpu_caching_allocator.cpp new file mode 100644 index 0000000000000..4e6418e96518e --- /dev/null +++ b/taichi/rhi/amdgpu/amdgpu_caching_allocator.cpp @@ -0,0 +1,40 @@ +#include "taichi/rhi/amdgpu/amdgpu_caching_allocator.h" + +namespace taichi { +namespace lang { +namespace amdgpu { + +AmdgpuCachingAllocator::AmdgpuCachingAllocator(LlvmDevice *device) + : device_(device) { +} + +uint64_t *AmdgpuCachingAllocator::allocate( + const LlvmDevice::LlvmRuntimeAllocParams &params) { + uint64_t *ret{nullptr}; + auto size_aligned = taichi::iroundup(params.size, taichi_page_size); + auto it_blk = mem_blocks_.lower_bound(size_aligned); + + if (it_blk != mem_blocks_.end()) { + size_t remaining_sz = it_blk->first - size_aligned; + if (remaining_sz > 0) { + TI_ASSERT(remaining_sz % taichi_page_size == 0); + auto remaining_head = + reinterpret_cast<uint8_t *>(it_blk->second) + size_aligned; + mem_blocks_.insert( + {remaining_sz, reinterpret_cast<uint64_t *>(remaining_head)}); + } + ret = it_blk->second; + mem_blocks_.erase(it_blk); + } else { + ret = device_->allocate_llvm_runtime_memory_jit(params); + } + return ret; +} + +void AmdgpuCachingAllocator::release(size_t sz, uint64_t *ptr) { + mem_blocks_.insert({sz, ptr}); +} + +} // namespace amdgpu +} // namespace lang +} // namespace taichi diff --git a/taichi/rhi/amdgpu/amdgpu_caching_allocator.h b/taichi/rhi/amdgpu/amdgpu_caching_allocator.h new 
file mode 100644 index 0000000000000..bebcefdf16324 --- /dev/null +++ b/taichi/rhi/amdgpu/amdgpu_caching_allocator.h @@ -0,0 +1,28 @@ +#pragma once + +#include "taichi/common/core.h" +#include "taichi/math/arithmetic.h" +#include "taichi/rhi/llvm/llvm_device.h" +#include "taichi/inc/constants.h" +#include <stdint.h> +#include <map> + +namespace taichi { +namespace lang { +namespace amdgpu { + +class AmdgpuCachingAllocator { + public: + AmdgpuCachingAllocator(LlvmDevice *device); + + uint64_t *allocate(const LlvmDevice::LlvmRuntimeAllocParams &params); + void release(size_t sz, uint64_t *ptr); + + private: + std::multimap<size_t, uint64_t *> mem_blocks_; + LlvmDevice *device_{nullptr}; +}; + +} // namespace amdgpu +} // namespace lang +} // namespace taichi diff --git a/taichi/rhi/amdgpu/amdgpu_context.cpp b/taichi/rhi/amdgpu/amdgpu_context.cpp new file mode 100644 index 0000000000000..04fb173a0e73b --- /dev/null +++ b/taichi/rhi/amdgpu/amdgpu_context.cpp @@ -0,0 +1,93 @@ +#define TI_RUNTIME_HOST +#include "amdgpu_context.h" + +#include <unordered_map> +#include <mutex> + +#include "taichi/util/lang_util.h" +#include "taichi/program/program.h" +#include "taichi/system/threading.h" +#include "taichi/rhi/amdgpu/amdgpu_driver.h" +#include "taichi/analysis/offline_cache_util.h" + +namespace taichi { +namespace lang { + +AMDGPUContext::AMDGPUContext() + : driver_(AMDGPUDriver::get_instance_without_context()) { + dev_count_ = 0; + driver_.init(0); + driver_.device_get_count(&dev_count_); + driver_.device_get(&device_, 0); + + char name[128]; + driver_.device_get_name(name, 128, device_); + + TI_TRACE("Using AMDGPU device [id=0]: {}", name); + + driver_.context_create(&context_, 0, device_); + + const auto GB = std::pow(1024.0, 3.0); + TI_TRACE("Total memory {:.2f} GB; free memory {:.2f} GB", + get_total_memory() / GB, get_free_memory() / GB); + + void *hip_device_prop = std::malloc(HIP_DEVICE_PROPERTIES_STRUCT_SIZE); + driver_.device_get_prop(hip_device_prop, device_); + compute_capability_ = *((int *)hip_device_prop + 
HIP_DEVICE_GCN_ARCH); + std::free(hip_device_prop); + + mcpu_ = fmt::format("gfx{}", compute_capability_); + + TI_TRACE("Emitting AMDGPU code for {}", mcpu_); +} + +std::size_t AMDGPUContext::get_total_memory() { + std::size_t ret, _; + driver_.mem_get_info(&_, &ret); + return ret; +} + +std::size_t AMDGPUContext::get_free_memory() { + std::size_t ret, _; + driver_.mem_get_info(&ret, &_); + return ret; +} + +std::string AMDGPUContext::get_device_name() { + constexpr uint32_t kMaxNameStringLength = 128; + char name[kMaxNameStringLength]; + driver_.device_get_name(name, kMaxNameStringLength /*=128*/, device_); + std::string str(name); + return str; +} + +void AMDGPUContext::launch(void *func, + const std::string &task_name, + void *arg_pointers, + unsigned grid_dim, + unsigned block_dim, + std::size_t dynamic_shared_mem_bytes, + int arg_bytes) { + if (grid_dim > 0) { + std::lock_guard<std::mutex> _(lock_); + void *config[] = {(void *)0x01, const_cast<void *>(arg_pointers), + (void *)0x02, &arg_bytes, (void *)0x03}; + driver_.launch_kernel(func, grid_dim, 1, 1, block_dim, 1, 1, + dynamic_shared_mem_bytes, nullptr, nullptr, + reinterpret_cast<void **>(&config)); + } + if (debug_) { + driver_.stream_synchronize(nullptr); + } +} + +AMDGPUContext::~AMDGPUContext() { +} + +AMDGPUContext &AMDGPUContext::get_instance() { + static auto context = new AMDGPUContext(); + return *context; +} + +} // namespace lang +} // namespace taichi diff --git a/taichi/rhi/amdgpu/amdgpu_context.h b/taichi/rhi/amdgpu/amdgpu_context.h new file mode 100644 index 0000000000000..7e182e07ea3d7 --- /dev/null +++ b/taichi/rhi/amdgpu/amdgpu_context.h @@ -0,0 +1,99 @@ +#pragma once + +#include <mutex> +#include <unordered_map> +#include <thread> + +#include "taichi/program/kernel_profiler.h" +#include "taichi/rhi/amdgpu/amdgpu_driver.h" + +namespace taichi { +namespace lang { + +class AMDGPUDriver; + +class AMDGPUContext { + private: + void *device_; + void *context_; + int dev_count_; + int compute_capability_; + std::string mcpu_; + std::mutex lock_; + 
AMDGPUDriver &driver_; + bool debug_; + + public: + AMDGPUContext(); + + std::size_t get_total_memory(); + std::size_t get_free_memory(); + std::string get_device_name(); + + bool detected() const { + return dev_count_ != 0; + } + + void launch(void *func, + const std::string &task_name, + void *arg_pointers, + unsigned grid_dim, + unsigned block_dim, + std::size_t dynamic_shared_mem_bytes, + int arg_bytes); + + void set_debug(bool debug) { + debug_ = debug; + } + + std::string get_mcpu() const { + return mcpu_; + } + + void *get_context() { + return context_; + } + + void make_current() { + driver_.context_set_current(context_); + } + + int get_compute_capability() const { + return compute_capability_; + } + + ~AMDGPUContext(); + + class ContextGuard { + private: + void *old_ctx_; + void *new_ctx_; + + public: + ContextGuard(AMDGPUContext *new_ctx) + : old_ctx_(nullptr), new_ctx_(new_ctx) { + AMDGPUDriver::get_instance().context_get_current(&old_ctx_); + if (old_ctx_ != new_ctx) + new_ctx->make_current(); + } + + ~ContextGuard() { + if (old_ctx_ != new_ctx_) { + AMDGPUDriver::get_instance().context_set_current(old_ctx_); + } + } + }; + + ContextGuard get_guard() { + return ContextGuard(this); + } + + std::unique_lock<std::mutex> get_lock_guard() { + return std::unique_lock<std::mutex>(lock_); + } + + static AMDGPUContext &get_instance(); +}; + +} // namespace lang +} // namespace taichi diff --git a/taichi/rhi/amdgpu/amdgpu_device.cpp b/taichi/rhi/amdgpu/amdgpu_device.cpp new file mode 100644 index 0000000000000..75df5cdd5a598 --- /dev/null +++ b/taichi/rhi/amdgpu/amdgpu_device.cpp @@ -0,0 +1,134 @@ +#include "taichi/rhi/amdgpu/amdgpu_device.h" + +namespace taichi { +namespace lang { + +namespace amdgpu { + +AmdgpuDevice::AllocInfo AmdgpuDevice::get_alloc_info( + const DeviceAllocation handle) { + validate_device_alloc(handle); + return allocations_[handle.alloc_id]; +} + +DeviceAllocation AmdgpuDevice::allocate_memory(const AllocParams &params) { + AllocInfo info; + + if (params.host_read 
|| params.host_write) { + AMDGPUDriver::get_instance().malloc_managed(&info.ptr, params.size, + HIP_MEM_ATTACH_GLOBAL); + } else { + AMDGPUDriver::get_instance().malloc(&info.ptr, params.size); + } + + info.size = params.size; + info.is_imported = false; + info.use_cached = false; + info.use_preallocated = false; + + DeviceAllocation alloc; + alloc.alloc_id = allocations_.size(); + alloc.device = this; + + allocations_.push_back(info); + return alloc; +} + +DeviceAllocation AmdgpuDevice::allocate_memory_runtime( + const LlvmRuntimeAllocParams &params) { + AllocInfo info; + info.size = taichi::iroundup(params.size, taichi_page_size); + if (params.host_read || params.host_write) { + TI_NOT_IMPLEMENTED + } else if (params.use_cached) { + if (caching_allocator_ == nullptr) { + caching_allocator_ = std::make_unique<AmdgpuCachingAllocator>(this); + } + info.ptr = caching_allocator_->allocate(params); + AMDGPUDriver::get_instance().memset((void *)info.ptr, 0, info.size); + } else { + info.ptr = allocate_llvm_runtime_memory_jit(params); + } + info.is_imported = false; + info.use_cached = params.use_cached; + info.use_preallocated = true; + + DeviceAllocation alloc; + alloc.alloc_id = allocations_.size(); + alloc.device = this; + + allocations_.push_back(info); + return alloc; +} + +void AmdgpuDevice::dealloc_memory(DeviceAllocation handle) { + validate_device_alloc(handle); + AllocInfo &info = allocations_[handle.alloc_id]; + if (info.ptr == nullptr) { + TI_ERROR("the DeviceAllocation is already deallocated"); + } + TI_ASSERT(!info.is_imported); + if (info.use_cached) { + if (caching_allocator_ == nullptr) { + TI_ERROR("the AmdgpuCachingAllocator is not initialized"); + } + caching_allocator_->release(info.size, (uint64_t *)info.ptr); + } else if (!info.use_preallocated) { + AMDGPUDriver::get_instance().mem_free(info.ptr); + info.ptr = nullptr; + } +} + +RhiResult AmdgpuDevice::map(DeviceAllocation alloc, void **mapped_ptr) { + AllocInfo &info = allocations_[alloc.alloc_id]; + size_t size = 
info.size; + info.mapped = new char[size]; + // FIXME: there should be a better way to do this... + AMDGPUDriver::get_instance().memcpy_device_to_host(info.mapped, info.ptr, + size); + *mapped_ptr = info.mapped; + return RhiResult::success; +} + +void AmdgpuDevice::unmap(DeviceAllocation alloc) { + AllocInfo &info = allocations_[alloc.alloc_id]; + AMDGPUDriver::get_instance().memcpy_host_to_device(info.ptr, info.mapped, + info.size); + delete[] static_cast<char *>(info.mapped); + return; +} + +void AmdgpuDevice::memcpy_internal(DevicePtr dst, + DevicePtr src, + uint64_t size) { + void *dst_ptr = + static_cast<char *>(allocations_[dst.alloc_id].ptr) + dst.offset; + void *src_ptr = + static_cast<char *>(allocations_[src.alloc_id].ptr) + src.offset; + AMDGPUDriver::get_instance().memcpy_device_to_device(dst_ptr, src_ptr, size); +} + +DeviceAllocation AmdgpuDevice::import_memory(void *ptr, size_t size) { + AllocInfo info; + info.ptr = ptr; + info.size = size; + info.is_imported = true; + + DeviceAllocation alloc; + alloc.alloc_id = allocations_.size(); + alloc.device = this; + + allocations_.push_back(info); + return alloc; +} + +uint64 AmdgpuDevice::fetch_result_uint64(int i, uint64 *result_buffer) { + AMDGPUDriver::get_instance().stream_synchronize(nullptr); + uint64 ret; + AMDGPUDriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i, + sizeof(uint64)); + return ret; +} +} // namespace amdgpu +} // namespace lang +} // namespace taichi diff --git a/taichi/rhi/amdgpu/amdgpu_device.h b/taichi/rhi/amdgpu/amdgpu_device.h new file mode 100644 index 0000000000000..45edd539e87dd --- /dev/null +++ b/taichi/rhi/amdgpu/amdgpu_device.h @@ -0,0 +1,134 @@ +#pragma once +#include <vector> +#include <set> + +#include "taichi/common/core.h" +#include "taichi/rhi/amdgpu/amdgpu_driver.h" +#include "taichi/rhi/amdgpu/amdgpu_caching_allocator.h" +#include "taichi/rhi/amdgpu/amdgpu_context.h" +#include "taichi/rhi/llvm/llvm_device.h" + +namespace taichi { +namespace lang { +namespace amdgpu { + +class 
AmdgpuResourceBinder : public ResourceBinder { + public: + ~AmdgpuResourceBinder() override { + } + + void rw_buffer(uint32_t set, + uint32_t binding, + DevicePtr ptr, + size_t size) override{TI_NOT_IMPLEMENTED}; + void rw_buffer(uint32_t set, + uint32_t binding, + DeviceAllocation alloc) override{TI_NOT_IMPLEMENTED}; + + void buffer(uint32_t set, + uint32_t binding, + DevicePtr ptr, + size_t size) override{TI_NOT_IMPLEMENTED}; + void buffer(uint32_t set, uint32_t binding, DeviceAllocation alloc) override{ + TI_NOT_IMPLEMENTED}; +}; + +class AmdgpuPipeline : public Pipeline { + public: + ~AmdgpuPipeline() override { + } + + ResourceBinder *resource_binder() override{TI_NOT_IMPLEMENTED}; +}; + +class AmdgpuCommandList : public CommandList { + public: + ~AmdgpuCommandList() override { + } + + void bind_pipeline(Pipeline *p) override{TI_NOT_IMPLEMENTED}; + void bind_resources(ResourceBinder *binder) override{TI_NOT_IMPLEMENTED}; + void buffer_barrier(DevicePtr ptr, size_t size) override{TI_NOT_IMPLEMENTED}; + void buffer_barrier(DeviceAllocation alloc) override{TI_NOT_IMPLEMENTED}; + void memory_barrier() override{TI_NOT_IMPLEMENTED}; + void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) override{ + TI_NOT_IMPLEMENTED}; + void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) override{ + TI_NOT_IMPLEMENTED}; + void dispatch(uint32_t x, uint32_t y = 1, uint32_t z = 1) override{ + TI_NOT_IMPLEMENTED}; +}; + +class AmdgpuStream : public Stream { + public: + ~AmdgpuStream() override{}; + + std::unique_ptr<CommandList> new_command_list() override{TI_NOT_IMPLEMENTED}; + StreamSemaphore submit(CommandList *cmdlist, + const std::vector<StreamSemaphore> &wait_semaphores = + {}) override{TI_NOT_IMPLEMENTED}; + StreamSemaphore submit_synced( + CommandList *cmdlist, + const std::vector<StreamSemaphore> &wait_semaphores = {}) override{ + TI_NOT_IMPLEMENTED}; + + void command_sync() override{TI_NOT_IMPLEMENTED}; +}; + +class AmdgpuDevice : public LlvmDevice { + public: + struct AllocInfo { + void *ptr{nullptr}; + 
size_t size{0}; + bool is_imported{false}; + bool use_preallocated{true}; + bool use_cached{false}; + void *mapped{nullptr}; + }; + + AllocInfo get_alloc_info(const DeviceAllocation handle); + + ~AmdgpuDevice() override{}; + + DeviceAllocation allocate_memory(const AllocParams &params) override; + DeviceAllocation allocate_memory_runtime( + const LlvmRuntimeAllocParams &params) override; + void dealloc_memory(DeviceAllocation handle) override; + + std::unique_ptr<Pipeline> create_pipeline( + const PipelineSourceDesc &src, + std::string name = "Pipeline") override{TI_NOT_IMPLEMENTED}; + + uint64 fetch_result_uint64(int i, uint64 *result_buffer) override; + + RhiResult map_range(DevicePtr ptr, uint64_t size, void **mapped_ptr) final { + TI_NOT_IMPLEMENTED; + } + RhiResult map(DeviceAllocation alloc, void **mapped_ptr) final; + + void unmap(DevicePtr ptr) override{TI_NOT_IMPLEMENTED}; + void unmap(DeviceAllocation alloc) override; + + void memcpy_internal(DevicePtr dst, DevicePtr src, uint64_t size) override; + + DeviceAllocation import_memory(void *ptr, size_t size); + + Stream *get_compute_stream() override{TI_NOT_IMPLEMENTED}; + + void wait_idle() override{TI_NOT_IMPLEMENTED}; + + private: + std::vector<AllocInfo> allocations_; + void validate_device_alloc(const DeviceAllocation alloc) { + if (allocations_.size() <= alloc.alloc_id) { + TI_ERROR("invalid DeviceAllocation"); + } + } + std::unique_ptr<AmdgpuCachingAllocator> caching_allocator_{nullptr}; +}; + +} // namespace amdgpu + +} // namespace lang + +} // namespace taichi diff --git a/taichi/rhi/amdgpu/amdgpu_driver.cpp b/taichi/rhi/amdgpu/amdgpu_driver.cpp new file mode 100644 index 0000000000000..ee47a481cea74 --- /dev/null +++ b/taichi/rhi/amdgpu/amdgpu_driver.cpp @@ -0,0 +1,82 @@ +#include "taichi/rhi/amdgpu/amdgpu_driver.h" + +#include "taichi/system/dynamic_loader.h" +#include "taichi/rhi/amdgpu/amdgpu_context.h" +#include "taichi/util/environ_config.h" + +namespace taichi { +namespace lang { + +std::string get_amdgpu_error_message(uint32 err) { + auto 
err_name_ptr = + AMDGPUDriver::get_instance_without_context().get_error_name(err); + auto err_string_ptr = + AMDGPUDriver::get_instance_without_context().get_error_string(err); + return fmt::format("AMDGPU Error {}: {}", err_name_ptr, err_string_ptr); +} + +AMDGPUDriverBase::AMDGPUDriverBase() { + disabled_by_env_ = (get_environ_config("TI_ENABLE_AMDGPU", 1) == 0); + if (disabled_by_env_) { + TI_TRACE( + "AMDGPU driver disabled by environment variable \"TI_ENABLE_AMDGPU\"."); + } +} + +bool AMDGPUDriverBase::load_lib(std::string lib_linux) { +#if defined(TI_PLATFORM_LINUX) + auto lib_name = lib_linux; +#else + static_assert(false, "Taichi AMDGPU driver supports only Linux."); +#endif + + loader_ = std::make_unique<DynamicLoader>(lib_name); + if (!loader_->loaded()) { + TI_WARN("{} lib not found.", lib_name); + return false; + } else { + TI_TRACE("{} loaded!", lib_name); + return true; + } +} + +bool AMDGPUDriver::detected() { + return !disabled_by_env_ && loader_->loaded(); +} + +AMDGPUDriver::AMDGPUDriver() { + if (!load_lib("libamdhip64.so")) + return; + + loader_->load_function("hipGetErrorName", get_error_name); + loader_->load_function("hipGetErrorString", get_error_string); + loader_->load_function("hipDriverGetVersion", driver_get_version); + + int version; + driver_get_version(&version); + TI_TRACE("AMDGPU driver API (v{}.{}) loaded.", version / 1000, + version % 1000 / 10); + +#define PER_AMDGPU_FUNCTION(name, symbol_name, ...) 
\ + name.set(loader_->load_function(#symbol_name)); \ + name.set_lock(&lock_); \ + name.set_names(#name, #symbol_name); +#include "taichi/rhi/amdgpu/amdgpu_driver_functions.inc.h" +#undef PER_AMDGPU_FUNCTION +} + +AMDGPUDriver &AMDGPUDriver::get_instance_without_context() { + // Thread safety guaranteed by C++ compiler + // Note this is never deleted until the process finishes + static AMDGPUDriver *instance = new AMDGPUDriver(); + return *instance; +} + +AMDGPUDriver &AMDGPUDriver::get_instance() { + // initialize the AMDGPU context so that the driver APIs can be called later + AMDGPUContext::get_instance(); + return get_instance_without_context(); +} + +} // namespace lang +} // namespace taichi diff --git a/taichi/rhi/amdgpu/amdgpu_driver.h b/taichi/rhi/amdgpu/amdgpu_driver.h new file mode 100644 index 0000000000000..85f8fc77d0030 --- /dev/null +++ b/taichi/rhi/amdgpu/amdgpu_driver.h @@ -0,0 +1,119 @@ +#pragma once + +#include <mutex> + +#include "taichi/system/dynamic_loader.h" + +namespace taichi { +namespace lang { + +constexpr uint32 HIP_EVENT_DEFAULT = 0x0; +constexpr uint32 HIP_STREAM_DEFAULT = 0x0; +constexpr uint32 HIP_STREAM_NON_BLOCKING = 0x1; +constexpr uint32 HIP_MEM_ATTACH_GLOBAL = 0x1; +constexpr uint32 HIP_MEM_ADVISE_SET_PREFERRED_LOCATION = 3; +constexpr uint32 HIP_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 26; +constexpr uint32 HIP_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 63; +constexpr uint32 HIP_DEVICE_PROPERTIES_STRUCT_SIZE = 792; +constexpr uint32 HIP_DEVICE_GCN_ARCH = 98; +constexpr uint32 HIP_ERROR_ASSERT = 710; +constexpr uint32 HIP_JIT_MAX_REGISTERS = 0; +constexpr uint32 HIP_POINTER_ATTRIBUTE_MEMORY_TYPE = 2; +constexpr uint32 HIP_SUCCESS = 0; +constexpr uint32 HIP_MEMORYTYPE_DEVICE = 1; + +std::string get_amdgpu_error_message(uint32 err); + +template <typename... Args> +class AMDGPUFunction { + public: + AMDGPUFunction() { + function_ = nullptr; + } + + void set(void *func_ptr) { + function_ = (func_type *)func_ptr; + } + + uint32 call(Args... 
args) { + TI_ASSERT(function_ != nullptr); + TI_ASSERT(driver_lock_ != nullptr); + std::lock_guard<std::mutex> _(*driver_lock_); + return (uint32)function_(args...); + } + + void set_names(const std::string &name, const std::string &symbol_name) { + name_ = name; + symbol_name_ = symbol_name; + } + + void set_lock(std::mutex *lock) { + driver_lock_ = lock; + } + + std::string get_error_message(uint32 err) { + return get_amdgpu_error_message(err) + + fmt::format(" while calling {} ({})", name_, symbol_name_); + } + + uint32 call_with_warning(Args... args) { + auto err = call(args...); + TI_WARN_IF(err, "{}", get_error_message(err)); + return err; + } + + void operator()(Args... args) { + auto err = call(args...); + TI_ERROR_IF(err, get_error_message(err)); + } + + private: + using func_type = uint32_t(Args...); + + func_type *function_{nullptr}; + std::string name_, symbol_name_; + std::mutex *driver_lock_{nullptr}; +}; + +class AMDGPUDriverBase { + public: + ~AMDGPUDriverBase() = default; + + protected: + std::unique_ptr<DynamicLoader> loader_; + AMDGPUDriverBase(); + + bool load_lib(std::string lib_linux); + + bool disabled_by_env_{false}; +}; + +class AMDGPUDriver : protected AMDGPUDriverBase { + public: +#define PER_AMDGPU_FUNCTION(name, symbol_name, ...) 
\ + AMDGPUFunction<__VA_ARGS__> name; +#include "taichi/rhi/amdgpu/amdgpu_driver_functions.inc.h" +#undef PER_AMDGPU_FUNCTION + + const char *(*get_error_name)(uint32); + + const char *(*get_error_string)(uint32); + + void (*driver_get_version)(int *); + + bool detected(); + + static AMDGPUDriver &get_instance(); + + static AMDGPUDriver &get_instance_without_context(); + + private: + AMDGPUDriver(); + + std::mutex lock_; + + // bool rocm_version_valid_{false}; +}; + +} // namespace lang +} // namespace taichi diff --git a/taichi/rhi/amdgpu/amdgpu_driver_functions.inc.h b/taichi/rhi/amdgpu/amdgpu_driver_functions.inc.h new file mode 100644 index 0000000000000..a4f0e6fdea1c3 --- /dev/null +++ b/taichi/rhi/amdgpu/amdgpu_driver_functions.inc.h @@ -0,0 +1,127 @@ +// Init +PER_AMDGPU_FUNCTION(init, hipInit, unsigned int); + +// Device management +PER_AMDGPU_FUNCTION(device_get_count, hipGetDeviceCount, int *); +PER_AMDGPU_FUNCTION(device_get_attribute, + hipDeviceGetAttribute, + int *, + uint32, + int); +PER_AMDGPU_FUNCTION(device_get_prop, hipGetDeviceProperties, void *, void *); +PER_AMDGPU_FUNCTION(device_get_name, hipDeviceGetName, char *, int, void *); +PER_AMDGPU_FUNCTION(device_get, hipDeviceGet, void *, void *); + +// Context management +PER_AMDGPU_FUNCTION(context_create, hipCtxCreate, void *, int, void *); +PER_AMDGPU_FUNCTION(context_set_current, hipCtxSetCurrent, void *); +PER_AMDGPU_FUNCTION(context_get_current, hipCtxGetCurrent, void **); + +// Stream management +PER_AMDGPU_FUNCTION(stream_create, hipStreamCreate, void **, uint32); + +// Memory management +PER_AMDGPU_FUNCTION(memcpy_host_to_device, + hipMemcpyHtoD, + void *, + void *, + std::size_t); +PER_AMDGPU_FUNCTION(memcpy_device_to_host, + hipMemcpyDtoH, + void *, + void *, + std::size_t); +PER_AMDGPU_FUNCTION(memcpy_device_to_device, + hipMemcpyDtoD, + void *, + void *, + std::size_t); +PER_AMDGPU_FUNCTION(memcpy, + hipMemcpy, + void *, + void *, + std::size_t, + unsigned int); +PER_AMDGPU_FUNCTION(memcpy_async, + 
hipMemcpyAsync, + void *, + void *, + std::size_t, + unsigned int, + void *); +PER_AMDGPU_FUNCTION(memcpy_host_to_device_async, + hipMemcpyHtoDAsync, + void *, + void *, + std::size_t, + void *); +PER_AMDGPU_FUNCTION(memcpy_device_to_host_async, + hipMemcpyDtoHAsync, + void *, + void *, + std::size_t, + void *); +PER_AMDGPU_FUNCTION(malloc, hipMalloc, void **, std::size_t); +PER_AMDGPU_FUNCTION(malloc_managed, + hipMallocManaged, + void **, + std::size_t, + uint32); +PER_AMDGPU_FUNCTION(memset, hipMemset, void *, uint8, std::size_t); +PER_AMDGPU_FUNCTION(mem_free, hipFree, void *); +PER_AMDGPU_FUNCTION(mem_get_info, hipMemGetInfo, std::size_t *, std::size_t *); +PER_AMDGPU_FUNCTION(mem_get_attribute, + hipPointerGetAttribute, + void *, + uint32, + void *); +PER_AMDGPU_FUNCTION(mem_get_attributes, + hipPointerGetAttributes, + void *, + void *); + +// Module and kernels +PER_AMDGPU_FUNCTION(module_get_function, + hipModuleGetFunction, + void **, + void *, + const char *); +PER_AMDGPU_FUNCTION(module_load_data, hipModuleLoadData, void **, const void *); +PER_AMDGPU_FUNCTION(launch_kernel, + hipModuleLaunchKernel, + void *, + uint32, + uint32, + uint32, + uint32, + uint32, + uint32, + uint32, + void *, + void **, + void **); +PER_AMDGPU_FUNCTION(kernel_get_attribute, + hipFuncGetAttribute, + int *, + uint32, + void *); +PER_AMDGPU_FUNCTION(kernel_get_occupancy, + hipOccupancyMaxActiveBlocksPerMultiprocessor, + int *, + void *, + int, + size_t); + +// Stream management +PER_AMDGPU_FUNCTION(stream_synchronize, hipStreamSynchronize, void *); + +// Event management +PER_AMDGPU_FUNCTION(event_create, hipEventCreateWithFlags, void **, uint32); +PER_AMDGPU_FUNCTION(event_destroy, hipEventDestroy, void *); +PER_AMDGPU_FUNCTION(event_record, hipEventRecord, void *, void *); +PER_AMDGPU_FUNCTION(event_synchronize, hipEventSynchronize, void *); +PER_AMDGPU_FUNCTION(event_elapsed_time, + hipEventElapsedTime, + float *, + void *, + void *); diff --git 
a/taichi/runtime/llvm/CMakeLists.txt b/taichi/runtime/llvm/CMakeLists.txt index bd8448d63f86a..2c39493415ea2 100644 --- a/taichi/runtime/llvm/CMakeLists.txt +++ b/taichi/runtime/llvm/CMakeLists.txt @@ -33,6 +33,11 @@ if (TI_WITH_CUDA) target_link_libraries(llvm_runtime PRIVATE cuda_rhi) endif() +if (TI_WITH_AMDGPU) + target_link_libraries(llvm_runtime PRIVATE ${llvm_amdgpu_libs}) + target_link_libraries(llvm_runtime PRIVATE amdgpu_rhi) +endif() + if (TI_WITH_DX12) target_link_libraries(llvm_runtime PRIVATE ${llvm_directx_libs}) target_link_libraries(llvm_runtime PRIVATE dx12_rhi) diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp index 15968f299f626..b137aba54f632 100644 --- a/taichi/runtime/llvm/llvm_runtime_executor.cpp +++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp @@ -6,11 +6,17 @@ #include "taichi/rhi/cuda/cuda_device.h" #include "taichi/platform/cuda/detect_cuda.h" #include "taichi/rhi/cuda/cuda_driver.h" +#include "taichi/rhi/amdgpu/amdgpu_driver.h" +#include "taichi/rhi/amdgpu/amdgpu_device.h" #if defined(TI_WITH_CUDA) #include "taichi/rhi/cuda/cuda_context.h" #endif +#if defined(TI_WITH_AMDGPU) +#include "taichi/rhi/amdgpu/amdgpu_context.h" +#endif + namespace taichi::lang { namespace { void assert_failed_host(const char *msg) { @@ -118,7 +124,7 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config, #if defined(TI_WITH_AMDGPU) if (config.arch == Arch::amdgpu) { AMDGPUContext::get_instance().set_debug(config.debug); - device_ = std::make_shared<amdgpu::AmdgpuDevice>(); + device_ = std::make_shared<amdgpu::AmdgpuDevice>(); this->maybe_initialize_amdgpu_llvm_context(); }