Skip to content

Commit

Permalink
[amdgpu] Part0 add render hardware interface (taichi-dev#6464)
Browse files Browse the repository at this point in the history
Issue: #taichi-dev#6434

### Brief Summary
It contains four parts (`driver`, `context`, `device` and
`caching_allocator`). The code is similar to `cuda/rhi`. However, there
are still some differences between `amdgpu/rhi` and `cuda/rhi`:
#### context
1. The method of obtaining the hardware version
2. Context::launch
#### driver
1. ROCm/hip internal functions
#### cmake

The current CMake build configuration is sufficient to support the unit
tests in taichi-dev#6597

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
2 people authored and quadpixels committed May 13, 2023
1 parent 78a7973 commit 2a60142
Show file tree
Hide file tree
Showing 15 changed files with 921 additions and 3 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ jobs:
. .github/workflows/scripts/common-utils.sh
ci-docker-run-amdgpu --name taichi-build \
registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.3 \
registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.5 \
/home/dev/taichi/.github/workflows/scripts/build.py
env:
Expand All @@ -302,6 +302,7 @@ jobs:
-DTI_WITH_VULKAN:BOOL=OFF
-DTI_WITH_OPENGL:BOOL=OFF
-DTI_BUILD_TESTS:BOOL=ON
-DTI_WITH_AMDGPU:BOOL=ON
- name: Test
id: test
Expand All @@ -310,7 +311,7 @@ jobs:
. .github/workflows/scripts/common-utils.sh
ci-docker-run-amdgpu --name taichi-test \
registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.3 \
registry.taichigraphics.com/taichidev-ubuntu18.04.amdgpu:v0.0.5 \
/home/dev/taichi/.github/workflows/scripts/unix_test.sh
env:
PY: '3.8'
Expand Down
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,10 @@ if (TI_WITH_CUDA)
set(CUDA_ARCH "cuda")
endif()

# Define AMDGPU_ARCH only when the AMDGPU backend is enabled, in the same way
# CUDA_ARCH and DX12_ARCH are defined for their backends.
if (TI_WITH_AMDGPU)
set(AMDGPU_ARCH "amdgpu")
endif()

if (TI_WITH_DX12)
set(DX12_ARCH "dx12")
endif()
Expand Down
25 changes: 25 additions & 0 deletions cmake/TaichiCore.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ option(TI_WITH_LLVM "Build with LLVM backends" ON)
option(TI_WITH_METAL "Build with the Metal backend" ON)
option(TI_WITH_CUDA "Build with the CUDA backend" ON)
option(TI_WITH_CUDA_TOOLKIT "Build with the CUDA toolkit" OFF)
option(TI_WITH_AMDGPU "Build with the AMDGPU backend" OFF)
option(TI_WITH_OPENGL "Build with the OpenGL backend" ON)
option(TI_WITH_CC "Build with the C backend" ON)
option(TI_WITH_VULKAN "Build with the Vulkan backend" OFF)
Expand Down Expand Up @@ -34,6 +35,10 @@ if(ANDROID)
set(TI_WITH_DX12 OFF)
endif()

# Enabling both GPU backends at once is allowed; this only emits a warning and
# leaves both options untouched.
if (TI_WITH_AMDGPU AND TI_WITH_CUDA)
message(WARNING "Compiling CUDA and AMDGPU backends simultaneously")
endif()

if(UNIX AND NOT APPLE)
# Handy helper for Linux
# https://stackoverflow.com/a/32259072/12003165
Expand All @@ -53,13 +58,21 @@ if (APPLE)
set(TI_WITH_CC OFF)
message(WARNING "C backend not supported on OS X. Setting TI_WITH_CC to OFF.")
endif()
# Force the AMDGPU backend off on this platform and warn, so the user can see
# that the requested option was overridden.
if (TI_WITH_AMDGPU)
set(TI_WITH_AMDGPU OFF)
message(WARNING "AMDGPU backend not supported on OS X. Setting TI_WITH_AMDGPU to OFF.")
endif()
endif()

# Backends that are not supported on Windows are switched off here. Each
# override emits a warning so the user can see that the option was changed.
if (WIN32)
if (TI_WITH_CC)
set(TI_WITH_CC OFF)
message(WARNING "C backend not supported on Windows. Setting TI_WITH_CC to OFF.")
endif()
if (TI_WITH_AMDGPU)
set(TI_WITH_AMDGPU OFF)
message(WARNING "AMDGPU backend not supported on Windows. Setting TI_WITH_AMDGPU to OFF.")
endif()
endif()

if(TI_WITH_VULKAN)
Expand Down Expand Up @@ -108,6 +121,12 @@ if (TI_WITH_CUDA)
list(APPEND TAICHI_CORE_SOURCE ${TAICHI_CUDA_RUNTIME_SOURCE})
endif()

# Expose TI_WITH_AMDGPU to the C++ sources and add the AMDGPU runtime sources
# to the core library source list.
if (TI_WITH_AMDGPU)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_AMDGPU")
  # The runtime source glob is still disabled, so TAICHI_AMDGPU_RUNTIME_SOURCE
  # is empty for now and the append below adds nothing until it is enabled.
  # file(GLOB TAICHI_AMDGPU_RUNTIME_SOURCE "taichi/runtime/amdgpu/runtime.cpp")
  # Append to TAICHI_CORE_SOURCE (previously misspelled TAIHI_CORE_SOURCE,
  # which silently created an unrelated, unused variable).
  list(APPEND TAICHI_CORE_SOURCE ${TAICHI_AMDGPU_RUNTIME_SOURCE})
endif()

if (TI_WITH_DX12)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_DX12")
endif()
Expand Down Expand Up @@ -215,6 +234,12 @@ if(TI_WITH_LLVM)
target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE cuda_rhi)
endif()

# Build the AMDGPU RHI subdirectory and link the resulting amdgpu_rhi target
# into the core library.
if (TI_WITH_AMDGPU)
# Resolves the LLVM AMDGPU component into library names stored in
# llvm_amdgpu_libs. NOTE(review): no use of llvm_amdgpu_libs is visible in
# this hunk - confirm it is linked somewhere.
llvm_map_components_to_libnames(llvm_amdgpu_libs AMDGPU)
add_subdirectory(taichi/rhi/amdgpu)
target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE amdgpu_rhi)
endif()

if (TI_WITH_DX12)
llvm_map_components_to_libnames(llvm_directx_libs DirectX)

Expand Down
21 changes: 21 additions & 0 deletions taichi/rhi/amdgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# ./taichi/rhi/amdgpu/CMakeLists.txt

# Defines the amdgpu_rhi library target from the four AMDGPU RHI translation
# units in this directory.
set(AMDGPU_RHI amdgpu_rhi)
add_library(${AMDGPU_RHI})
target_sources(${AMDGPU_RHI}
PRIVATE
amdgpu_device.cpp
amdgpu_caching_allocator.cpp
amdgpu_context.cpp
amdgpu_driver.cpp
)

# Include paths are PRIVATE: they apply to this target's own sources only and
# are not propagated to targets that link against it.
target_include_directories(${AMDGPU_RHI}
PRIVATE
${PROJECT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/external/eigen
${PROJECT_SOURCE_DIR}/external/spdlog/include
${LLVM_INCLUDE_DIRS}
)

# The RHI links against the interop_rhi target.
target_link_libraries(${AMDGPU_RHI} PRIVATE interop_rhi)
40 changes: 40 additions & 0 deletions taichi/rhi/amdgpu/amdgpu_caching_allocator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#include "taichi/rhi/amdgpu/amdgpu_caching_allocator.h"

namespace taichi {
namespace lang {
namespace amdgpu {

AmdgpuCachingAllocator::AmdgpuCachingAllocator(LlvmDevice *device)
: device_(device) {
}

// Returns memory for `params`, preferring a cached block over a fresh device
// allocation.
//
// The requested size is rounded up to a multiple of taichi_page_size and
// served from the smallest cached block that is at least that large; whatever
// is left of that block goes back into the cache under its reduced size. When
// no cached block fits, the request is forwarded unchanged to
// device_->allocate_llvm_runtime_memory_jit().
uint64_t *AmdgpuCachingAllocator::allocate(
    const LlvmDevice::LlvmRuntimeAllocParams &params) {
  const auto bytes_needed = taichi::iroundup(params.size, taichi_page_size);
  const auto candidate = mem_blocks_.lower_bound(bytes_needed);

  // Cache miss: nothing pooled is big enough.
  if (candidate == mem_blocks_.end()) {
    return device_->allocate_llvm_runtime_memory_jit(params);
  }

  uint64_t *const block = candidate->second;
  const size_t leftover = candidate->first - bytes_needed;
  if (leftover > 0) {
    TI_ASSERT(leftover % taichi_page_size == 0);
    // Byte-wise pointer arithmetic: the tail starts right after the part that
    // is handed out. Inserting into a multimap keeps `candidate` valid.
    auto *tail = reinterpret_cast<uint8_t *>(block) + bytes_needed;
    mem_blocks_.insert({leftover, reinterpret_cast<uint64_t *>(tail)});
  }
  mem_blocks_.erase(candidate);
  return block;
}

// Puts the block at `ptr`, recorded as `sz` bytes, into the cache so that a
// later allocate() can hand it out again. No device call is made here.
void AmdgpuCachingAllocator::release(size_t sz, uint64_t *ptr) {
  mem_blocks_.emplace(sz, ptr);
}

} // namespace amdgpu
} // namespace lang
} // namespace taichi
28 changes: 28 additions & 0 deletions taichi/rhi/amdgpu/amdgpu_caching_allocator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#pragma once

#include "taichi/common/core.h"
#include "taichi/math/arithmetic.h"
#include "taichi/rhi/llvm/llvm_device.h"
#include "taichi/inc/constants.h"
#include <stdint.h>
#include <map>

namespace taichi {
namespace lang {
namespace amdgpu {

// Size-indexed cache of released memory blocks that sits in front of an
// LlvmDevice.
class AmdgpuCachingAllocator {
 public:
  AmdgpuCachingAllocator(LlvmDevice *device);

  // Returns memory satisfying `params`.
  uint64_t *allocate(const LlvmDevice::LlvmRuntimeAllocParams &params);

  // Records the block at `ptr` as `sz` bytes that are available for reuse.
  void release(size_t sz, uint64_t *ptr);

 private:
  // Cached blocks keyed by their size; a multimap because several blocks may
  // have the same size.
  std::multimap<size_t, uint64_t *> mem_blocks_;
  // Device used for allocation; set by the constructor.
  LlvmDevice *device_{nullptr};
};

} // namespace amdgpu
} // namespace lang
} // namespace taichi
93 changes: 93 additions & 0 deletions taichi/rhi/amdgpu/amdgpu_context.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#define TI_RUNTIME_HOST
#include "amdgpu_context.h"

#include <unordered_map>
#include <mutex>

#include "taichi/util/lang_util.h"
#include "taichi/program/program.h"
#include "taichi/system/threading.h"
#include "taichi/rhi/amdgpu/amdgpu_driver.h"
#include "taichi/analysis/offline_cache_util.h"

namespace taichi {
namespace lang {

// Sets up AMDGPU state on device 0: initializes the driver, queries the
// device count, creates a context on the device and derives the `gfx<arch>`
// target name from the device properties.
AMDGPUContext::AMDGPUContext()
    : driver_(AMDGPUDriver::get_instance_without_context()),
      // launch() reads debug_, so it must not start out indeterminate.
      debug_(false) {
  dev_count_ = 0;
  driver_.init(0);
  driver_.device_get_count(&dev_count_);
  driver_.device_get(&device_, 0);

  // Zero-filled and re-terminated so the buffer is a valid C string no matter
  // how much of it the driver wrote.
  constexpr int kMaxNameStringLength = 128;
  char name[kMaxNameStringLength] = {};
  driver_.device_get_name(name, kMaxNameStringLength, device_);
  name[kMaxNameStringLength - 1] = '\0';

  TI_TRACE("Using AMDGPU device [id=0]: {}", name);

  driver_.context_create(&context_, 0, device_);

  // 1024^3, written as a product so it is a compile-time constant.
  constexpr double GB = 1024.0 * 1024.0 * 1024.0;
  TI_TRACE("Total memory {:.2f} GB; free memory {:.2f} GB",
           get_total_memory() / GB, get_free_memory() / GB);

  // The driver fills an opaque property buffer of
  // HIP_DEVICE_PROPERTIES_STRUCT_SIZE bytes; the arch number is the int at
  // index HIP_DEVICE_GCN_ARCH (counted in ints, not bytes).
  void *hip_device_prop = std::malloc(HIP_DEVICE_PROPERTIES_STRUCT_SIZE);
  driver_.device_get_prop(hip_device_prop, device_);
  compute_capability_ =
      *(static_cast<int *>(hip_device_prop) + HIP_DEVICE_GCN_ARCH);
  std::free(hip_device_prop);

  mcpu_ = fmt::format("gfx{}", compute_capability_);

  TI_TRACE("Emitting AMDGPU code for {}", mcpu_);
}

// Returns the second value reported by driver_.mem_get_info(), i.e. the total
// memory figure; the first (free memory) is discarded.
std::size_t AMDGPUContext::get_total_memory() {
  std::size_t free_unused;
  std::size_t total;
  driver_.mem_get_info(&free_unused, &total);
  return total;
}

// Returns the first value reported by driver_.mem_get_info(), i.e. the free
// memory figure; the second (total memory) is discarded.
std::size_t AMDGPUContext::get_free_memory() {
  std::size_t free_now;
  std::size_t total_unused;
  driver_.mem_get_info(&free_now, &total_unused);
  return free_now;
}

// Returns the name the driver reports for device_, truncated to at most 127
// characters.
std::string AMDGPUContext::get_device_name() {
  constexpr uint32_t kMaxNameStringLength = 128;
  // Zero-filled up front and re-terminated after the call, so building the
  // std::string never reads indeterminate or unterminated bytes, whatever the
  // driver wrote into the buffer.
  char name[kMaxNameStringLength] = {};
  driver_.device_get_name(name, kMaxNameStringLength /*=128*/, device_);
  name[kMaxNameStringLength - 1] = '\0';
  return std::string(name);
}

// Launches kernel `func` on a 1-D grid of `grid_dim` blocks with `block_dim`
// threads each, passing the arguments as one packed buffer.
//
// Params:
//   func                      kernel handle handed to the driver.
//   task_name                 not used by this implementation.
//   arg_pointers              start of the packed argument buffer.
//   grid_dim                  number of blocks; nothing is launched when 0.
//   block_dim                 threads per block.
//   dynamic_shared_mem_bytes  dynamic shared memory size for the launch.
//   arg_bytes                 size of the packed argument buffer in bytes.
//
// The launch itself is serialized through lock_. When debug_ is set, the
// null stream is synchronized afterwards (even if nothing was launched).
void AMDGPUContext::launch(void *func,
                           const std::string &task_name,
                           void *arg_pointers,
                           unsigned grid_dim,
                           unsigned block_dim,
                           std::size_t dynamic_shared_mem_bytes,
                           int arg_bytes) {
  if (grid_dim > 0) {
    std::lock_guard<std::mutex> _(lock_);
    // HIP documents the buffer-size entry of `extra` as a pointer to size_t.
    // Pointing it at the 4-byte `int` parameter would let the runtime read
    // past it, so widen into a local of the right type first.
    std::size_t arg_buffer_size = static_cast<std::size_t>(arg_bytes);
    void *config[] = {
        reinterpret_cast<void *>(0x01),  // HIP_LAUNCH_PARAM_BUFFER_POINTER
        arg_pointers,
        reinterpret_cast<void *>(0x02),  // HIP_LAUNCH_PARAM_BUFFER_SIZE
        &arg_buffer_size,
        reinterpret_cast<void *>(0x03),  // HIP_LAUNCH_PARAM_END
    };
    driver_.launch_kernel(func, grid_dim, 1, 1, block_dim, 1, 1,
                          dynamic_shared_mem_bytes, nullptr, nullptr,
                          static_cast<void **>(config));
  }
  if (debug_) {
    driver_.stream_synchronize(nullptr);
  }
}

// Nothing is released explicitly here; members are destroyed as usual.
AMDGPUContext::~AMDGPUContext() = default;

// Returns the shared context, constructing it on first call. The instance is
// heap-allocated and never deleted.
AMDGPUContext &AMDGPUContext::get_instance() {
  static AMDGPUContext *const instance = new AMDGPUContext();
  return *instance;
}

} // namespace lang
} // namespace taichi
99 changes: 99 additions & 0 deletions taichi/rhi/amdgpu/amdgpu_context.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#pragma once

#include <mutex>
#include <unordered_map>
#include <thread>

#include "taichi/program/kernel_profiler.h"
#include "taichi/rhi/amdgpu/amdgpu_driver.h"

namespace taichi {
namespace lang {

class AMDGPUDriver;

// Wraps the AMDGPU driver state used by the backend: the device and context
// handles, the code generation target name, and kernel launching. Obtain the
// shared instance through get_instance().
class AMDGPUContext {
 private:
  // Handles filled in by the driver during construction. All scalar members
  // carry default initializers so none is ever read while indeterminate.
  void *device_{nullptr};
  void *context_{nullptr};
  // Device count reported by the driver; 0 means no device was detected.
  int dev_count_{0};
  // Arch number read from the device properties; mcpu_ is "gfx" + this value.
  int compute_capability_{0};
  std::string mcpu_;
  // Held while launching a kernel; also handed out by get_lock_guard().
  std::mutex lock_;
  AMDGPUDriver &driver_;
  // When true, launch() synchronizes the null stream after every call.
  bool debug_{false};

 public:
  AMDGPUContext();

  std::size_t get_total_memory();
  std::size_t get_free_memory();
  std::string get_device_name();

  // True when the driver reported at least one device.
  bool detected() const {
    return dev_count_ != 0;
  }

  void launch(void *func,
              const std::string &task_name,
              void *arg_pointers,
              unsigned grid_dim,
              unsigned block_dim,
              std::size_t dynamic_shared_mem_bytes,
              int arg_bytes);

  void set_debug(bool debug) {
    debug_ = debug;
  }

  std::string get_mcpu() const {
    return mcpu_;
  }

  // Returns the driver context handle created by the constructor.
  void *get_context() {
    return context_;
  }

  // Makes this object's driver context the current one.
  void make_current() {
    driver_.context_set_current(context_);
  }

  int get_compute_capability() const {
    return compute_capability_;
  }

  ~AMDGPUContext();

  // Scope guard: makes the given context current for the guard's lifetime
  // and restores the previously current one on destruction.
  class ContextGuard {
   private:
    void *old_ctx_;
    void *new_ctx_;

   public:
    // Both members hold *driver* context handles. Storing the AMDGPUContext
    // wrapper pointer in new_ctx_ (as before) made the comparisons below
    // compare unrelated pointers, so the "already current" case was never
    // detected and the context was switched and restored on every guard.
    ContextGuard(AMDGPUContext *new_ctx)
        : old_ctx_(nullptr), new_ctx_(new_ctx->get_context()) {
      AMDGPUDriver::get_instance().context_get_current(&old_ctx_);
      if (old_ctx_ != new_ctx_)
        new_ctx->make_current();
    }

    ~ContextGuard() {
      if (old_ctx_ != new_ctx_) {
        AMDGPUDriver::get_instance().context_set_current(old_ctx_);
      }
    }
  };

  ContextGuard get_guard() {
    return ContextGuard(this);
  }

  // Returns a lock that owns lock_ until it is destroyed or released.
  std::unique_lock<std::mutex> get_lock_guard() {
    return std::unique_lock<std::mutex>(lock_);
  }

  static AMDGPUContext &get_instance();
};

} // namespace lang
} // namespace taichi
Loading

0 comments on commit 2a60142

Please sign in to comment.