Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[amdgpu] Update amdgpu module call #7022

Merged
merged 14 commits into from
Jan 6, 2023
2 changes: 1 addition & 1 deletion taichi/codegen/cuda/codegen_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -699,7 +699,7 @@ FunctionType CUDAModuleToFunctionConverter::convert(
TI_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim,
task.block_dim);
cuda_module->launch(task.name, task.grid_dim, task.block_dim, 0,
{&context});
{&context}, {});
}

// copy data back to host
Expand Down
2 changes: 1 addition & 1 deletion taichi/inc/archs.inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@ PER_ARCH(opengl) // OpenGL Compute Shaders
PER_ARCH(dx11) // Microsoft DirectX 11, WIP
PER_ARCH(dx12) // Microsoft DirectX 12, WIP
PER_ARCH(opencl) // OpenCL, N/A
PER_ARCH(amdgpu) // AMD GPU, N/A
PER_ARCH(amdgpu) // AMD GPU, WIP
PER_ARCH(vulkan) // Vulkan
PER_ARCH(gles) // OpenGL ES
34 changes: 20 additions & 14 deletions taichi/jit/jit_module.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <memory>
#include <functional>
#include <tuple>

#include "taichi/inc/constants.h"
#include "taichi/util/lang_util.h"
Expand Down Expand Up @@ -33,31 +34,36 @@ class JITModule {
return ret;
}

static std::vector<void *> get_arg_pointers() {
return std::vector<void *>();
inline std::tuple<std::vector<void *>, std::vector<int> > get_arg_pointers() {
return std::make_tuple(std::vector<void *>(), std::vector<int>());
}

template <typename... Args, typename T>
static std::vector<void *> get_arg_pointers(T &t, Args &...args) {
auto ret = get_arg_pointers(args...);
ret.insert(ret.begin(), &t);
return ret;
inline std::tuple<std::vector<void *>, std::vector<int> > get_arg_pointers(
T &t,
Args &...args) {
auto [arg_pointers, arg_sizes] = get_arg_pointers(args...);
arg_pointers.insert(arg_pointers.begin(), &t);
arg_sizes.insert(arg_sizes.begin(), sizeof(t));
return std::make_tuple(arg_pointers, arg_sizes);
}

// Note: **call** is for serial functions
// Note: args must pass by value
// Note: AMDGPU currently needs to pass args via extra_arg
template <typename... Args>
void call(const std::string &name, Args... args) {
if (direct_dispatch()) {
get_function<Args...>(name)(args...);
} else {
auto arg_pointers = JITModule::get_arg_pointers(args...);
call(name, arg_pointers);
auto [arg_pointers, arg_sizes] = JITModule::get_arg_pointers(args...);
call(name, arg_pointers, arg_sizes);
}
}

virtual void call(const std::string &name,
const std::vector<void *> &arg_pointers) {
const std::vector<void *> &arg_pointers,
const std::vector<int> &arg_sizes) {
TI_NOT_IMPLEMENTED
}

Expand All @@ -69,20 +75,20 @@ class JITModule {
std::size_t block_dim,
std::size_t shared_mem_bytes,
Args... args) {
auto arg_pointers = JITModule::get_arg_pointers(args...);
launch(name, grid_dim, block_dim, shared_mem_bytes, arg_pointers);
auto [arg_pointers, arg_sizes] = JITModule::get_arg_pointers(args...);
launch(name, grid_dim, block_dim, shared_mem_bytes, arg_pointers,
arg_sizes);
}

virtual void launch(const std::string &name,
std::size_t grid_dim,
std::size_t block_dim,
std::size_t shared_mem_bytes,
const std::vector<void *> &arg_pointers) {
const std::vector<void *> &arg_pointers,
const std::vector<int> &arg_sizes) {
TI_NOT_IMPLEMENTED
}

// directly call the function (e.g. on CPU), or via another runtime system
// (e.g. cudaLaunch)?
virtual bool direct_dispatch() const = 0;

virtual ~JITModule() {
Expand Down
65 changes: 60 additions & 5 deletions taichi/rhi/amdgpu/amdgpu_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,17 +61,72 @@ std::string AMDGPUContext::get_device_name() {
return str;
}

// Computes the total buffer size, in bytes, needed to pack the given
// kernel arguments with the AMDGPU packing rule used by pack_args():
// arguments live in 32-byte (vec4-sized) slots — an argument smaller
// than 32 bytes may share a slot but must not straddle a slot
// boundary, and an argument of 32 bytes or more must start on a
// 32-byte boundary.
//
// arg_sizes: size in bytes of each kernel argument, in call order;
//            sizes come from sizeof and are assumed positive.
// Returns the packed buffer size in bytes (0 for an empty list).
int AMDGPUContext::get_args_byte(std::vector<int> arg_sizes) {
  int byte_cnt = 0;
  for (const int size : arg_sizes) {
    const int offset = byte_cnt % 32;
    // Pad to the next 32-byte boundary when a small argument would
    // straddle it, or when a large argument is not already aligned.
    const bool needs_padding =
        (size < 32) ? (offset + size > 32) : (offset != 0);
    if (needs_padding)
      byte_cnt += 32 - offset;
    byte_cnt += size;
  }
  return byte_cnt;
}

// Copies the argument values into the contiguous buffer arg_packed,
// laying them out with the same 32-byte (vec4-sized) slot rule as
// get_args_byte(): an argument smaller than 32 bytes may share a slot
// but never straddles a slot boundary; an argument of 32 bytes or more
// starts on a 32-byte boundary.
//
// arg_pointers: pointer to each argument's value in host memory.
// arg_sizes:    size in bytes of each argument; parallel to
//               arg_pointers, sizes assumed positive (from sizeof).
// arg_packed:   destination buffer; the caller must have allocated at
//               least get_args_byte(arg_sizes) bytes.
void AMDGPUContext::pack_args(std::vector<void *> arg_pointers,
                              std::vector<int> arg_sizes,
                              char *arg_packed) {
  int byte_cnt = 0;
  for (std::size_t i = 0; i < arg_pointers.size(); i++) {
    const int size = arg_sizes[i];
    const int offset = byte_cnt % 32;
    // Same padding rule as get_args_byte(); the two must stay in sync
    // so the computed buffer size matches the layout written here.
    const bool needs_padding =
        (size < 32) ? (offset + size > 32) : (offset != 0);
    if (needs_padding)
      byte_cnt += 32 - offset;
    std::memcpy(arg_packed + byte_cnt, arg_pointers[i], size);
    byte_cnt += size;
  }
}

void AMDGPUContext::launch(void *func,
const std::string &task_name,
void *arg_pointers,
const std::vector<void *> &arg_pointers,
const std::vector<int> &arg_sizes,
unsigned grid_dim,
unsigned block_dim,
std::size_t dynamic_shared_mem_bytes,
int arg_bytes) {
std::size_t dynamic_shared_mem_bytes) {
auto pack_size = get_args_byte(arg_sizes);
char *packed_arg = (char *)std::malloc(pack_size);
pack_args(arg_pointers, arg_sizes, packed_arg);
if (grid_dim > 0) {
std::lock_guard<std::mutex> _(lock_);
void *config[] = {(void *)0x01, const_cast<void *>(arg_pointers),
(void *)0x02, &arg_bytes, (void *)0x03};
void *config[] = {(void *)0x01, (void *)packed_arg, (void *)0x02,
(void *)&pack_size, (void *)0x03};
driver_.launch_kernel(func, grid_dim, 1, 1, block_dim, 1, 1,
dynamic_shared_mem_bytes, nullptr, nullptr,
reinterpret_cast<void **>(&config));
Expand Down
12 changes: 9 additions & 3 deletions taichi/rhi/amdgpu/amdgpu_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,19 @@ class AMDGPUContext {
return dev_count_ != 0;
}

void pack_args(std::vector<void *> arg_pointers,
std::vector<int> arg_sizes,
char *arg_packed);

int get_args_byte(std::vector<int> arg_sizes);

void launch(void *func,
const std::string &task_name,
void *arg_pointers,
const std::vector<void *> &arg_pointers,
const std::vector<int> &arg_sizes,
unsigned grid_dim,
unsigned block_dim,
std::size_t dynamic_shared_mem_bytes,
int arg_bytes);
std::size_t dynamic_shared_mem_bytes);

void set_debug(bool debug) {
debug_ = debug;
Expand Down
1 change: 1 addition & 0 deletions taichi/rhi/cuda/cuda_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ std::string CUDAContext::get_device_name() {
void CUDAContext::launch(void *func,
const std::string &task_name,
std::vector<void *> arg_pointers,
std::vector<int> arg_sizes,
unsigned grid_dim,
unsigned block_dim,
std::size_t dynamic_shared_mem_bytes) {
Expand Down
1 change: 1 addition & 0 deletions taichi/rhi/cuda/cuda_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ class CUDAContext {
void launch(void *func,
const std::string &task_name,
std::vector<void *> arg_pointers,
std::vector<int> arg_sizes,
unsigned grid_dim,
unsigned block_dim,
std::size_t dynamic_shared_mem_bytes);
Expand Down
13 changes: 8 additions & 5 deletions taichi/runtime/cuda/jit_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,18 +60,21 @@ class JITModuleCUDA : public JITModule {
}

void call(const std::string &name,
const std::vector<void *> &arg_pointers) override {
launch(name, 1, 1, 0, arg_pointers);
const std::vector<void *> &arg_pointers,
const std::vector<int> &arg_sizes) override {
launch(name, 1, 1, 0, arg_pointers, arg_sizes);
}

void launch(const std::string &name,
std::size_t grid_dim,
std::size_t block_dim,
std::size_t dynamic_shared_mem_bytes,
const std::vector<void *> &arg_pointers) override {
const std::vector<void *> &arg_pointers,
const std::vector<int> &arg_sizes) override {
auto func = lookup_function(name);
CUDAContext::get_instance().launch(func, name, arg_pointers, grid_dim,
block_dim, dynamic_shared_mem_bytes);
CUDAContext::get_instance().launch(func, name, arg_pointers, arg_sizes,
grid_dim, block_dim,
dynamic_shared_mem_bytes);
}

bool direct_dispatch() const override {
Expand Down