Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[amdgpu] Part3 update runtime module #6486

Merged
merged 12 commits into from
Dec 30, 2022
296 changes: 202 additions & 94 deletions taichi/runtime/llvm/llvm_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#ifdef TI_WITH_AMDGPU
#include "llvm/IR/IntrinsicsAMDGPU.h"
#endif // TI_WITH_AMDGPU
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
Expand Down Expand Up @@ -334,22 +337,7 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
auto ctx = get_this_thread_context();
std::unique_ptr<llvm::Module> module = module_from_bitcode_file(
fmt::format("{}/{}", runtime_lib_dir(), file), ctx);
if (arch_ == Arch::cuda) {
module->setTargetTriple("nvptx64-nvidia-cuda");

#if defined(TI_WITH_CUDA)
auto func = module->getFunction("cuda_compute_capability");
if (func) {
func->deleteBody();
auto bb = llvm::BasicBlock::Create(*ctx, "entry", func);
IRBuilder<> builder(*ctx);
builder.SetInsertPoint(bb);
builder.CreateRet(
get_constant(CUDAContext::get_instance().get_compute_capability()));
TaichiLLVMContext::mark_inline(func);
}
#endif

if (arch_ == Arch::cuda || arch_ == Arch::amdgpu) {
auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin,
bool ret = true,
std::vector<llvm::Type *> types = {},
Expand Down Expand Up @@ -399,93 +387,143 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
TaichiLLVMContext::mark_inline(func);
};

patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x);
patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64);
patch_intrinsic("block_idx", Intrinsic::nvvm_read_ptx_sreg_ctaid_x);
patch_intrinsic("block_dim", Intrinsic::nvvm_read_ptx_sreg_ntid_x);
patch_intrinsic("grid_dim", Intrinsic::nvvm_read_ptx_sreg_nctaid_x);
patch_intrinsic("block_barrier", Intrinsic::nvvm_barrier0, false);
patch_intrinsic("warp_barrier", Intrinsic::nvvm_bar_warp_sync, false);
patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
patch_intrinsic("grid_memfence", Intrinsic::nvvm_membar_gl, false);
patch_intrinsic("system_memfence", Intrinsic::nvvm_membar_sys, false);

patch_intrinsic("cuda_all", Intrinsic::nvvm_vote_all);
patch_intrinsic("cuda_all_sync", Intrinsic::nvvm_vote_all_sync);

patch_intrinsic("cuda_any", Intrinsic::nvvm_vote_any);
patch_intrinsic("cuda_any_sync", Intrinsic::nvvm_vote_any_sync);

patch_intrinsic("cuda_uni", Intrinsic::nvvm_vote_uni);
patch_intrinsic("cuda_uni_sync", Intrinsic::nvvm_vote_uni_sync);

patch_intrinsic("cuda_ballot", Intrinsic::nvvm_vote_ballot);
patch_intrinsic("cuda_ballot_sync", Intrinsic::nvvm_vote_ballot_sync);

patch_intrinsic("cuda_shfl_down_sync_i32",
Intrinsic::nvvm_shfl_sync_down_i32);
patch_intrinsic("cuda_shfl_down_sync_f32",
Intrinsic::nvvm_shfl_sync_down_f32);

patch_intrinsic("cuda_shfl_up_sync_i32", Intrinsic::nvvm_shfl_sync_up_i32);
patch_intrinsic("cuda_shfl_up_sync_f32", Intrinsic::nvvm_shfl_sync_up_f32);

patch_intrinsic("cuda_shfl_sync_i32", Intrinsic::nvvm_shfl_sync_idx_i32);

patch_intrinsic("cuda_shfl_sync_f32", Intrinsic::nvvm_shfl_sync_idx_f32);

patch_intrinsic("cuda_shfl_xor_sync_i32",
Intrinsic::nvvm_shfl_sync_bfly_i32);

patch_intrinsic("cuda_match_any_sync_i32",
Intrinsic::nvvm_match_any_sync_i32);

// LLVM 10.0.0 seems to have a bug on this intrinsic function
/*
nvvm_match_all_sync_i32
Args:
1. u32 mask
2. i32 value
3. i32 *pred
*/
/*
patch_intrinsic("cuda_match_all_sync_i32p",
Intrinsic::nvvm_math_all_sync_i32);
*/

// LLVM 10.0.0 seems to have a bug on this intrinsic function
/*
patch_intrinsic("cuda_match_any_sync_i64",
Intrinsic::nvvm_match_any_sync_i64);
*/

patch_intrinsic("ctlz_i32", Intrinsic::ctlz, true,
{llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
patch_intrinsic("cttz_i32", Intrinsic::cttz, true,
{llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});

patch_atomic_add("atomic_add_i32", llvm::AtomicRMWInst::Add);

patch_atomic_add("atomic_add_i64", llvm::AtomicRMWInst::Add);

patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd);

patch_atomic_add("atomic_add_f64", llvm::AtomicRMWInst::FAdd);
patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd);

patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
if (arch_ == Arch::cuda) {
module->setTargetTriple("nvptx64-nvidia-cuda");

link_module_with_cuda_libdevice(module);
#if defined(TI_WITH_CUDA)
auto func = module->getFunction("cuda_compute_capability");
if (func) {
func->deleteBody();
auto bb = llvm::BasicBlock::Create(*ctx, "entry", func);
IRBuilder<> builder(*ctx);
builder.SetInsertPoint(bb);
builder.CreateRet(
get_constant(CUDAContext::get_instance().get_compute_capability()));
TaichiLLVMContext::mark_inline(func);
}
#endif

// To prevent potential symbol name conflicts, we use "cuda_vprintf"
// instead of "vprintf" in llvm/runtime.cpp. Now we change it back for
// linking
for (auto &f : *module) {
if (f.getName() == "cuda_vprintf") {
f.setName("vprintf");
patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x);
patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64);
patch_intrinsic("block_idx", Intrinsic::nvvm_read_ptx_sreg_ctaid_x);
patch_intrinsic("block_dim", Intrinsic::nvvm_read_ptx_sreg_ntid_x);
patch_intrinsic("grid_dim", Intrinsic::nvvm_read_ptx_sreg_nctaid_x);
patch_intrinsic("block_barrier", Intrinsic::nvvm_barrier0, false);
patch_intrinsic("warp_barrier", Intrinsic::nvvm_bar_warp_sync, false);
patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
patch_intrinsic("grid_memfence", Intrinsic::nvvm_membar_gl, false);
patch_intrinsic("system_memfence", Intrinsic::nvvm_membar_sys, false);

patch_intrinsic("cuda_all", Intrinsic::nvvm_vote_all);
patch_intrinsic("cuda_all_sync", Intrinsic::nvvm_vote_all_sync);

patch_intrinsic("cuda_any", Intrinsic::nvvm_vote_any);
patch_intrinsic("cuda_any_sync", Intrinsic::nvvm_vote_any_sync);

patch_intrinsic("cuda_uni", Intrinsic::nvvm_vote_uni);
patch_intrinsic("cuda_uni_sync", Intrinsic::nvvm_vote_uni_sync);

patch_intrinsic("cuda_ballot", Intrinsic::nvvm_vote_ballot);
patch_intrinsic("cuda_ballot_sync", Intrinsic::nvvm_vote_ballot_sync);

patch_intrinsic("cuda_shfl_down_sync_i32",
Intrinsic::nvvm_shfl_sync_down_i32);
patch_intrinsic("cuda_shfl_down_sync_f32",
Intrinsic::nvvm_shfl_sync_down_f32);

patch_intrinsic("cuda_shfl_up_sync_i32",
Intrinsic::nvvm_shfl_sync_up_i32);
patch_intrinsic("cuda_shfl_up_sync_f32",
Intrinsic::nvvm_shfl_sync_up_f32);

patch_intrinsic("cuda_shfl_sync_i32", Intrinsic::nvvm_shfl_sync_idx_i32);

patch_intrinsic("cuda_shfl_sync_f32", Intrinsic::nvvm_shfl_sync_idx_f32);

patch_intrinsic("cuda_shfl_xor_sync_i32",
Intrinsic::nvvm_shfl_sync_bfly_i32);

patch_intrinsic("cuda_match_any_sync_i32",
Intrinsic::nvvm_match_any_sync_i32);

// LLVM 10.0.0 seems to have a bug on this intrinsic function
/*
nvvm_match_all_sync_i32
Args:
1. u32 mask
2. i32 value
3. i32 *pred
*/
/*
patch_intrinsic("cuda_match_all_sync_i32p",
Intrinsic::nvvm_math_all_sync_i32);
*/

// LLVM 10.0.0 seems to have a bug on this intrinsic function
/*
patch_intrinsic("cuda_match_any_sync_i64",
Intrinsic::nvvm_match_any_sync_i64);
*/

patch_intrinsic("ctlz_i32", Intrinsic::ctlz, true,
{llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
patch_intrinsic("cttz_i32", Intrinsic::cttz, true,
{llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});

patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);

link_module_with_cuda_libdevice(module);

// To prevent potential symbol name conflicts, we use "cuda_vprintf"
// instead of "vprintf" in llvm/runtime.cpp. Now we change it back for
// linking
for (auto &f : *module) {
if (f.getName() == "cuda_vprintf") {
f.setName("vprintf");
}
}

// runtime_module->print(llvm::errs(), nullptr);
}

// runtime_module->print(llvm::errs(), nullptr);
if (arch_ == Arch::amdgpu) {
module->setTargetTriple("amdgcn-amd-amdhsa");
#ifdef TI_WITH_AMDGPU
for (auto &f : *module) {
f.addFnAttr("target-cpu", "");
f.addFnAttr("target-features", "");
for (auto &bb : f) {
std::vector<llvm::AllocaInst *> alloca_inst_vec;
for (llvm::Instruction &inst : bb) {
galeselee marked this conversation as resolved.
Show resolved Hide resolved
llvm::AllocaInst *now_alloca = llvm::dyn_cast<AllocaInst>(&inst);
if (!now_alloca ||
now_alloca->getType()->getAddressSpace() != (unsigned)0) {
continue;
}
alloca_inst_vec.push_back(now_alloca);
}
for (auto &allocainst : alloca_inst_vec) {
auto alloca_type = allocainst->getAllocatedType();
llvm::IRBuilder<> builder(allocainst);
auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5);
auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0);
new_alloca->setAlignment(llvm::Align(allocainst->getAlignment()));
auto *addrspacecast =
builder.CreateAddrSpaceCast(new_alloca, new_type);
allocainst->replaceAllUsesWith(addrspacecast);
allocainst->eraseFromParent();
}
}
}
patch_intrinsic("thread_idx", llvm::Intrinsic::amdgcn_workitem_id_x);
patch_intrinsic("block_idx", llvm::Intrinsic::amdgcn_workgroup_id_x);
#endif
}
}

return module;
Expand Down Expand Up @@ -711,6 +749,11 @@ void TaichiLLVMContext::mark_function_as_cuda_kernel(llvm::Function *func,
}
}

void TaichiLLVMContext::mark_function_as_amdgpu_kernel(llvm::Function *func) {
  // Allow any dispatch size in [1, 1024] work-items per work-group.
  // NOTE(review): value format "min, max" — presumably accepted by the AMDGPU
  // backend's attribute parser; confirm it tolerates the space after the comma.
  func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
  // Entry-point calling convention so the AMDGPU backend emits this function
  // as a kernel rather than a device function.
  func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
}

void TaichiLLVMContext::eliminate_unused_functions(
llvm::Module *module,
std::function<bool(const std::string &)> export_indicator) {
Expand Down Expand Up @@ -817,6 +860,71 @@ void TaichiLLVMContext::update_runtime_jit_module(
}
}

if (arch_ == Arch::amdgpu) {
for (auto &f : *module) {
bool is_kernel = false;
const std::string func_name = f.getName().str();
if (starts_with(func_name, "runtime_")) {
mark_function_as_amdgpu_kernel(&f);
is_kernel = true;
}
if (!is_kernel && !f.isDeclaration())
f.setLinkage(llvm::Function::PrivateLinkage);
}
std::vector<llvm::Function *> global_func;
galeselee marked this conversation as resolved.
Show resolved Hide resolved
for (auto &f : *module) {
if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL)
global_func.push_back(&f);
}
for (auto &f : global_func) {
llvm::FunctionType *func_type = f->getFunctionType();
galeselee marked this conversation as resolved.
Show resolved Hide resolved
std::vector<llvm::Type *> new_func_params;
for (auto &arg : f->args()) {
if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) {
auto new_type = llvm::PointerType::get(
arg.getType()->getPointerElementType(), unsigned(1));
new_func_params.push_back(new_type);
} else {
new_func_params.push_back(arg.getType());
}
}
auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(),
new_func_params, false);
auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(),
f->getAddressSpace());
// NF->copyAttributesFrom(f);
new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
galeselee marked this conversation as resolved.
Show resolved Hide resolved
new_func->setComdat(f->getComdat());
f->getParent()->getFunctionList().insert(f->getIterator(), new_func);
new_func->takeName(f);
new_func->getBasicBlockList().splice(new_func->begin(),
f->getBasicBlockList());
for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(),
I2 = new_func->arg_begin();
I != E; ++I, ++I2) {
if (I->getType()->getTypeID() == llvm::Type::PointerTyID) {
auto &front_bb = new_func->getBasicBlockList().front();
llvm::Instruction *addrspacecast =
new AddrSpaceCastInst(I2, I->getType());
front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(),
addrspacecast);
I->replaceAllUsesWith(addrspacecast);
I2->takeName(&*I);
} else {
I->replaceAllUsesWith(&*I2);
I2->takeName(&*I);
}
}

SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
galeselee marked this conversation as resolved.
Show resolved Hide resolved
f->getAllMetadata(MDs);
for (auto [KindID, Node] : MDs)
new_func->addMetadata(KindID, *Node);
f->eraseFromParent();
}
}

eliminate_unused_functions(module.get(), [](std::string func_name) {
return starts_with(func_name, "runtime_") ||
starts_with(func_name, "LLVMRuntime_");
Expand Down
2 changes: 2 additions & 0 deletions taichi/runtime/llvm/llvm_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@ class TaichiLLVMContext {

void mark_function_as_cuda_kernel(llvm::Function *func, int block_dim = 0);

void mark_function_as_amdgpu_kernel(llvm::Function *func);

void fetch_this_thread_struct_module();
llvm::Module *get_this_thread_runtime_module();
llvm::Function *get_runtime_function(const std::string &name);
Expand Down
17 changes: 17 additions & 0 deletions taichi/runtime/llvm/llvm_runtime_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,15 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config,
}
#endif

#if defined(TI_WITH_AMDGPU)
if (config.arch == Arch::amdgpu) {
AMDGPUContext::get_instance().set_debug(config.debug);
jim19930609 marked this conversation as resolved.
Show resolved Hide resolved
device_ = std::make_shared<amdgpu::AMDGPUDevice>();

this->maybe_initialize_amdgpu_llvm_context();
}
#endif

#ifdef TI_WITH_DX12
if (config.arch == Arch::dx12) {
// FIXME: add dx12 device.
Expand Down Expand Up @@ -149,6 +158,14 @@ void LlvmRuntimeExecutor::maybe_initialize_cuda_llvm_context() {
}
}

void LlvmRuntimeExecutor::maybe_initialize_amdgpu_llvm_context() {
  // Lazily create the device-side TaichiLLVMContext for AMDGPU.
  // No-op when the configured arch is not amdgpu or the context already exists.
  if (config_->arch != Arch::amdgpu || llvm_context_device_ != nullptr) {
    return;
  }
  llvm_context_device_ =
      std::make_unique<TaichiLLVMContext>(config_, Arch::amdgpu);
  llvm_context_device_->init_runtime_jit_module();
}

void LlvmRuntimeExecutor::print_list_manager_info(void *list_manager,
uint64 *result_buffer) {
auto list_manager_len = runtime_query<int32>("ListManager_get_num_elements",
Expand Down
2 changes: 2 additions & 0 deletions taichi/runtime/llvm/llvm_runtime_executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,8 @@ class LlvmRuntimeExecutor {
*/
void maybe_initialize_cuda_llvm_context();

void maybe_initialize_amdgpu_llvm_context();

void finalize();

uint64 fetch_result_uint64(int i, uint64 *result_buffer);
Expand Down
4 changes: 4 additions & 0 deletions taichi/runtime/program_impls/llvm/llvm_program.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,10 @@ class LlvmProgramImpl : public ProgramImpl {
runtime_exec_->maybe_initialize_cuda_llvm_context();
}

// Forwards to LlvmRuntimeExecutor::maybe_initialize_amdgpu_llvm_context():
// lazily sets up the AMDGPU LLVM context (a no-op unless the configured
// arch is amdgpu and the context has not been created yet).
void maybe_initialize_amdgpu_llvm_context() {
runtime_exec_->maybe_initialize_amdgpu_llvm_context();
}

// Forwards to the runtime executor: reads slot `i` of `result_buffer` as a
// raw uint64 (device-to-host fetch is handled inside the executor).
uint64 fetch_result_uint64(int i, uint64 *result_buffer) override {
return runtime_exec_->fetch_result_uint64(i, result_buffer);
}
Expand Down