forked from taichi-dev/taichi
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[amdgpu] Part3 update runtime module (taichi-dev#6486)
Issue: taichi-dev#6434 ### Brief Summary 1. This is a special part of the Tacihi runtime module for the `AMDGPU` backend. Tacihi's runtime module uses `clang++` to generate `LLVM IR` is different in memory allocation differs from the cpu-generated `LLVM IR`. The following is an example. ``` C/C++ code void func(int *a, int *b) { *a = *b; } x86_64 backend LLVM IR define dso_local void @cpu_func(i32* %0, i32* %1) taichi-dev#2 { %3 = alloca i32*, align 8 %4 = alloca i32*, align 8 store i32* %0, i32** %3, align 8 store i32* %1, i32** %4, align 8 %5 = load i32*, i32** %4, align 8 %6 = load i32, i32* %5, align 4 %7 = load i32*, i32** %3, align 8 store i32 %6, i32* %7, align 4 ret void } __global__ function on AMDGPU define protected amdgpu_kernel void @global_func(i32 addrspace(1)* %0, i32 addrspace(1)* %1) taichi-dev#4 { %3 = alloca i32*, align 8, addrspace(5) %4 = alloca i32*, align 8, addrspace(5) %5 = alloca i32*, align 8, addrspace(5) %6 = alloca i32*, align 8, addrspace(5) %7 = addrspacecast i32* addrspace(5)* %3 to i32** %8 = addrspacecast i32* addrspace(5)* %4 to i32** %9 = addrspacecast i32* addrspace(5)* %5 to i32** %10 = addrspacecast i32* addrspace(5)* %6 to i32** %11 = addrspacecast i32 addrspace(1)* %0 to i32* store i32* %11, i32** %7, align 8 %12 = load i32*, i32** %7, align 8 %13 = addrspacecast i32 addrspace(1)* %1 to i32* store i32* %13, i32** %8, align 8 %14 = load i32*, i32** %8, align 8 store i32* %12, i32** %9, align 8 store i32* %14, i32** %10, align 8 %15 = load i32*, i32** %10, align 8 %16 = load i32, i32* %15, align 4 %17 = load i32*, i32** %9, align 8 store i32 %16, i32* %17, align 4 ret void } __device__ function on AMDGPU define hidden void @device_func(i32* %0, i32* %1) taichi-dev#2 { %3 = alloca i32*, align 8, addrspace(5) %4 = alloca i32*, align 8, addrspace(5) %5 = addrspacecast i32* addrspace(5)* %3 to i32** %6 = addrspacecast i32* addrspace(5)* %4 to i32** store i32* %0, i32** %5, align 8 store i32* %1, i32** %6, align 8 %7 = load i32*, i32** %6, align 8 %8 = load i32, i32* %7, align 4 %9 = load i32*, i32** %5, align 8 store i32 %8, i32* %9, align 4 ret void } ``` 2. There are some differences in the place about `allocainst`, specifically about addrspace (for `AMDGPU`, [this](https://llvm.org/docs/AMDGPUUsage.html#address-spaces) will be helpful). I have not found documentation describing how to write the correct `LLVM IR` on `AMDGPU`, through my observation of the `LLVM IR` generated by `clang++/hipcc`. We need to deal with the arguments of the `__global__` function and the `allocainst` (including specifying the addrspace of `allocainst` and performing addrspace-cast) while for the `__device__` function we do not need to deal with the arguments of the function. Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
- Loading branch information
Showing
5 changed files
with
282 additions
and
94 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
#pragma once | ||
|
||
#include "llvm/IR/LLVMContext.h" | ||
#include "llvm/IR/LegacyPassManager.h" | ||
#include "llvm/IR/Function.h" | ||
#include "llvm/Pass.h" | ||
#include "llvm/IR/Module.h" | ||
#include "llvm/Transforms/IPO.h" | ||
#include "llvm/Transforms/IPO/PassManagerBuilder.h" | ||
#include "llvm/IR/Instructions.h" | ||
#include "llvm/IR/IRBuilder.h" | ||
|
||
#if defined(TI_WITH_AMDGPU) | ||
#include "taichi/rhi/amdgpu/amdgpu_context.h" | ||
#endif | ||
|
||
namespace taichi { | ||
namespace lang { | ||
using namespace llvm; | ||
#if defined(TI_WITH_AMDGPU) | ||
struct AMDGPUConvertAllocaInstAddressSpacePass : public FunctionPass { | ||
static char ID; | ||
AMDGPUConvertAllocaInstAddressSpacePass() : FunctionPass(ID) { | ||
} | ||
bool runOnFunction(llvm::Function &f) override { | ||
f.addFnAttr("target-cpu", | ||
"gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3, 4)); | ||
f.addFnAttr("target-features", ""); | ||
for (auto &bb : f) { | ||
std::vector<AllocaInst *> alloca_inst_vec; | ||
for (Instruction &inst : bb) { | ||
AllocaInst *now_alloca = dyn_cast<AllocaInst>(&inst); | ||
if (!now_alloca || | ||
now_alloca->getType()->getAddressSpace() != (unsigned)0) { | ||
continue; | ||
} | ||
alloca_inst_vec.push_back(now_alloca); | ||
} | ||
for (auto &allocainst : alloca_inst_vec) { | ||
auto alloca_type = allocainst->getAllocatedType(); | ||
IRBuilder<> builder(allocainst); | ||
auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5); | ||
auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0); | ||
new_alloca->setAlignment(Align(allocainst->getAlign().value())); | ||
auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type); | ||
allocainst->replaceAllUsesWith(addrspacecast); | ||
allocainst->eraseFromParent(); | ||
} | ||
} | ||
return false; | ||
} | ||
}; | ||
|
||
struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass { | ||
static char ID; | ||
AMDGPUConvertFuncParamAddressSpacePass() : ModulePass(ID) { | ||
} | ||
bool runOnModule(llvm::Module &M) override { | ||
for (auto &f : M) { | ||
bool is_kernel = false; | ||
const std::string func_name = f.getName().str(); | ||
if (starts_with(func_name, "runtime_")) { | ||
f.setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); | ||
// ref https://llvm.org/docs/AMDGPUUsage.html | ||
// “amdgpu-flat-work-group-size”=”min,max” | ||
// Specify the minimum and maximum flat work group sizes that will be | ||
// specified when the kernel is dispatched. Generated by the | ||
// amdgpu_flat_work_group_size CLANG attribute [CLANG-ATTR]. The implied | ||
// default value is 1,1024. | ||
f.addFnAttr("amdgpu-flat-work-group-size", "1, 1024"); | ||
is_kernel = true; | ||
} | ||
if (!is_kernel && !f.isDeclaration()) | ||
f.setLinkage(llvm::Function::PrivateLinkage); | ||
} | ||
std::vector<llvm::Function *> kernel_function; | ||
for (auto &f : M) { | ||
if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL) | ||
kernel_function.push_back(&f); | ||
} | ||
for (auto &f : kernel_function) { | ||
llvm::FunctionType *func_type = f->getFunctionType(); | ||
std::vector<llvm::Type *> new_func_params; | ||
for (auto &arg : f->args()) { | ||
if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) { | ||
auto new_type = llvm::PointerType::get( | ||
arg.getType()->getPointerElementType(), unsigned(1)); | ||
new_func_params.push_back(new_type); | ||
} else { | ||
new_func_params.push_back(arg.getType()); | ||
} | ||
} | ||
auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(), | ||
new_func_params, false); | ||
auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(), | ||
f->getAddressSpace()); | ||
new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); | ||
new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024"); | ||
new_func->addFnAttr( | ||
"target-cpu", | ||
"gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3, 4)); | ||
new_func->setComdat(f->getComdat()); | ||
f->getParent()->getFunctionList().insert(f->getIterator(), new_func); | ||
new_func->takeName(f); | ||
new_func->getBasicBlockList().splice(new_func->begin(), | ||
f->getBasicBlockList()); | ||
for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(), | ||
I2 = new_func->arg_begin(); | ||
I != E; ++I, ++I2) { | ||
if (I->getType()->getTypeID() == llvm::Type::PointerTyID) { | ||
auto &front_bb = new_func->getBasicBlockList().front(); | ||
llvm::Instruction *addrspacecast = | ||
new AddrSpaceCastInst(I2, I->getType()); | ||
front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(), | ||
addrspacecast); | ||
I->replaceAllUsesWith(addrspacecast); | ||
I2->takeName(&*I); | ||
} else { | ||
I->replaceAllUsesWith(&*I2); | ||
I2->takeName(&*I); | ||
} | ||
} | ||
|
||
f->eraseFromParent(); | ||
} | ||
return false; | ||
} | ||
}; | ||
|
||
char AMDGPUConvertAllocaInstAddressSpacePass::ID = 0; | ||
char AMDGPUConvertFuncParamAddressSpacePass::ID = 0; | ||
#endif | ||
|
||
} // namespace lang | ||
} // namespace taichi |
Oops, something went wrong.