Skip to content

Commit

Permalink
[amdgpu] Part2 add runtime (#6482)
Browse files Browse the repository at this point in the history
Issue: #6434

### Brief Summary

1. This is the third part of adding the amdgpu backend: the runtime part
of the implementation. The main job of the runtime code is to compile
LLVM IR into GCN ISA/object code and then into hsaco (a file format that
is accepted by the module launch API provided by HIP).

2. After calling the relevant api to generate the gcn isa/obj, the
linker of llvm (ld.lld) needs to be called to generate the hsaco file
format, so there is a command line call `ld.lld -shared xxx.o -o
xxx.hsaco` in the code, and the temporarily generated file is stored in
the `/tmp/taichi_hsaco/` folder

3. To deal with the problem of multiple `hsaco` files being generated at
the same time, a random number is used to name the related generated
files, as follows: in `JITSessionAMDGPU` there is a `random_num_` and
`tmp_dir_` which are assigned when the `JITSessionAMDGPU` instance is
created. Each `ti.kernel` will be divided into offload-tasks, each of which is
compiled into a separate `hsaco` file. A random number bound to the
`hsaco` file is obtained when the `hsaco` file is generated. Here is an
example of the file after running the `ti example mpm128`:
```
taichi_hsaco/
└── 4858208420434830779
    ├── taichi_amdgcn_10476395765980093855.hsaco
    ├── taichi_amdgcn_10476395765980093855.o
    ├── taichi_amdgcn_11369096326162657620.hsaco
    ├── taichi_amdgcn_11369096326162657620.o
    ├── taichi_amdgcn_11700031850871498261.hsaco
    ├── taichi_amdgcn_11700031850871498261.o
    ├── taichi_amdgcn_14803499569653867868.hsaco
    ├── taichi_amdgcn_14803499569653867868.o
    ├── taichi_amdgcn_14949458395707884954.hsaco
    ├── taichi_amdgcn_14949458395707884954.o
    ├── taichi_amdgcn_15955762247261446379.hsaco
    ├── taichi_amdgcn_15955762247261446379.o
    ├── taichi_amdgcn_16891452471041191610.hsaco
    ├── taichi_amdgcn_16891452471041191610.o
    ├── taichi_amdgcn_17615766226135707772.hsaco
    ├── taichi_amdgcn_17615766226135707772.o
    ├── taichi_amdgcn_18033844193337069056.hsaco
    ├── taichi_amdgcn_18033844193337069056.o
    ├── taichi_amdgcn_5951151729973841331.hsaco
    ├── taichi_amdgcn_5951151729973841331.o
    ├── taichi_amdgcn_6012043323411824926.hsaco
    ├── taichi_amdgcn_6012043323411824926.o
    ├── taichi_amdgcn_6796840558965541322.hsaco
    ├── taichi_amdgcn_6796840558965541322.o
    ├── taichi_amdgcn_6835984424286808860.hsaco
    ├── taichi_amdgcn_6835984424286808860.o
    ├── taichi_amdgcn_7872622170129629907.hsaco
    ├── taichi_amdgcn_7872622170129629907.o
    ├── taichi_amdgcn_8760441738982760858.hsaco
    ├── taichi_amdgcn_8760441738982760858.o
    ├── taichi_amdgcn_9006625347419529255.hsaco
    └── taichi_amdgcn_9006625347419529255.o
```

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
galeselee and pre-commit-ci[bot] authored Jan 11, 2023
1 parent 35a0e5b commit e1165db
Show file tree
Hide file tree
Showing 7 changed files with 380 additions and 2 deletions.
2 changes: 2 additions & 0 deletions cmake/TaichiCore.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -242,8 +242,10 @@ if(TI_WITH_LLVM)
llvm_map_components_to_libnames(llvm_amdgpu_libs AMDGPU)
add_subdirectory(taichi/rhi/amdgpu)
add_subdirectory(taichi/codegen/amdgpu)
add_subdirectory(taichi/runtime/amdgpu)

target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE amdgpu_codegen)
target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE amdgpu_runtime)
target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE amdgpu_rhi)
endif()

Expand Down
2 changes: 1 addition & 1 deletion taichi/codegen/amdgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ target_include_directories(amdgpu_codegen
)

target_link_libraries(amdgpu_codegen PRIVATE taichi_util)
# target_link_libraries(amdgpu_codegen PRIVATE amdgpu_runtime)
target_link_libraries(amdgpu_codegen PRIVATE amdgpu_runtime)
17 changes: 17 additions & 0 deletions taichi/runtime/amdgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# ./taichi/runtime/amdgpu/CMakeLists.txt
#
# Build rules for the `amdgpu_runtime` library, which is compiled from
# jit_amdgpu.cpp.

# No library type is given here, so CMake picks STATIC or SHARED from
# BUILD_SHARED_LIBS.
add_library(amdgpu_runtime)
target_sources(amdgpu_runtime
PRIVATE
jit_amdgpu.cpp
)

# Include paths are PRIVATE: they apply only when compiling this target and
# are not propagated to targets that link against it.
target_include_directories(amdgpu_runtime
PRIVATE
${PROJECT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/external/eigen
${PROJECT_SOURCE_DIR}/external/spdlog/include
${LLVM_INCLUDE_DIRS}
)

# jit_amdgpu.h includes headers from taichi/rhi/amdgpu, hence the link
# against the amdgpu_rhi target.
target_link_libraries(amdgpu_runtime PRIVATE amdgpu_rhi)
127 changes: 127 additions & 0 deletions taichi/runtime/amdgpu/jit_amdgpu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
#include "taichi/runtime/amdgpu/jit_amdgpu.h"
#include "taichi/runtime/llvm/llvm_context.h"

namespace taichi {
namespace lang {

#if defined(TI_WITH_AMDGPU)
/// Compiles `M` to an hsaco image, hands the image to the AMDGPU driver via
/// `module_load_data`, and registers the resulting module with this session.
///
/// @param M        LLVM module to compile; passed on to
///                 `compile_module_to_hsaco`.
/// @param max_reg  Not used by this implementation.
/// @return Non-owning pointer to the newly added module; the session keeps
///         ownership in `modules`.
JITModule *JITSessionAMDGPU::add_module(std::unique_ptr<llvm::Module> M,
                                        int max_reg) {
  const std::string code_object = compile_module_to_hsaco(M);
  TI_TRACE("hsaco size: {:.2f}KB", code_object.size() / 1024.0);

  // Time only the driver call that loads the image.
  void *loaded_module;
  const auto load_start = Time::get_time();
  AMDGPUDriver::get_instance().module_load_data(&loaded_module,
                                                code_object.c_str());
  const auto load_seconds = Time::get_time() - load_start;
  TI_TRACE("AMDGPU load data from module time : {}ms", load_seconds * 1000);

  auto wrapped = std::make_unique<JITModuleAMDGPU>(loaded_module);
  JITModule *const result = wrapped.get();
  modules.push_back(std::move(wrapped));
  return result;
}

/// Compiles `llvm_module` into an hsaco image and returns the raw bytes of
/// that image.
///
/// Pipeline:
///   1. Verify the module (a broken module is printed and only warned about).
///   2. Optionally dump the incoming IR (`print_kernel_llvm_ir`).
///   3. Build a TargetMachine for the module's own target triple and the CPU
///      name reported by `AMDGPUContext::get_mcpu()`.
///   4. Run the legacy O3 function and module pipelines and emit an object
///      file into memory.
///   5. Write the object to `<tmp_dir>/taichi_amdgcn_<random>.o`, link it with
///      `ld.lld -shared` into `<tmp_dir>/taichi_amdgcn_<random>.hsaco`, and
///      read that file back.
///   6. Optionally dump the optimized IR (`print_kernel_llvm_ir_optimized`).
///
/// @param llvm_module  Module to compile. It is optimized in place and its
///                     data layout is overwritten with the target machine's.
/// @return Contents of the generated hsaco file.
///
/// Errors are reported through `TI_ERROR` when the target cannot be resolved,
/// the object cannot be emitted or written, or the linker command fails.
std::string JITSessionAMDGPU::compile_module_to_hsaco(
    std::unique_ptr<llvm::Module> &llvm_module) {
  if (llvm::verifyModule(*llvm_module, &llvm::errs())) {
    llvm_module->print(llvm::errs(), nullptr);
    TI_WARN("Module broken");
  }
  using namespace llvm;

  if (this->config_->print_kernel_llvm_ir) {
    static FileSequenceWriter writer("taichi_kernel_amdgpu_llvm_ir_{:04d}.ll",
                                     "unoptimized LLVM IR (AMDGPU)");
    writer.write(llvm_module.get());
  }

  auto triple_str = llvm_module->getTargetTriple();
  std::string error_str;
  auto target = llvm::TargetRegistry::lookupTarget(triple_str, error_str);
  // lookupTarget returns nullptr (and fills error_str) when no registered
  // target matches the triple; dereferencing it would crash.
  if (target == nullptr) {
    TI_ERROR("Cannot find LLVM target for triple '{}': {}", triple_str,
             error_str);
  }
  llvm::TargetOptions options;
  std::unique_ptr<llvm::TargetMachine> machine(target->createTargetMachine(
      triple_str, AMDGPUContext::get_instance().get_mcpu(), "", options,
      llvm::Reloc::PIC_, llvm::CodeModel::Small, llvm::CodeGenOpt::Aggressive));
  if (machine == nullptr) {
    TI_ERROR("Cannot create LLVM target machine for triple '{}'", triple_str);
  }

  llvm_module->setDataLayout(machine->createDataLayout());

  llvm::legacy::FunctionPassManager function_pass_manager(llvm_module.get());
  llvm::legacy::PassManager module_pass_manager;

  // Both pass managers need the target's cost model.
  module_pass_manager.add(llvm::createTargetTransformInfoWrapperPass(
      machine->getTargetIRAnalysis()));
  function_pass_manager.add(llvm::createTargetTransformInfoWrapperPass(
      machine->getTargetIRAnalysis()));

  llvm::PassManagerBuilder builder;
  builder.OptLevel = 3;
  builder.Inliner =
      llvm::createFunctionInliningPass(builder.OptLevel, 0, false);
  machine->adjustPassManager(builder);
  builder.populateFunctionPassManager(function_pass_manager);
  builder.populateModulePassManager(module_pass_manager);

  machine->Options.MCOptions.AsmVerbose = true;

  // The .o and .hsaco files share one random number so they stay paired and
  // do not collide with files from other compilations in the same directory.
  auto tmp_dir = get_tmp_dir();
  uint64 random_num = get_random_num();

  auto obj_filename = "taichi_amdgcn_" + std::to_string(random_num) + ".o";
  auto hsaco_filename =
      "taichi_amdgcn_" + std::to_string(random_num) + ".hsaco";
  auto obj_path = tmp_dir + obj_filename;
  auto hsaco_path = tmp_dir + hsaco_filename;

  llvm::SmallString<0> outstr;
  llvm::raw_svector_ostream llvm_stream(outstr);

  // addPassesToEmitFile returns true when the target cannot emit the
  // requested file type.
  if (machine->addPassesToEmitFile(module_pass_manager, llvm_stream, nullptr,
                                   llvm::CGFT_ObjectFile, true)) {
    TI_ERROR("AMDGPU target machine cannot emit object files");
  }
  function_pass_manager.doInitialization();
  for (auto &func : *llvm_module) {
    function_pass_manager.run(func);
  }
  function_pass_manager.doFinalization();
  module_pass_manager.run(*llvm_module);

  // Write the object bytes verbatim (binary mode) and make sure the file is
  // flushed and closed before the linker is started.
  {
    std::ofstream obj_file(obj_path, std::ios::binary);
    obj_file.write(outstr.data(), static_cast<std::streamsize>(outstr.size()));
    obj_file.close();
    if (obj_file.fail()) {
      TI_ERROR("Write {} Error", obj_path);
    }
  }

  TI_TRACE("Loading module...");
  [[maybe_unused]] auto _ = AMDGPUContext::get_instance().get_lock_guard();

  // The paths are quoted because the temp directory can come from the
  // TI_TMP_DIR environment variable and may contain spaces.
  std::string lld_cmd =
      "ld.lld -shared \"" + obj_path + "\" -o \"" + hsaco_path + "\"";
  if (std::system(lld_cmd.c_str())) {
    TI_ERROR("Generate {} Error", hsaco_filename);
  }

  std::string hsaco_str = load_hsaco(hsaco_path);

  if (this->config_->print_kernel_llvm_ir_optimized) {
    static FileSequenceWriter writer(
        "taichi_kernel_amdgpu_llvm_ir_optimized_{:04d}.ll",
        "optimized LLVM IR (AMDGPU)");
    writer.write(llvm_module.get());
  }
  return hsaco_str;
}

/// Factory for the AMDGPU JIT session.
///
/// @param tlctx   Forwarded to the `JITSessionAMDGPU` constructor.
/// @param config  Forwarded to the `JITSessionAMDGPU` constructor.
/// @param arch    Must be `Arch::amdgpu`; checked with `TI_ASSERT`.
/// @return A new `JITSessionAMDGPU` using the data layout spelled out below.
std::unique_ptr<JITSession> create_llvm_jit_session_amdgpu(
    TaichiLLVMContext *tlctx,
    CompileConfig *config,
    Arch arch) {
  TI_ASSERT(arch == Arch::amdgpu);

  // Data layout specification handed to the session.
  const char *const layout_spec =
      "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-"
      "v16:16-v24:32-"
      "v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-"
      "n32:64-S32-A5-G1-ni:7";
  llvm::DataLayout session_layout(layout_spec);

  return std::make_unique<JITSessionAMDGPU>(tlctx, config, session_layout);
}
#else
// Fallback used when Taichi is built without TI_WITH_AMDGPU: there is no
// AMDGPU session to create, so the body is just TI_NOT_IMPLEMENTED and all
// three parameters are ignored.
// NOTE(review): no value is returned here, so this relies on
// TI_NOT_IMPLEMENTED never returning — confirm against the macro definition.
std::unique_ptr<JITSession> create_llvm_jit_session_amdgpu(
TaichiLLVMContext *tlctx,
CompileConfig *config,
Arch arch) {
TI_NOT_IMPLEMENTED
}
#endif

} // namespace lang
} // namespace taichi
152 changes: 152 additions & 0 deletions taichi/runtime/amdgpu/jit_amdgpu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#include <memory>
#include <utility>
#include <mutex>
#include <random>
#include <unistd.h>

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"

#include "taichi/rhi/amdgpu/amdgpu_context.h"
#include "taichi/rhi/amdgpu/amdgpu_driver.h"
#include "taichi/jit/jit_session.h"
#include "taichi/util/lang_util.h"
#include "taichi/program/program.h"
#include "taichi/system/timer.h"
#include "taichi/util/file_sequence_writer.h"
#include "taichi/util/io.h"

#define TI_RUNTIME_HOST
#include "taichi/program/context.h"
#undef TI_RUNTIME_HOST

namespace taichi {
namespace lang {

#if defined(TI_WITH_AMDGPU)

/// JITModule backed by a module handle obtained from the AMDGPU driver.
/// Functions are resolved by name through the driver and launched through
/// `AMDGPUContext`.
class JITModuleAMDGPU : public JITModule {
 public:
  /// Wraps the given driver module handle; the handle is only stored.
  explicit JITModuleAMDGPU(void *module) : module_(module) {
  }

  /// Resolves `name` inside the wrapped module.
  ///
  /// Makes the AMDGPU context current, asks the driver for the function,
  /// raises `TI_ERROR` if the driver reports an error, and asserts that the
  /// returned handle is non-null.
  ///
  /// @return Driver handle of the function.
  void *lookup_function(const std::string &name) override {
    AMDGPUContext::get_instance().make_current();

    void *kernel = nullptr;
    const auto started = Time::get_time();
    auto status =
        AMDGPUDriver::get_instance().module_get_function.call_with_warning(
            &kernel, module_, name.c_str());
    if (status) {
      TI_ERROR("Cannot look up function {}", name);
    }
    const auto elapsed = Time::get_time() - started;
    TI_TRACE("AMDGPU module_get_function {} costs {} ms", name,
             elapsed * 1000);

    TI_ASSERT(kernel != nullptr);
    return kernel;
  }

  /// Runs `name` with grid size 1, block size 1 and no dynamic shared memory.
  void call(const std::string &name,
            const std::vector<void *> &arg_pointers,
            const std::vector<int> &arg_sizes) override {
    constexpr std::size_t single = 1;
    constexpr std::size_t no_shared_mem = 0;
    launch(name, single, single, no_shared_mem, arg_pointers, arg_sizes);
  }

  /// Looks up `name` and launches it through `AMDGPUContext::launch` with the
  /// given grid/block dimensions and dynamic shared memory size.
  void launch(const std::string &name,
              std::size_t grid_dim,
              std::size_t block_dim,
              std::size_t dynamic_shared_mem_bytes,
              const std::vector<void *> &arg_pointers,
              const std::vector<int> &arg_sizes) override {
    void *const kernel = lookup_function(name);
    AMDGPUContext::get_instance().launch(kernel, name, arg_pointers, arg_sizes,
                                         grid_dim, block_dim,
                                         dynamic_shared_mem_bytes);
  }

  /// Always false for this module type.
  bool direct_dispatch() const override {
    return false;
  }

 private:
  void *module_;  // Driver module handle supplied at construction.
};

class JITSessionAMDGPU : public JITSession {
public:
llvm::DataLayout data_layout;

JITSessionAMDGPU(TaichiLLVMContext *tlctx,
CompileConfig *config,
llvm::DataLayout data_layout)
: JITSession(tlctx, config), data_layout(data_layout) {
random_num_ = get_random_num();
char *env_dir = std::getenv("TI_TMP_DIR");
tmp_dir_ = "/tmp/taichi_hsaco/";
if (env_dir) {
tmp_dir_ = env_dir;
if (tmp_dir_[tmp_dir_.size() - 1] != '/') {
tmp_dir_ += '/';
}
}
tmp_dir_ += std::to_string(random_num_) + "/";
create_directories(tmp_dir_);
}

JITModule *add_module(std::unique_ptr<llvm::Module> M, int max_reg) override;

llvm::DataLayout get_data_layout() override {
return data_layout;
}

std::string load_hsaco(const std::string &filename) {
std::ifstream src_file(filename);
if (!src_file.is_open()) {
TI_ERROR(fmt::format("Open {} Error", filename));
}
return std::string(std::istreambuf_iterator<char>(src_file),
(std::istreambuf_iterator<char>()));
}

uint64 get_random_num() {
// Note: ROCm is available only on Linux OS.
static std::random_device device("/dev/urandom");
static std::mt19937_64 *rng = new std::mt19937_64(device());
return (*rng)();
}

std::string get_tmp_dir() {
return tmp_dir_;
}

private:
std::string compile_module_to_hsaco(std::unique_ptr<llvm::Module> &module);
uint64_t random_num_;
std::string tmp_dir_;
};

#endif

// Creates the JIT session for the AMDGPU backend.
// Declared unconditionally; the definition in jit_amdgpu.cpp depends on the
// build: with TI_WITH_AMDGPU it asserts `arch == Arch::amdgpu` and returns a
// JITSessionAMDGPU, otherwise its body is TI_NOT_IMPLEMENTED.
std::unique_ptr<JITSession> create_llvm_jit_session_amdgpu(
TaichiLLVMContext *tlctx,
CompileConfig *config,
Arch arch);

} // namespace lang
} // namespace taichi
2 changes: 1 addition & 1 deletion taichi/runtime/llvm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ if (TI_WITH_CUDA)
endif()

if (TI_WITH_AMDGPU)
target_link_libraries(llvm_runtime PRIVATE ${llvm_ptx_libs})
target_link_libraries(llvm_runtime PRIVATE ${llvm_amdgpu_libs})
target_link_libraries(llvm_runtime PRIVATE amdgpu_rhi)
endif()

Expand Down
Loading

0 comments on commit e1165db

Please sign in to comment.