Patch summary (free-text preamble; everything before the first "diff --git" header is commentary).
Adds the amdgpu_runtime library (taichi/runtime/amdgpu/jit_amdgpu.cpp and jit_amdgpu.h) defining JITModuleAMDGPU and JITSessionAMDGPU, links it into the core library and amdgpu_codegen, switches the TI_WITH_AMDGPU branch of llvm_runtime from ${llvm_ptx_libs} to ${llvm_amdgpu_libs}, and adds the AMDGPU.ConvertProgramAndLaunch test.
JITSessionAMDGPU::compile_module_to_hsaco verifies the module, runs the legacy function/module pass managers at OptLevel 3, writes the emitted object file into the session temp directory, invokes "ld.lld -shared" through std::system, and returns the bytes read back by load_hsaco.
Fixes relative to the original patch: (1) the print_kernel_llvm_ir_optimized dump is now described as "optimized LLVM IR (AMDGPU)" instead of the copy-pasted "unoptimized" text; (2) the JITSessionAMDGPU constructor ignores an empty TI_TMP_DIR (keeping the default /tmp/taichi_hsaco/) instead of indexing tmp_dir_[size() - 1] on an empty string.
NOTE(review): template argument lists and the targets of several #include lines appear to be missing (e.g. "std::unique_ptr M", bare "#include") - presumably lost when this diff was extracted; they are left untouched here and must be confirmed against the real commit. Index blob ids were not recomputed for the two edited files.
diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 7f11eb724a68a..6b94c0dbb9e51 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -242,8 +242,10 @@ if(TI_WITH_LLVM) llvm_map_components_to_libnames(llvm_amdgpu_libs AMDGPU) add_subdirectory(taichi/rhi/amdgpu) add_subdirectory(taichi/codegen/amdgpu) + add_subdirectory(taichi/runtime/amdgpu) target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE amdgpu_codegen) + target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE amdgpu_runtime) target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE amdgpu_rhi) endif() diff --git a/taichi/codegen/amdgpu/CMakeLists.txt b/taichi/codegen/amdgpu/CMakeLists.txt index fa30cd7f9c736..9c5a04d589c2b 100644 --- a/taichi/codegen/amdgpu/CMakeLists.txt +++ b/taichi/codegen/amdgpu/CMakeLists.txt @@ -14,4 +14,4 @@ target_include_directories(amdgpu_codegen ) target_link_libraries(amdgpu_codegen PRIVATE taichi_util) -# target_link_libraries(amdgpu_codegen PRIVATE amdgpu_runtime) +target_link_libraries(amdgpu_codegen PRIVATE amdgpu_runtime) diff --git a/taichi/runtime/amdgpu/CMakeLists.txt b/taichi/runtime/amdgpu/CMakeLists.txt new file mode 100644 index 0000000000000..0edafc7575fc9 --- /dev/null +++ b/taichi/runtime/amdgpu/CMakeLists.txt @@ -0,0 +1,17 @@ +# ./taichi/runtime/amdgpu/CMakeLists.txt + +add_library(amdgpu_runtime) +target_sources(amdgpu_runtime + PRIVATE + jit_amdgpu.cpp + ) + +target_include_directories(amdgpu_runtime + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/external/eigen + ${PROJECT_SOURCE_DIR}/external/spdlog/include + ${LLVM_INCLUDE_DIRS} + ) + +target_link_libraries(amdgpu_runtime PRIVATE amdgpu_rhi) diff --git a/taichi/runtime/amdgpu/jit_amdgpu.cpp b/taichi/runtime/amdgpu/jit_amdgpu.cpp new file mode 100644 index 0000000000000..81ce9cd0df863 --- /dev/null +++ b/taichi/runtime/amdgpu/jit_amdgpu.cpp @@ -0,0 +1,127 @@ +#include "taichi/runtime/amdgpu/jit_amdgpu.h" +#include "taichi/runtime/llvm/llvm_context.h" + +namespace taichi { +namespace 
lang { + +#if defined(TI_WITH_AMDGPU) +JITModule *JITSessionAMDGPU ::add_module(std::unique_ptr M, + int max_reg) { + auto hsaco = compile_module_to_hsaco(M); + TI_TRACE("hsaco size: {:.2f}KB", hsaco.size() / 1024.0); + + void *amdgpu_module; + auto t = Time::get_time(); + AMDGPUDriver::get_instance().module_load_data(&amdgpu_module, hsaco.c_str()); + TI_TRACE("AMDGPU load data from module time : {}ms", + (Time::get_time() - t) * 1000); + modules.push_back(std::make_unique(amdgpu_module)); + return modules.back().get(); +} + +std::string JITSessionAMDGPU::compile_module_to_hsaco( + std::unique_ptr &llvm_module) { + if (llvm::verifyModule(*llvm_module, &llvm::errs())) { + llvm_module->print(llvm::errs(), nullptr); + TI_WARN("Module broken"); + } + using namespace llvm; + + if (this->config_->print_kernel_llvm_ir) { + static FileSequenceWriter writer("taichi_kernel_amdgpu_llvm_ir_{:04d}.ll", + "unoptimized LLVM IR (AMDGPU)"); + writer.write(llvm_module.get()); + } + auto triple_str = llvm_module->getTargetTriple(); + std::string error_str; + auto target = llvm::TargetRegistry::lookupTarget(triple_str, error_str); + llvm::TargetOptions options; + std::unique_ptr machine(target->createTargetMachine( + triple_str, AMDGPUContext::get_instance().get_mcpu(), "", options, + llvm::Reloc::PIC_, llvm::CodeModel::Small, llvm::CodeGenOpt::Aggressive)); + + llvm_module->setDataLayout(machine->createDataLayout()); + + llvm::legacy::FunctionPassManager function_pass_manager(llvm_module.get()); + llvm::legacy::PassManager module_pass_manager; + + module_pass_manager.add(llvm::createTargetTransformInfoWrapperPass( + machine->getTargetIRAnalysis())); + function_pass_manager.add(llvm::createTargetTransformInfoWrapperPass( + machine->getTargetIRAnalysis())); + + llvm::PassManagerBuilder builder; + builder.OptLevel = 3; + builder.Inliner = + llvm::createFunctionInliningPass(builder.OptLevel, 0, false); + machine->adjustPassManager(builder); + 
builder.populateFunctionPassManager(function_pass_manager); + builder.populateModulePassManager(module_pass_manager); + + machine->Options.MCOptions.AsmVerbose = true; + + auto tmp_dir = get_tmp_dir(); + uint64 random_num = get_random_num(); + + auto obj_filename = "taichi_amdgcn_" + std::to_string(random_num) + ".o"; + auto hsaco_filename = + "taichi_amdgcn_" + std::to_string(random_num) + ".hsaco"; + auto obj_path = tmp_dir + obj_filename; + auto hsaco_path = tmp_dir + hsaco_filename; + std::error_code ec; + + llvm::SmallString<0> outstr; + llvm::raw_svector_ostream llvm_stream(outstr); + + machine->addPassesToEmitFile(module_pass_manager, llvm_stream, nullptr, + llvm::CGFT_ObjectFile, true); + function_pass_manager.doInitialization(); + for (auto func = llvm_module->begin(); func != llvm_module->end(); ++func) + function_pass_manager.run(*func); + function_pass_manager.doFinalization(); + module_pass_manager.run(*llvm_module); + + std::string obj_str(outstr.begin(), outstr.end()); + std::ofstream(obj_path) << obj_str; + + TI_TRACE("Loading module..."); + [[maybe_unused]] auto _ = AMDGPUContext::get_instance().get_lock_guard(); + + std::string lld_cmd = "ld.lld -shared " + obj_path + " -o " + hsaco_path; + if (std::system(lld_cmd.c_str())) + TI_ERROR(fmt::format("Generate {} Error", hsaco_filename)); + + std::string hsaco_str = load_hsaco(hsaco_path); + + if (this->config_->print_kernel_llvm_ir_optimized) { + static FileSequenceWriter writer( + "taichi_kernel_amdgpu_llvm_ir_optimized_{:04d}.ll", + "optimized LLVM IR (AMDGPU)"); + writer.write(llvm_module.get()); + } + return hsaco_str; +} + +std::unique_ptr create_llvm_jit_session_amdgpu( + TaichiLLVMContext *tlctx, + CompileConfig *config, + Arch arch) { + TI_ASSERT(arch == Arch::amdgpu); + auto data_layout = llvm::DataLayout( + "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-" + "v16:16-v24:32-" + "v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-" + 
"n32:64-S32-A5-G1-ni:7"); + return std::make_unique(tlctx, config, data_layout); +} +#else +std::unique_ptr create_llvm_jit_session_amdgpu( + TaichiLLVMContext *tlctx, + CompileConfig *config, + Arch arch) { + TI_NOT_IMPLEMENTED +} +#endif + +} // namespace lang +} // namespace taichi diff --git a/taichi/runtime/amdgpu/jit_amdgpu.h b/taichi/runtime/amdgpu/jit_amdgpu.h new file mode 100644 index 0000000000000..740048b01482c --- /dev/null +++ b/taichi/runtime/amdgpu/jit_amdgpu.h @@ -0,0 +1,152 @@ +#include +#include +#include +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/DynamicLibrary.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Transforms/InstCombine/InstCombine.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" + +#include "taichi/rhi/amdgpu/amdgpu_context.h" +#include "taichi/rhi/amdgpu/amdgpu_driver.h" +#include "taichi/jit/jit_session.h" +#include "taichi/util/lang_util.h" +#include "taichi/program/program.h" +#include "taichi/system/timer.h" +#include "taichi/util/file_sequence_writer.h" +#include "taichi/util/io.h" + +#define TI_RUNTIME_HOST +#include "taichi/program/context.h" +#undef TI_RUNTIME_HOST + +namespace taichi { +namespace lang { + +#if defined(TI_WITH_AMDGPU) + +class JITModuleAMDGPU : public JITModule { + private: + void *module_; + + public: + explicit JITModuleAMDGPU(void *module) : module_(module) { + } + + void *lookup_function(const std::string &name) override { + 
AMDGPUContext::get_instance().make_current(); + void *func = nullptr; + auto t = Time::get_time(); + auto err = + AMDGPUDriver::get_instance().module_get_function.call_with_warning( + &func, module_, name.c_str()); + if (err) { + TI_ERROR("Cannot look up function {}", name); + } + t = Time::get_time() - t; + TI_TRACE("AMDGPU module_get_function {} costs {} ms", name, t * 1000); + TI_ASSERT(func != nullptr); + return func; + } + + void call(const std::string &name, + const std::vector &arg_pointers, + const std::vector &arg_sizes) override { + launch(name, 1, 1, 0, arg_pointers, arg_sizes); + } + + void launch(const std::string &name, + std::size_t grid_dim, + std::size_t block_dim, + std::size_t dynamic_shared_mem_bytes, + const std::vector &arg_pointers, + const std::vector &arg_sizes) override { + auto func = lookup_function(name); + AMDGPUContext::get_instance().launch(func, name, arg_pointers, arg_sizes, + grid_dim, block_dim, + dynamic_shared_mem_bytes); + } + + bool direct_dispatch() const override { + return false; + } +}; + +class JITSessionAMDGPU : public JITSession { + public: + llvm::DataLayout data_layout; + + JITSessionAMDGPU(TaichiLLVMContext *tlctx, + CompileConfig *config, + llvm::DataLayout data_layout) + : JITSession(tlctx, config), data_layout(data_layout) { + random_num_ = get_random_num(); + char *env_dir = std::getenv("TI_TMP_DIR"); + tmp_dir_ = "/tmp/taichi_hsaco/"; + if (env_dir && env_dir[0] != '\0') { + tmp_dir_ = env_dir; + if (tmp_dir_.back() != '/') { + tmp_dir_ += '/'; + } + } + tmp_dir_ += std::to_string(random_num_) + "/"; + create_directories(tmp_dir_); + } + + JITModule *add_module(std::unique_ptr M, int max_reg) override; + + llvm::DataLayout get_data_layout() override { + return data_layout; + } + + std::string load_hsaco(const std::string &filename) { + std::ifstream src_file(filename); + if (!src_file.is_open()) { + TI_ERROR(fmt::format("Open {} Error", filename)); + } + return std::string(std::istreambuf_iterator(src_file), + 
(std::istreambuf_iterator())); + } + + uint64 get_random_num() { + // Note: ROCm is available only on Linux OS. + static std::random_device device("/dev/urandom"); + static std::mt19937_64 *rng = new std::mt19937_64(device()); + return (*rng)(); + } + + std::string get_tmp_dir() { + return tmp_dir_; + } + + private: + std::string compile_module_to_hsaco(std::unique_ptr &module); + uint64_t random_num_; + std::string tmp_dir_; +}; + +#endif + +std::unique_ptr create_llvm_jit_session_amdgpu( + TaichiLLVMContext *tlctx, + CompileConfig *config, + Arch arch); + +} // namespace lang +} // namespace taichi diff --git a/taichi/runtime/llvm/CMakeLists.txt b/taichi/runtime/llvm/CMakeLists.txt index 2c39493415ea2..ce72f3795063d 100644 --- a/taichi/runtime/llvm/CMakeLists.txt +++ b/taichi/runtime/llvm/CMakeLists.txt @@ -34,7 +34,7 @@ if (TI_WITH_CUDA) endif() if (TI_WITH_AMDGPU) - target_link_libraries(llvm_runtime PRIVATE ${llvm_ptx_libs}) + target_link_libraries(llvm_runtime PRIVATE ${llvm_amdgpu_libs}) target_link_libraries(llvm_runtime PRIVATE amdgpu_rhi) endif() diff --git a/tests/cpp/backends/amdgpu_device_test.cpp b/tests/cpp/backends/amdgpu_device_test.cpp index f5c629ae6968a..a5d3683c2d403 100644 --- a/tests/cpp/backends/amdgpu_device_test.cpp +++ b/tests/cpp/backends/amdgpu_device_test.cpp @@ -5,6 +5,8 @@ #include "taichi/rhi/amdgpu/amdgpu_driver.h" #include "taichi/rhi/amdgpu/amdgpu_context.h" #include "taichi/rhi/amdgpu/amdgpu_device.h" +#include "taichi/runtime/amdgpu/jit_amdgpu.h" +#include "taichi/runtime/llvm/llvm_context.h" #include "taichi/runtime/llvm/llvm_context_pass.h" #include @@ -15,6 +17,10 @@ #include #include #include +#include +#include +#include +#include #include "tests/cpp/program/test_program.h" @@ -187,6 +193,80 @@ TEST(AMDGPU, ConvertFuncParamAddressSpacePass) { } } +TEST(AMDGPU, ConvertProgramAndLaunch) { + std::string program = + "target datalayout = " + "\"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:" + 
"64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:" + "1024-v2048:2048-n32:64-S32-A5-G1-ni:7\"\n" + "target triple = \"amdgcn-amd-amdhsa\"\n" + "define amdgpu_kernel void @runtime_add(double addrspace(1)* %0, double " + "addrspace(1)* %1, double addrspace(1)* %2) #0 {\n" + " %4 = alloca double*, align 8, addrspace(5)\n" + " %5 = addrspacecast double addrspace(1)* %2 to double*\n" + " %6 = addrspacecast double addrspace(1)* %1 to double*\n" + " %7 = addrspacecast double addrspace(1)* %0 to double*\n" + " %8 = addrspacecast double* addrspace(5)* %4 to double**\n" + " %9 = alloca double*, align 8, addrspace(5)\n" + " %10 = addrspacecast double* addrspace(5)* %9 to double**\n" + " %11 = alloca double*, align 8, addrspace(5)\n" + " %12 = addrspacecast double* addrspace(5)* %11 to double**\n" + " store double* %7, double** %8, align 8\n" + " store double* %6, double** %10, align 8\n" + " store double* %5, double** %12, align 8\n" + " %13 = load double*, double** %8, align 8\n" + " %14 = load double, double* %13, align 8\n" + " %15 = load double*, double** %10, align 8\n" + " %16 = load double, double* %15, align 8\n" + " %17 = fadd contract double %14, %16\n" + " %18 = load double*, double** %12, align 8\n" + " store double %17, double* %18, align 8\n" + " ret void\n" + "}\n"; + llvm::LLVMContext llvm_context; + llvm::SMDiagnostic diagnostic_err; + std::unique_ptr llvm_module = llvm::parseIR( + llvm::MemoryBuffer::getMemBuffer(program)->getMemBufferRef(), + diagnostic_err, llvm_context); + + // auto amdgpu_session = new JITSessionAMDGPU(new TaichiLLVMContext(new + // CompileConfig, Arch::amdgpu), new CompileConfig(), llvm::DataLayout("")); + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetMC(); + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUAsmPrinter(); + LLVMInitializeAMDGPUAsmParser(); + auto amdgpu_session = + new JITSessionAMDGPU(nullptr, new CompileConfig(), llvm::DataLayout("")); + auto amdgpu_module = 
amdgpu_session->add_module(std::move(llvm_module), 0); + std::vector arg_pointers; + std::vector arg_sizes; + double *args[3]; + size_t arg_size = sizeof(double); + AMDGPUDriver::get_instance().malloc((void **)&(args[0]), sizeof(double) * 3); + args[1] = args[0] + 1; + args[2] = args[0] + 2; + double a = 10.0; + double b = 7.0; + double ret; + AMDGPUDriver::get_instance().memcpy_host_to_device(args[0], &a, + sizeof(double)); + AMDGPUDriver::get_instance().memcpy_host_to_device(args[1], &b, + sizeof(double)); + arg_pointers.push_back((void *)&args[0]); + arg_pointers.push_back((void *)&args[1]); + arg_pointers.push_back((void *)&args[2]); + arg_sizes.push_back(arg_size); + arg_sizes.push_back(arg_size); + arg_sizes.push_back(arg_size); + amdgpu_module->call("runtime_add", arg_pointers, arg_sizes); + AMDGPUDriver::get_instance().stream_synchronize(nullptr); + AMDGPUDriver::get_instance().memcpy_device_to_host(&ret, args[2], + sizeof(double)); + EXPECT_EQ(ret, 17); + AMDGPUDriver::get_instance().mem_free(args[0]); +} + } // namespace lang } // namespace taichi #endif