diff --git a/external/assets b/external/assets index 2905391325512..150b16ad12ad5 160000 --- a/external/assets +++ b/external/assets @@ -1 +1 @@ -Subproject commit 2905391325512f58adb3f8684bafc06ef29f8e47 +Subproject commit 150b16ad12ad58a9a93b8988ded913e632a4df4f diff --git a/taichi/runtime/llvm/runtime_module/runtime.cpp b/taichi/runtime/llvm/runtime_module/runtime.cpp index 60c0046c6befa..96b7749589d0a 100644 --- a/taichi/runtime/llvm/runtime_module/runtime.cpp +++ b/taichi/runtime/llvm/runtime_module/runtime.cpp @@ -108,7 +108,7 @@ using Ptr = uint8 *; using RuntimeContextArgType = long long; -#if ARCH_cuda +#if ARCH_cuda || ARCH_amdgpu extern "C" { void __assertfail(const char *message, @@ -783,7 +783,9 @@ void taichi_assert_format(LLVMRuntime *runtime, #if ARCH_cuda // Kill this CUDA thread. asm("exit;"); +#elif ARCH_amdgpu + asm("S_ENDPGM"); #else // TODO: properly kill this CPU thread here, considering the containing // ThreadPool structure. @@ -1253,7 +1255,7 @@ void element_listgen_root(LLVMRuntime *runtime, auto parent_lookup_element = parent->lookup_element; auto child_get_num_elements = child->get_num_elements; auto child_from_parent_element = child->from_parent_element; -#if ARCH_cuda +#if ARCH_cuda || ARCH_amdgpu // All blocks share the only root container, which has only one child // container. // Each thread processes a subset of the child container for more parallelism. 
@@ -1303,7 +1305,7 @@ void element_listgen_nonroot(LLVMRuntime *runtime, auto parent_lookup_element = parent->lookup_element; auto child_get_num_elements = child->get_num_elements; auto child_from_parent_element = child->from_parent_element; -#if ARCH_cuda +#if ARCH_cuda || ARCH_amdgpu // Each block processes a slice of a parent container int i_start = block_idx(); int i_step = grid_dim(); @@ -1391,7 +1393,7 @@ void parallel_struct_for(RuntimeContext *context, int num_threads) { auto list = (context->runtime)->element_lists[snode_id]; auto list_tail = list->size(); -#if ARCH_cuda +#if ARCH_cuda || ARCH_amdgpu int i = block_idx(); // Note: CUDA requires compile-time constant local array sizes. // We use "1" here and modify it during codegen to tls_buffer_size. @@ -1599,7 +1601,7 @@ void gpu_parallel_mesh_for(RuntimeContext *context, } i32 linear_thread_idx(RuntimeContext *context) { -#if ARCH_cuda +#if ARCH_cuda || ARCH_amdgpu return block_idx() * block_dim() + thread_idx(); #else return context->cpu_thread_id; @@ -1831,6 +1833,8 @@ void taichi_printf(LLVMRuntime *runtime, const char *format, Args &&...args) { printf_helper helper; helper.push_back(std::forward<Args>(args)...); cuda_vprintf((Ptr)format, helper.ptr()); +#elif ARCH_amdgpu +// TODO: add printf for amdgpu backend #else runtime->host_printf(format, args...); #endif