diff --git a/python/taichi/lang/simt/block.py b/python/taichi/lang/simt/block.py
index 5190d56e07a83..6450f98ba675c 100644
--- a/python/taichi/lang/simt/block.py
+++ b/python/taichi/lang/simt/block.py
@@ -10,7 +10,7 @@ def arch_uses_spv(arch):
 
 def sync():
     arch = impl.get_runtime().prog.config().arch
-    if arch == _ti_core.cuda:
+    if arch == _ti_core.cuda or arch == _ti_core.amdgpu:
         return impl.call_internal("block_barrier", with_runtime_context=False)
     if arch_uses_spv(arch):
         return impl.call_internal("workgroupBarrier",
@@ -38,7 +38,7 @@ def thread_idx():
 
 def global_thread_idx():
     arch = impl.get_runtime().prog.config().arch
-    if arch == _ti_core.cuda:
+    if arch == _ti_core.cuda or arch == _ti_core.amdgpu:
         return impl.get_runtime().compiling_callable.ast_builder(
         ).insert_thread_idx_expr()
     if arch_uses_spv(arch):
diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index 97ffa5ab63900..1a4f20d7451a5 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -539,6 +539,8 @@ std::unique_ptr TaichiLLVMContext::module_from_file(
     function_pass_manager.doFinalization();
     patch_intrinsic("thread_idx", llvm::Intrinsic::amdgcn_workitem_id_x);
     patch_intrinsic("block_idx", llvm::Intrinsic::amdgcn_workgroup_id_x);
+    patch_intrinsic("block_barrier", llvm::Intrinsic::amdgcn_s_barrier,
+                    false);
     link_module_with_amdgpu_libdevice(module);
 
     patch_amdgpu_kernel_dim(
diff --git a/tests/python/test_shared_array.py b/tests/python/test_shared_array.py
index c9a9d706f36d6..5d9a285610acb 100644
--- a/tests/python/test_shared_array.py
+++ b/tests/python/test_shared_array.py
@@ -4,7 +4,7 @@
 from tests import test_utils
 
 
-@test_utils.test(arch=[ti.cuda, ti.vulkan])
+@test_utils.test(arch=[ti.cuda, ti.vulkan, ti.amdgpu])
 def test_shared_array_nested_loop():
     block_dim = 128
     nBlocks = 64