diff --git a/.github/workflows/scripts/unix_test.sh b/.github/workflows/scripts/unix_test.sh
index 608ebdd104fbd..e9a74eed45f4d 100755
--- a/.github/workflows/scripts/unix_test.sh
+++ b/.github/workflows/scripts/unix_test.sh
@@ -123,7 +123,7 @@ if [ -z "$GPU_TEST" ]; then
     fi
 elif [ ! -z "$AMDGPU_TEST" ]; then
     run-it cpu $(nproc)
-    # run-it amdgpu 4
+    run-it amdgpu 8
 else
     run-it cuda 8
     run-it cpu $(nproc)
diff --git a/taichi/codegen/amdgpu/codegen_amdgpu.cpp b/taichi/codegen/amdgpu/codegen_amdgpu.cpp
index 6d0e8b4c465c8..d7f4631a9226c 100644
--- a/taichi/codegen/amdgpu/codegen_amdgpu.cpp
+++ b/taichi/codegen/amdgpu/codegen_amdgpu.cpp
@@ -84,7 +84,8 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
       } else {
         TI_NOT_IMPLEMENTED
       }
-    } else if (op == UnaryOpType::sgn) {
+    }  // TODO: simplify the implementation of sgn
+    else if (op == UnaryOpType::sgn) {
       if (input_taichi_type->is_primitive(PrimitiveTypeID::i32)) {
         auto ashr = builder->CreateAShr(input, 31);
         auto sub = builder->CreateSub(0, input);
@@ -141,6 +142,57 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         builder->SetInsertPoint(bb_merge);
         llvm_val[stmt] =
             builder->CreateLoad(llvm::Type::getFloatTy(*llvm_context), cast);
+      } else if (input_taichi_type->is_primitive(PrimitiveTypeID::f64)) {
+        auto func = builder->GetInsertBlock()->getParent();
+        auto bb_oeq_then = BasicBlock::Create(*llvm_context, "oeq_then", func);
+        auto bb_oeq_else = BasicBlock::Create(*llvm_context, "oeq_else");
+        auto bb_merge = BasicBlock::Create(*llvm_context, "merge");
+        auto bb_olt_then = BasicBlock::Create(*llvm_context, "olt_then", func);
+        auto bb_olt_else = BasicBlock::Create(*llvm_context, "olt_else");
+
+        auto alloc = builder->CreateAlloca(
+            llvm::Type::getDoubleTy(*llvm_context), (unsigned)5);
+        auto newty = llvm::PointerType::get(
+            llvm::Type::getDoubleTy(*llvm_context), (unsigned)0);
+        auto cast = builder->CreateAddrSpaceCast(alloc, newty);
+        auto fcmp_oeq = builder->CreateFCmpOEQ(
+            input,
+            llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 0));
+        builder->CreateCondBr(fcmp_oeq, bb_oeq_then, bb_oeq_else);
+        builder->SetInsertPoint(bb_oeq_then);
+        builder->CreateStore(
+            llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 0),
+            cast);
+        builder->CreateBr(bb_merge);
+        bb_oeq_then = builder->GetInsertBlock();
+
+        func->getBasicBlockList().push_back(bb_oeq_else);
+        builder->SetInsertPoint(bb_oeq_else);
+        auto fcmp_olt = builder->CreateFCmpOLT(
+            input,
+            llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 0));
+        builder->CreateCondBr(fcmp_olt, bb_olt_then, bb_olt_else);
+        bb_oeq_else = builder->GetInsertBlock();
+
+        builder->SetInsertPoint(bb_olt_then);
+        builder->CreateStore(
+            llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), -1),
+            cast);
+        builder->CreateBr(bb_merge);
+        bb_olt_then = builder->GetInsertBlock();
+
+        func->getBasicBlockList().push_back(bb_olt_else);
+        builder->SetInsertPoint(bb_olt_else);
+        builder->CreateStore(
+            llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 1),
+            cast);
+        builder->CreateBr(bb_merge);
+        bb_olt_else = builder->GetInsertBlock();
+
+        func->getBasicBlockList().push_back(bb_merge);
+        builder->SetInsertPoint(bb_merge);
+        llvm_val[stmt] =
+            builder->CreateLoad(llvm::Type::getDoubleTy(*llvm_context), cast);
       }
     }
     UNARY_STD(cos)
@@ -390,7 +442,7 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         llvm_val[stmt] = call("__ocml_pow_f16", {lhs, rhs});
       } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::f32)) {
         llvm_val[stmt] = call("__ocml_pow_f32", {lhs, rhs});
-      } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::i64)) {
+      } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::f64)) {
         llvm_val[stmt] = call("__ocml_pow_f64", {lhs, rhs});
       } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::i32)) {
         auto sitofp_lhs_ =
@@ -408,7 +460,7 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         llvm_val[stmt] = call("__ocml_atan2_f16", {lhs, rhs});
       } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::f32)) {
         llvm_val[stmt] = call("__ocml_atan2_f32", {lhs, rhs});
-      } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::i64)) {
+      } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::f64)) {
         llvm_val[stmt] = call("__ocml_atan2_f64", {lhs, rhs});
       } else {
         TI_NOT_IMPLEMENTED
diff --git a/taichi/program/extension.cpp b/taichi/program/extension.cpp
index fc4a2d613f5d0..72612221f9070 100644
--- a/taichi/program/extension.cpp
+++ b/taichi/program/extension.cpp
@@ -19,6 +19,7 @@ bool is_extension_supported(Arch arch, Extension ext) {
        {Extension::sparse, Extension::quant, Extension::quant_basic,
         Extension::data64, Extension::adstack, Extension::bls,
         Extension::assertion, Extension::mesh}},
+      {Arch::amdgpu, {Extension::assertion}},
       {Arch::metal, {}},
       {Arch::opengl, {Extension::extfunc}},
       {Arch::gles, {}},
diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index 14f103bea3e6a..b0d785b8b6872 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -620,8 +620,8 @@ void TaichiLLVMContext::link_module_with_amdgpu_libdevice(
   }
 
   for (auto &f : libdevice_module->functions()) {
-    auto func_ = module->getFunction(f.getName());
-    if (!func_ && starts_with(f.getName().lower(), "__" + libdevice))
+    auto func_name = libdevice.substr(0, libdevice.length() - 3);
+    if (starts_with(f.getName().lower(), "__" + func_name))
       f.setLinkage(llvm::Function::CommonLinkage);
   }
 
diff --git a/taichi/runtime/llvm/runtime_module/runtime.cpp b/taichi/runtime/llvm/runtime_module/runtime.cpp
index 96b7749589d0a..21711966f1975 100644
--- a/taichi/runtime/llvm/runtime_module/runtime.cpp
+++ b/taichi/runtime/llvm/runtime_module/runtime.cpp
@@ -760,8 +760,15 @@ void taichi_assert_format(LLVMRuntime *runtime,
                           const char *format,
                           int num_arguments,
                           uint64 *arguments) {
+#ifdef ARCH_amdgpu
+  // TODO: find out why mark_force_no_inline() triggers the following error:
+  // llvm::SDValue llvm::SelectionDAG::getNode(unsigned int, const llvm::SDLoc
+  // &, llvm::EVT, llvm::SDValue, const llvm::SDNodeFlags): Assertion
+  // `VT.getSizeInBits() == Operand.getValueSizeInBits() && "Cannot BITCAST
+  // between types of different sizes!"' failed.
+#else
   mark_force_no_inline();
-
+#endif
   if (!enable_assert || test != 0)
     return;
   if (!runtime->error_code) {
@@ -1510,7 +1517,13 @@ void gpu_parallel_range_for(RuntimeContext *context,
                             range_for_xlogue epilogue,
                             const std::size_t tls_size) {
   int idx = thread_idx() + block_dim() * block_idx() + begin;
+#ifdef ARCH_amdgpu
+  // AMDGPU doesn't support dynamically sized stack arrays
+  // TODO: find a better way to set the tls_size (maybe like struct_for does)
+  alignas(8) char tls_buffer[64];
+#else
   alignas(8) char tls_buffer[tls_size];
+#endif
   auto tls_ptr = &tls_buffer[0];
   if (prologue)
     prologue(context, tls_ptr);
@@ -1588,7 +1601,13 @@ void gpu_parallel_mesh_for(RuntimeContext *context,
                            MeshForTaskFunc *func,
                            mesh_for_xlogue epilogue,
                            const std::size_t tls_size) {
+#ifdef ARCH_amdgpu
+  // AMDGPU doesn't support dynamically sized stack arrays
+  // TODO: find a better way to set the tls_size (maybe like struct_for does)
+  alignas(8) char tls_buffer[64];
+#else
   alignas(8) char tls_buffer[tls_size];
+#endif
   auto tls_ptr = &tls_buffer[0];
   for (int idx = block_idx(); idx < num_patches; idx += grid_dim()) {
     if (prologue)
diff --git a/tests/python/test_ad_gdar_diffmpm.py b/tests/python/test_ad_gdar_diffmpm.py
index d30bad5834d61..bf45438ee2790 100644
--- a/tests/python/test_ad_gdar_diffmpm.py
+++ b/tests/python/test_ad_gdar_diffmpm.py
@@ -4,7 +4,10 @@
 from tests import test_utils
 
 
-@test_utils.test(require=ti.extension.assertion, debug=True, exclude=[ti.cc])
+# FIXME: gdar mpm fails on the amdgpu backend (assigned: gale)
+@test_utils.test(require=ti.extension.assertion,
+                 debug=True,
+                 exclude=[ti.cc, ti.amdgpu])
 def test_gdar_mpm():
     real = ti.f32
 
diff --git a/tests/python/test_ad_if.py b/tests/python/test_ad_if.py
index 73f436cd1c01c..5a7358a5b67ca 100644
--- a/tests/python/test_ad_if.py
+++ b/tests/python/test_ad_if.py
@@ -244,7 +244,8 @@ def func():
     func()
 
 
-@test_utils.test()
+# FIXME: fails on the amdgpu backend (assigned: gale)
+@test_utils.test(exclude=ti.amdgpu)
 def test_if_condition_depend_on_for_loop_index():
     scalar = lambda: ti.field(dtype=ti.f32)
     vec = lambda: ti.Vector.field(3, dtype=ti.f32)
diff --git a/tests/python/test_cfg_continue.py b/tests/python/test_cfg_continue.py
index 2fb104bda4160..bb683da33756a 100644
--- a/tests/python/test_cfg_continue.py
+++ b/tests/python/test_cfg_continue.py
@@ -2,7 +2,7 @@
 from tests import test_utils
 
 
-@test_utils.test()
+@test_utils.test(exclude=[ti.amdgpu])
 def test_cfg_continue():
     x = ti.field(dtype=int, shape=1)
     state = ti.field(dtype=int, shape=1)
@@ -13,7 +13,6 @@ def foo():
             if state[p] == 0:
                 x[p] = 1
                 continue
-
             if state[p] != 0:
                 print('test')
 
diff --git a/tests/python/test_cli.py b/tests/python/test_cli.py
index 9e0e17fcf5f67..1df17b1e43e9c 100644
--- a/tests/python/test_cli.py
+++ b/tests/python/test_cli.py
@@ -213,7 +213,9 @@ def test_cli_run():
 
 
 def test_cli_cache():
-    archs = {ti.cpu, ti.cuda, ti.opengl, ti.vulkan, ti.metal, ti.gles}
+    archs = {
+        ti.cpu, ti.cuda, ti.opengl, ti.vulkan, ti.metal, ti.gles, ti.amdgpu
+    }
     archs = {v for v in archs if v in test_utils.expected_archs()}
     exts = ('ll', 'bc', 'spv', 'metal', 'tcb', 'lock')
     tmp_path = tempfile.mkdtemp()
diff --git a/tests/python/test_debug.py b/tests/python/test_debug.py
index 763be5c9c49fc..6a280667474fd 100644
--- a/tests/python/test_debug.py
+++ b/tests/python/test_debug.py
@@ -66,7 +66,7 @@ def func():
     func()
 
 
-@test_utils.test(require=ti.extension.assertion,
+@test_utils.test(require=[ti.extension.sparse, ti.extension.assertion],
                  debug=True,
                  gdb_trigger=False,
                  exclude=ti.metal)
@@ -83,7 +83,7 @@ def func():
     func()
 
 
-@test_utils.test(require=ti.extension.assertion,
+@test_utils.test(require=[ti.extension.sparse, ti.extension.assertion],
                  debug=True,
                  gdb_trigger=False,
                  exclude=ti.metal)
diff --git a/tests/python/test_internal_func.py b/tests/python/test_internal_func.py
index e691022ed386d..b873975a0b3e8 100644
--- a/tests/python/test_internal_func.py
+++ b/tests/python/test_internal_func.py
@@ -6,8 +6,9 @@
 from tests import test_utils
 
 
-@test_utils.test(
-    exclude=[ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc])
+@test_utils.test(exclude=[
+    ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc, ti.amdgpu
+])
 def test_basic():
     @ti.kernel
     def test():
@@ -17,8 +18,9 @@ def test():
     test()
 
 
-@test_utils.test(
-    exclude=[ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc])
+@test_utils.test(exclude=[
+    ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc, ti.amdgpu
+])
 def test_host_polling():
     return
 
@@ -32,8 +34,9 @@ def test():
         time.sleep(0.1)
 
 
-@test_utils.test(
-    exclude=[ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc])
+@test_utils.test(exclude=[
+    ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc, ti.amdgpu
+])
 def test_list_manager():
     @ti.kernel
     def test():
@@ -43,8 +46,9 @@ def test():
     test()
 
 
-@test_utils.test(
-    exclude=[ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc])
+@test_utils.test(exclude=[
+    ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc, ti.amdgpu
+])
 def test_node_manager():
     @ti.kernel
     def test():
@@ -54,8 +58,9 @@ def test():
     test()
 
 
-@test_utils.test(
-    exclude=[ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc])
+@test_utils.test(exclude=[
+    ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc, ti.amdgpu
+])
 def test_node_manager_gc():
     @ti.kernel
     def test_cpu():
@@ -64,7 +69,7 @@ def test_cpu():
     test_cpu()
 
 
-@test_utils.test(arch=[ti.cpu, ti.cuda], debug=True)
+@test_utils.test(arch=[ti.cpu, ti.cuda, ti.amdgpu], debug=True)
 def test_return():
     @ti.kernel
     def test_cpu():
diff --git a/tests/python/test_lang.py b/tests/python/test_lang.py
index 1ce3dcde85b23..5c0c1057755f9 100644
--- a/tests/python/test_lang.py
+++ b/tests/python/test_lang.py
@@ -94,7 +94,7 @@ def test():
     test()
 
 
-@test_utils.test()
+@test_utils.test(exclude=[ti.amdgpu])
 def test_local_atomics():
     n = 32
     val = ti.field(ti.i32, shape=n)
diff --git a/tests/python/test_native_functions.py b/tests/python/test_native_functions.py
index aad7c12a60efd..3c992ec1ad383 100644
--- a/tests/python/test_native_functions.py
+++ b/tests/python/test_native_functions.py
@@ -5,7 +5,7 @@
 from tests import test_utils
 
 
-@test_utils.test()
+@test_utils.test(exclude=[ti.amdgpu])
 def test_abs():
     x = ti.field(ti.f32)
 
diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py
index 42c1f7d921f07..43794649d4a73 100644
--- a/tests/python/test_ndarray.py
+++ b/tests/python/test_ndarray.py
@@ -19,7 +19,7 @@
 vector_dims = [3]
 matrix_dims = [(1, 2), (2, 3)]
 supported_archs_taichi_ndarray = [
-    ti.cpu, ti.cuda, ti.opengl, ti.vulkan, ti.metal
+    ti.cpu, ti.cuda, ti.opengl, ti.vulkan, ti.metal, ti.amdgpu
 ]
 
 
diff --git a/tests/python/test_ndrange.py b/tests/python/test_ndrange.py
index 94c107d81c4d4..d74405824e3a8 100644
--- a/tests/python/test_ndrange.py
+++ b/tests/python/test_ndrange.py
@@ -317,7 +317,7 @@ def example():
     example()
 
 
-@test_utils.test()
+@test_utils.test(exclude=[ti.amdgpu])
 def test_n_loop_var_neq_dimension():
     @ti.kernel
     def iter():
diff --git a/tests/python/test_non_taichi_types_in_kernel.py b/tests/python/test_non_taichi_types_in_kernel.py
index 96ed9032e544d..ddb3957375b5b 100644
--- a/tests/python/test_non_taichi_types_in_kernel.py
+++ b/tests/python/test_non_taichi_types_in_kernel.py
@@ -2,7 +2,7 @@
 from tests import test_utils
 
 
-@test_utils.test()
+@test_utils.test(exclude=ti.amdgpu)
 def test_subscript_user_classes_in_kernel():
     class MyList:
         def __init__(self, elements):
diff --git a/tests/python/test_offload_cross.py b/tests/python/test_offload_cross.py
index 4a2b6d313f34f..eb5e475036a8a 100644
--- a/tests/python/test_offload_cross.py
+++ b/tests/python/test_offload_cross.py
@@ -111,7 +111,7 @@ def ker():
     assert ret[None] == 46
 
 
-@test_utils.test()
+@test_utils.test(exclude=ti.amdgpu)
 def test_offload_with_cross_nested_for():
     @ti.kernel
     def run(a: ti.i32):
@@ -123,7 +123,7 @@ def run(a: ti.i32):
     run(2)
 
 
-@test_utils.test()
+@test_utils.test(exclude=ti.amdgpu)
 def test_offload_with_cross_if_inside_for():
     @ti.kernel
     def run(a: ti.i32):
diff --git a/tests/python/test_print.py b/tests/python/test_print.py
index 4a57cef29c6bf..73c1a87e9d8e2 100644
--- a/tests/python/test_print.py
+++ b/tests/python/test_print.py
@@ -29,7 +29,7 @@ def func():
 
 # TODO: As described by @k-ye above, what we want to ensure
 # is that, the content shows on console is *correct*.
-@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True)
+@test_utils.test(exclude=[ti.dx11, vk_on_mac, ti.amdgpu], debug=True)
 def test_multi_print():
     @ti.kernel
     def func(x: ti.i32, y: ti.f32):
@@ -40,7 +40,7 @@ def func(x: ti.i32, y: ti.f32):
 
 
 # TODO: vulkan doesn't support %s but we should ignore it instead of crashing.
-@test_utils.test(exclude=[ti.vulkan, ti.dx11])
+@test_utils.test(exclude=[ti.vulkan, ti.dx11, ti.amdgpu])
 def test_print_string():
     @ti.kernel
     def func(x: ti.i32, y: ti.f32):
@@ -52,7 +52,7 @@ def func(x: ti.i32, y: ti.f32):
     ti.sync()
 
 
-@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True)
+@test_utils.test(exclude=[ti.dx11, vk_on_mac, ti.amdgpu], debug=True)
 def test_print_matrix():
     x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=())
     y = ti.Vector.field(3, dtype=ti.f32, shape=3)
@@ -68,7 +68,7 @@ def func(k: ti.f32):
     ti.sync()
 
 
-@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True)
+@test_utils.test(exclude=[ti.dx11, vk_on_mac, ti.amdgpu], debug=True)
 def test_print_sep_end():
     @ti.kernel
     def func():
@@ -88,7 +88,7 @@ def func():
     ti.sync()
 
 
-@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True)
+@test_utils.test(exclude=[ti.dx11, vk_on_mac, ti.amdgpu], debug=True)
 def test_print_multiple_threads():
     x = ti.field(dtype=ti.f32, shape=(128, ))
 
@@ -104,7 +104,7 @@ def func(k: ti.f32):
     ti.sync()
 
 
-@test_utils.test(exclude=[ti.cc, ti.dx11, vk_on_mac], debug=True)
+@test_utils.test(exclude=[ti.cc, ti.dx11, vk_on_mac, ti.amdgpu], debug=True)
 def test_print_list():
     x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=(2, 3))
     y = ti.Vector.field(3, dtype=ti.f32, shape=())
@@ -125,7 +125,9 @@ def func(k: ti.f32):
     ti.sync()
 
 
-@test_utils.test(arch=[ti.cpu, ti.vulkan], exclude=[vk_on_mac], debug=True)
+@test_utils.test(arch=[ti.cpu, ti.vulkan],
+                 exclude=[vk_on_mac, ti.amdgpu],
+                 debug=True)
 def test_python_scope_print_field():
     x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=())
     y = ti.Vector.field(3, dtype=ti.f32, shape=3)
@@ -136,7 +138,9 @@ def test_python_scope_print_field():
     print(z)
 
 
-@test_utils.test(arch=[ti.cpu, ti.vulkan], exclude=[vk_on_mac], debug=True)
+@test_utils.test(arch=[ti.cpu, ti.vulkan],
+                 exclude=[vk_on_mac, ti.amdgpu],
+                 debug=True)
 def test_print_string_format():
     @ti.kernel
     def func(k: ti.f32):
@@ -152,7 +156,9 @@ def func(k: ti.f32):
     ti.sync()
 
 
-@test_utils.test(arch=[ti.cpu, ti.vulkan], exclude=[vk_on_mac], debug=True)
+@test_utils.test(arch=[ti.cpu, ti.vulkan],
+                 exclude=[vk_on_mac, ti.amdgpu],
+                 debug=True)
def test_print_fstring():
    def foo1(x):
        return x + 1
@@ -166,7 +172,7 @@ def func(i: ti.i32, f: ti.f32):
 
 
 @test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan],
-                 exclude=[vk_on_mac],
+                 exclude=[vk_on_mac, ti.amdgpu],
                  debug=True)
 def test_print_u64():
     @ti.kernel
@@ -178,7 +184,7 @@ def func(i: ti.u64):
 
 
 @test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan],
-                 exclude=[vk_on_mac],
+                 exclude=[vk_on_mac, ti.amdgpu],
                  debug=True)
 def test_print_i64():
     @ti.kernel
@@ -190,7 +196,7 @@ def func(i: ti.i64):
 
 
 @test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan],
-                 exclude=[vk_on_mac, cuda_on_windows],
+                 exclude=[vk_on_mac, cuda_on_windows, ti.amdgpu],
                  debug=True)
 def test_print_seq(capfd):
     @ti.kernel
diff --git a/tests/python/test_struct_for_dynamic.py b/tests/python/test_struct_for_dynamic.py
index daf62f0426aa0..d179a3aa051d7 100644
--- a/tests/python/test_struct_for_dynamic.py
+++ b/tests/python/test_struct_for_dynamic.py
@@ -2,7 +2,8 @@
 from tests import test_utils
 
 
-@test_utils.test(exclude=[ti.opengl, ti.gles, ti.cc, ti.vulkan, ti.metal])
+@test_utils.test(require=ti.extension.sparse,
+                 exclude=[ti.opengl, ti.gles, ti.cc, ti.vulkan, ti.metal])
 def test_dynamic():
     x = ti.field(ti.i32)
     y = ti.field(ti.i32, shape=())
@@ -23,7 +24,8 @@ def count():
     assert y[None] == n // 3 + 1
 
 
-@test_utils.test(exclude=[ti.opengl, ti.gles, ti.cc, ti.vulkan, ti.metal])
+@test_utils.test(require=ti.extension.sparse,
+                 exclude=[ti.opengl, ti.gles, ti.cc, ti.vulkan, ti.metal])
 def test_dense_dynamic():
     n = 128