diff --git a/.github/workflows/scripts/unix_test.sh b/.github/workflows/scripts/unix_test.sh
index 608ebdd104fbd..e9a74eed45f4d 100755
--- a/.github/workflows/scripts/unix_test.sh
+++ b/.github/workflows/scripts/unix_test.sh
@@ -123,7 +123,7 @@ if [ -z "$GPU_TEST" ]; then
     fi
 elif [ ! -z "$AMDGPU_TEST" ]; then
     run-it cpu $(nproc)
-    # run-it amdgpu 4
+    run-it amdgpu 8
 else
     run-it cuda 8
     run-it cpu $(nproc)
diff --git a/taichi/codegen/amdgpu/codegen_amdgpu.cpp b/taichi/codegen/amdgpu/codegen_amdgpu.cpp
index 6d0e8b4c465c8..d7f4631a9226c 100644
--- a/taichi/codegen/amdgpu/codegen_amdgpu.cpp
+++ b/taichi/codegen/amdgpu/codegen_amdgpu.cpp
@@ -84,7 +84,8 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
       } else {
         TI_NOT_IMPLEMENTED
       }
-    } else if (op == UnaryOpType::sgn) {
+    }  // TODO: simplify the implementation of sgn
+    else if (op == UnaryOpType::sgn) {
       if (input_taichi_type->is_primitive(PrimitiveTypeID::i32)) {
         auto ashr = builder->CreateAShr(input, 31);
         auto sub = builder->CreateSub(0, input);
@@ -141,6 +142,57 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         builder->SetInsertPoint(bb_merge);
         llvm_val[stmt] =
             builder->CreateLoad(llvm::Type::getFloatTy(*llvm_context), cast);
+      } else if (input_taichi_type->is_primitive(PrimitiveTypeID::f64)) {
+        auto func = builder->GetInsertBlock()->getParent();
+        auto bb_oeq_then = BasicBlock::Create(*llvm_context, "oeq_then", func);
+        auto bb_oeq_else = BasicBlock::Create(*llvm_context, "oeq_else");
+        auto bb_merge = BasicBlock::Create(*llvm_context, "merge");
+        auto bb_olt_then = BasicBlock::Create(*llvm_context, "olt_then", func);
+        auto bb_olt_else = BasicBlock::Create(*llvm_context, "olt_else");
+
+        auto alloc = builder->CreateAlloca(
+            llvm::Type::getDoubleTy(*llvm_context), (unsigned)5);
+        auto newty = llvm::PointerType::get(
+            llvm::Type::getDoubleTy(*llvm_context), (unsigned)0);
+        auto cast = builder->CreateAddrSpaceCast(alloc, newty);
+        auto fcmp_oeq = builder->CreateFCmpOEQ(
+            input,
+            llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 0));
+        builder->CreateCondBr(fcmp_oeq, bb_oeq_then, bb_oeq_else);
+        builder->SetInsertPoint(bb_oeq_then);
+        builder->CreateStore(
+            llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 0),
+            cast);
+        builder->CreateBr(bb_merge);
+        bb_oeq_then = builder->GetInsertBlock();
+
+        func->getBasicBlockList().push_back(bb_oeq_else);
+        builder->SetInsertPoint(bb_oeq_else);
+        auto fcmp_olt = builder->CreateFCmpOLT(
+            input,
+            llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 0));
+        builder->CreateCondBr(fcmp_olt, bb_olt_then, bb_olt_else);
+        bb_oeq_else = builder->GetInsertBlock();
+
+        builder->SetInsertPoint(bb_olt_then);
+        builder->CreateStore(
+            llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), -1),
+            cast);
+        builder->CreateBr(bb_merge);
+        bb_olt_then = builder->GetInsertBlock();
+
+        func->getBasicBlockList().push_back(bb_olt_else);
+        builder->SetInsertPoint(bb_olt_else);
+        builder->CreateStore(
+            llvm::ConstantFP::get(llvm::Type::getDoubleTy(*llvm_context), 1),
+            cast);
+        builder->CreateBr(bb_merge);
+        bb_olt_else = builder->GetInsertBlock();
+
+        func->getBasicBlockList().push_back(bb_merge);
+        builder->SetInsertPoint(bb_merge);
+        llvm_val[stmt] =
+            builder->CreateLoad(llvm::Type::getDoubleTy(*llvm_context), cast);
       }
     }
     UNARY_STD(cos)
@@ -390,7 +442,7 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         llvm_val[stmt] = call("__ocml_pow_f16", {lhs, rhs});
       } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::f32)) {
         llvm_val[stmt] = call("__ocml_pow_f32", {lhs, rhs});
-      } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::i64)) {
+      } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::f64)) {
         llvm_val[stmt] = call("__ocml_pow_f64", {lhs, rhs});
       } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::i32)) {
         auto sitofp_lhs_ =
@@ -408,7 +460,7 @@ class TaskCodeGenAMDGPU : public TaskCodeGenLLVM {
         llvm_val[stmt] = call("__ocml_atan2_f16", {lhs, rhs});
       } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::f32)) {
         llvm_val[stmt] = call("__ocml_atan2_f32", {lhs, rhs});
-      } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::i64)) {
+      } else if (ret_taichi_type->is_primitive(PrimitiveTypeID::f64)) {
         llvm_val[stmt] = call("__ocml_atan2_f64", {lhs, rhs});
       } else {
         TI_NOT_IMPLEMENTED
diff --git a/taichi/program/extension.cpp b/taichi/program/extension.cpp
index fc4a2d613f5d0..72612221f9070 100644
--- a/taichi/program/extension.cpp
+++ b/taichi/program/extension.cpp
@@ -19,6 +19,7 @@ bool is_extension_supported(Arch arch, Extension ext) {
        {Extension::sparse, Extension::quant, Extension::quant_basic,
         Extension::data64, Extension::adstack, Extension::bls,
         Extension::assertion, Extension::mesh}},
+      {Arch::amdgpu, {Extension::assertion}},
       {Arch::metal, {}},
       {Arch::opengl, {Extension::extfunc}},
       {Arch::gles, {}},
diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index 14f103bea3e6a..b0d785b8b6872 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -620,8 +620,8 @@ void TaichiLLVMContext::link_module_with_amdgpu_libdevice(
   }
 
   for (auto &f : libdevice_module->functions()) {
-    auto func_ = module->getFunction(f.getName());
-    if (!func_ && starts_with(f.getName().lower(), "__" + libdevice))
+    auto func_name = libdevice.substr(0, libdevice.length() - 3);
+    if (starts_with(f.getName().lower(), "__" + func_name))
       f.setLinkage(llvm::Function::CommonLinkage);
   }
 
diff --git a/taichi/runtime/llvm/runtime_module/runtime.cpp b/taichi/runtime/llvm/runtime_module/runtime.cpp
index 96b7749589d0a..21711966f1975 100644
--- a/taichi/runtime/llvm/runtime_module/runtime.cpp
+++ b/taichi/runtime/llvm/runtime_module/runtime.cpp
@@ -760,8 +760,15 @@ void taichi_assert_format(LLVMRuntime *runtime,
                           const char *format,
                           int num_arguments,
                           uint64 *arguments) {
+#ifdef ARCH_amdgpu
+  // TODO: find out why mark_force_no_inline() triggers the following error:
+  // llvm::SDValue llvm::SelectionDAG::getNode(unsigned int, const llvm::SDLoc
+  // &, llvm::EVT, llvm::SDValue, const llvm::SDNodeFlags): Assertion
+  // `VT.getSizeInBits() == Operand.getValueSizeInBits() && "Cannot BITCAST
+  // between types of different sizes!"' failed.
+#else
   mark_force_no_inline();
-
+#endif
   if (!enable_assert || test != 0)
     return;
   if (!runtime->error_code) {
@@ -1510,7 +1517,13 @@ void gpu_parallel_range_for(RuntimeContext *context,
                             range_for_xlogue epilogue,
                             const std::size_t tls_size) {
   int idx = thread_idx() + block_dim() * block_idx() + begin;
+#ifdef ARCH_amdgpu
+  // AMDGPU doesn't support dynamically sized stack arrays
+  // TODO: find a better way to set the tls_size (maybe like struct_for does)
+  alignas(8) char tls_buffer[64];
+#else
   alignas(8) char tls_buffer[tls_size];
+#endif
   auto tls_ptr = &tls_buffer[0];
   if (prologue)
     prologue(context, tls_ptr);
@@ -1588,7 +1601,13 @@ void gpu_parallel_mesh_for(RuntimeContext *context,
                            MeshForTaskFunc *func,
                            mesh_for_xlogue epilogue,
                            const std::size_t tls_size) {
+#ifdef ARCH_amdgpu
+  // AMDGPU doesn't support dynamically sized stack arrays
+  // TODO: find a better way to set the tls_size (maybe like struct_for does)
+  alignas(8) char tls_buffer[64];
+#else
   alignas(8) char tls_buffer[tls_size];
+#endif
   auto tls_ptr = &tls_buffer[0];
   for (int idx = block_idx(); idx < num_patches; idx += grid_dim()) {
     if (prologue)
diff --git a/tests/python/test_ad_gdar_diffmpm.py b/tests/python/test_ad_gdar_diffmpm.py
index d30bad5834d61..bf45438ee2790 100644
--- a/tests/python/test_ad_gdar_diffmpm.py
+++ b/tests/python/test_ad_gdar_diffmpm.py
@@ -4,7 +4,10 @@
 from tests import test_utils
 
 
-@test_utils.test(require=ti.extension.assertion, debug=True, exclude=[ti.cc])
+# FIXME: gdar mpm fails on the amdgpu backend (assigned: gale)
+@test_utils.test(require=ti.extension.assertion,
+                 debug=True,
+                 exclude=[ti.cc, ti.amdgpu])
 def test_gdar_mpm():
     real = ti.f32
 
diff --git a/tests/python/test_ad_if.py b/tests/python/test_ad_if.py
index 73f436cd1c01c..5a7358a5b67ca 100644
--- a/tests/python/test_ad_if.py
+++ b/tests/python/test_ad_if.py
@@ -244,7 +244,8 @@ def func():
     func()
 
 
-@test_utils.test()
+# FIXME: fails on the amdgpu backend (assigned: gale)
+@test_utils.test(exclude=ti.amdgpu)
 def test_if_condition_depend_on_for_loop_index():
     scalar = lambda: ti.field(dtype=ti.f32)
     vec = lambda: ti.Vector.field(3, dtype=ti.f32)
diff --git a/tests/python/test_cfg_continue.py b/tests/python/test_cfg_continue.py
index 2fb104bda4160..bb683da33756a 100644
--- a/tests/python/test_cfg_continue.py
+++ b/tests/python/test_cfg_continue.py
@@ -2,7 +2,7 @@
 from tests import test_utils
 
 
-@test_utils.test()
+@test_utils.test(exclude=[ti.amdgpu])
 def test_cfg_continue():
     x = ti.field(dtype=int, shape=1)
     state = ti.field(dtype=int, shape=1)
@@ -13,7 +13,6 @@ def foo():
             if state[p] == 0:
                 x[p] = 1
                 continue
-
             if state[p] != 0:
                 print('test')
 
diff --git a/tests/python/test_cli.py b/tests/python/test_cli.py
index 9e0e17fcf5f67..1df17b1e43e9c 100644
--- a/tests/python/test_cli.py
+++ b/tests/python/test_cli.py
@@ -213,7 +213,9 @@ def test_cli_run():
 
 
 def test_cli_cache():
-    archs = {ti.cpu, ti.cuda, ti.opengl, ti.vulkan, ti.metal, ti.gles}
+    archs = {
+        ti.cpu, ti.cuda, ti.opengl, ti.vulkan, ti.metal, ti.gles, ti.amdgpu
+    }
     archs = {v for v in archs if v in test_utils.expected_archs()}
     exts = ('ll', 'bc', 'spv', 'metal', 'tcb', 'lock')
     tmp_path = tempfile.mkdtemp()
diff --git a/tests/python/test_debug.py b/tests/python/test_debug.py
index 763be5c9c49fc..6a280667474fd 100644
--- a/tests/python/test_debug.py
+++ b/tests/python/test_debug.py
@@ -66,7 +66,7 @@ def func():
     func()
 
 
-@test_utils.test(require=ti.extension.assertion,
+@test_utils.test(require=[ti.extension.sparse, ti.extension.assertion],
                  debug=True,
                  gdb_trigger=False,
                  exclude=ti.metal)
@@ -83,7 +83,7 @@ def func():
     func()
 
 
-@test_utils.test(require=ti.extension.assertion,
+@test_utils.test(require=[ti.extension.sparse, ti.extension.assertion],
                  debug=True,
                  gdb_trigger=False,
                  exclude=ti.metal)
diff --git a/tests/python/test_internal_func.py b/tests/python/test_internal_func.py
index e691022ed386d..b873975a0b3e8 100644
--- a/tests/python/test_internal_func.py
+++ b/tests/python/test_internal_func.py
@@ -6,8 +6,9 @@
 from tests import test_utils
 
 
-@test_utils.test(
-    exclude=[ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc])
+@test_utils.test(exclude=[
+    ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc, ti.amdgpu
+])
 def test_basic():
     @ti.kernel
     def test():
@@ -17,8 +18,9 @@ def test():
     test()
 
 
-@test_utils.test(
-    exclude=[ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc])
+@test_utils.test(exclude=[
+    ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc, ti.amdgpu
+])
 def test_host_polling():
     return
 
@@ -32,8 +34,9 @@ def test():
         time.sleep(0.1)
 
 
-@test_utils.test(
-    exclude=[ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc])
+@test_utils.test(exclude=[
+    ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc, ti.amdgpu
+])
 def test_list_manager():
     @ti.kernel
     def test():
@@ -43,8 +46,9 @@ def test():
     test()
 
 
-@test_utils.test(
-    exclude=[ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc])
+@test_utils.test(exclude=[
+    ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc, ti.amdgpu
+])
 def test_node_manager():
     @ti.kernel
     def test():
@@ -54,8 +58,9 @@ def test():
     test()
 
 
-@test_utils.test(
-    exclude=[ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc])
+@test_utils.test(exclude=[
+    ti.metal, ti.opengl, ti.gles, ti.cuda, ti.vulkan, ti.cc, ti.amdgpu
+])
 def test_node_manager_gc():
     @ti.kernel
     def test_cpu():
@@ -64,7 +69,7 @@ def test_cpu():
     test_cpu()
 
 
-@test_utils.test(arch=[ti.cpu, ti.cuda], debug=True)
+@test_utils.test(arch=[ti.cpu, ti.cuda, ti.amdgpu], debug=True)
 def test_return():
     @ti.kernel
     def test_cpu():
diff --git a/tests/python/test_lang.py b/tests/python/test_lang.py
index 1ce3dcde85b23..5c0c1057755f9 100644
--- a/tests/python/test_lang.py
+++ b/tests/python/test_lang.py
@@ -94,7 +94,7 @@ def test():
     test()
 
 
-@test_utils.test()
+@test_utils.test(exclude=[ti.amdgpu])
 def test_local_atomics():
     n = 32
     val = ti.field(ti.i32, shape=n)
diff --git a/tests/python/test_native_functions.py b/tests/python/test_native_functions.py
index aad7c12a60efd..3c992ec1ad383 100644
--- a/tests/python/test_native_functions.py
+++ b/tests/python/test_native_functions.py
@@ -5,7 +5,7 @@
 from tests import test_utils
 
 
-@test_utils.test()
+@test_utils.test(exclude=[ti.amdgpu])
 def test_abs():
     x = ti.field(ti.f32)
 
diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py
index 42c1f7d921f07..43794649d4a73 100644
--- a/tests/python/test_ndarray.py
+++ b/tests/python/test_ndarray.py
@@ -19,7 +19,7 @@
 vector_dims = [3]
 matrix_dims = [(1, 2), (2, 3)]
 supported_archs_taichi_ndarray = [
-    ti.cpu, ti.cuda, ti.opengl, ti.vulkan, ti.metal
+    ti.cpu, ti.cuda, ti.opengl, ti.vulkan, ti.metal, ti.amdgpu
 ]
 
 
diff --git a/tests/python/test_ndrange.py b/tests/python/test_ndrange.py
index 94c107d81c4d4..d74405824e3a8 100644
--- a/tests/python/test_ndrange.py
+++ b/tests/python/test_ndrange.py
@@ -317,7 +317,7 @@ def example():
     example()
 
 
-@test_utils.test()
+@test_utils.test(exclude=[ti.amdgpu])
 def test_n_loop_var_neq_dimension():
     @ti.kernel
     def iter():
diff --git a/tests/python/test_non_taichi_types_in_kernel.py b/tests/python/test_non_taichi_types_in_kernel.py
index 96ed9032e544d..ddb3957375b5b 100644
--- a/tests/python/test_non_taichi_types_in_kernel.py
+++ b/tests/python/test_non_taichi_types_in_kernel.py
@@ -2,7 +2,7 @@
 from tests import test_utils
 
 
-@test_utils.test()
+@test_utils.test(exclude=ti.amdgpu)
 def test_subscript_user_classes_in_kernel():
     class MyList:
         def __init__(self, elements):
diff --git a/tests/python/test_offload_cross.py b/tests/python/test_offload_cross.py
index 4a2b6d313f34f..eb5e475036a8a 100644
--- a/tests/python/test_offload_cross.py
+++ b/tests/python/test_offload_cross.py
@@ -111,7 +111,7 @@ def ker():
     assert ret[None] == 46
 
 
-@test_utils.test()
+@test_utils.test(exclude=ti.amdgpu)
 def test_offload_with_cross_nested_for():
     @ti.kernel
     def run(a: ti.i32):
@@ -123,7 +123,7 @@ def run(a: ti.i32):
     run(2)
 
 
-@test_utils.test()
+@test_utils.test(exclude=ti.amdgpu)
 def test_offload_with_cross_if_inside_for():
     @ti.kernel
     def run(a: ti.i32):
diff --git a/tests/python/test_print.py b/tests/python/test_print.py
index 4a57cef29c6bf..73c1a87e9d8e2 100644
--- a/tests/python/test_print.py
+++ b/tests/python/test_print.py
@@ -29,7 +29,7 @@ def func():
 
 # TODO: As described by @k-ye above, what we want to ensure
 # is that, the content shows on console is *correct*.
-@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True)
+@test_utils.test(exclude=[ti.dx11, vk_on_mac, ti.amdgpu], debug=True)
 def test_multi_print():
     @ti.kernel
     def func(x: ti.i32, y: ti.f32):
@@ -40,7 +40,7 @@ def func(x: ti.i32, y: ti.f32):
 
 
 # TODO: vulkan doesn't support %s but we should ignore it instead of crashing.
-@test_utils.test(exclude=[ti.vulkan, ti.dx11])
+@test_utils.test(exclude=[ti.vulkan, ti.dx11, ti.amdgpu])
 def test_print_string():
     @ti.kernel
     def func(x: ti.i32, y: ti.f32):
@@ -52,7 +52,7 @@ def func(x: ti.i32, y: ti.f32):
     ti.sync()
 
 
-@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True)
+@test_utils.test(exclude=[ti.dx11, vk_on_mac, ti.amdgpu], debug=True)
 def test_print_matrix():
     x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=())
     y = ti.Vector.field(3, dtype=ti.f32, shape=3)
@@ -68,7 +68,7 @@ def func(k: ti.f32):
     ti.sync()
 
 
-@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True)
+@test_utils.test(exclude=[ti.dx11, vk_on_mac, ti.amdgpu], debug=True)
 def test_print_sep_end():
     @ti.kernel
     def func():
@@ -88,7 +88,7 @@ def func():
     ti.sync()
 
 
-@test_utils.test(exclude=[ti.dx11, vk_on_mac], debug=True)
+@test_utils.test(exclude=[ti.dx11, vk_on_mac, ti.amdgpu], debug=True)
 def test_print_multiple_threads():
     x = ti.field(dtype=ti.f32, shape=(128, ))
 
@@ -104,7 +104,7 @@ def func(k: ti.f32):
     ti.sync()
 
 
-@test_utils.test(exclude=[ti.cc, ti.dx11, vk_on_mac], debug=True)
+@test_utils.test(exclude=[ti.cc, ti.dx11, vk_on_mac, ti.amdgpu], debug=True)
 def test_print_list():
     x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=(2, 3))
     y = ti.Vector.field(3, dtype=ti.f32, shape=())
@@ -125,7 +125,9 @@ def func(k: ti.f32):
     ti.sync()
 
 
-@test_utils.test(arch=[ti.cpu, ti.vulkan], exclude=[vk_on_mac], debug=True)
+@test_utils.test(arch=[ti.cpu, ti.vulkan],
+                 exclude=[vk_on_mac, ti.amdgpu],
+                 debug=True)
 def test_python_scope_print_field():
     x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=())
     y = ti.Vector.field(3, dtype=ti.f32, shape=3)
@@ -136,7 +138,9 @@ def test_python_scope_print_field():
     print(z)
 
 
-@test_utils.test(arch=[ti.cpu, ti.vulkan], exclude=[vk_on_mac], debug=True)
+@test_utils.test(arch=[ti.cpu, ti.vulkan],
+                 exclude=[vk_on_mac, ti.amdgpu],
+                 debug=True)
 def test_print_string_format():
     @ti.kernel
     def func(k: ti.f32):
@@ -152,7 +156,9 @@ def func(k: ti.f32):
     ti.sync()
 
 
-@test_utils.test(arch=[ti.cpu, ti.vulkan], exclude=[vk_on_mac], debug=True)
+@test_utils.test(arch=[ti.cpu, ti.vulkan],
+                 exclude=[vk_on_mac, ti.amdgpu],
+                 debug=True)
def test_print_fstring():
    def foo1(x):
        return x + 1
@@ -166,7 +172,7 @@ def func(i: ti.i32, f: ti.f32):
 
 
 @test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan],
-                 exclude=[vk_on_mac],
+                 exclude=[vk_on_mac, ti.amdgpu],
                  debug=True)
 def test_print_u64():
     @ti.kernel
@@ -178,7 +184,7 @@ def func(i: ti.u64):
 
 
 @test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan],
-                 exclude=[vk_on_mac],
+                 exclude=[vk_on_mac, ti.amdgpu],
                  debug=True)
 def test_print_i64():
     @ti.kernel
@@ -190,7 +196,7 @@ def func(i: ti.i64):
 
 
 @test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan],
-                 exclude=[vk_on_mac, cuda_on_windows],
+                 exclude=[vk_on_mac, cuda_on_windows, ti.amdgpu],
                  debug=True)
 def test_print_seq(capfd):
     @ti.kernel
diff --git a/tests/python/test_struct_for_dynamic.py b/tests/python/test_struct_for_dynamic.py
index daf62f0426aa0..d179a3aa051d7 100644
--- a/tests/python/test_struct_for_dynamic.py
+++ b/tests/python/test_struct_for_dynamic.py
@@ -2,7 +2,8 @@
 from tests import test_utils
 
 
-@test_utils.test(exclude=[ti.opengl, ti.gles, ti.cc, ti.vulkan, ti.metal])
+@test_utils.test(require=ti.extension.sparse,
+                 exclude=[ti.opengl, ti.gles, ti.cc, ti.vulkan, ti.metal])
 def test_dynamic():
     x = ti.field(ti.i32)
     y = ti.field(ti.i32, shape=())
@@ -23,7 +24,8 @@ def count():
     assert y[None] == n // 3 + 1
 
 
-@test_utils.test(exclude=[ti.opengl, ti.gles, ti.cc, ti.vulkan, ti.metal])
+@test_utils.test(require=ti.extension.sparse,
+                 exclude=[ti.opengl, ti.gles, ti.cc, ti.vulkan, ti.metal])
 def test_dense_dynamic():
     n = 128