Use tcgen05 as namespace for TMem ld/st (NVIDIA#4279)

zasdfgbnm · web-flow · commit 39aec1668e16 · 2025-04-19T15:01:21.000-07:00
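For better consistency, rename the utility names generated for TMem load/store from `tmem::load…`/`tmem::store…` to `tcgen05::load…`/`tcgen05::store…`, so they carry the same `tcgen05` prefix as the `tcgen05.ld`/`tcgen05.st` instruction strings they are matched from. Test comments that mention the old name are updated accordingly.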
diff --git a/csrc/kernel_ir.cpp b/csrc/kernel_ir.cpp
@@ -457,7 +457,7 @@ std::string Asm::utility() const {
     std::regex ld_pattern(R"(tcgen05\.ld\.sync\.aligned\.([^.]+)\.x\d+\.b32)");
     std::smatch match;
     if (std::regex_match(code, match, ld_pattern)) {
-      std::string result = "tmem::load";
+      std::string result = "tcgen05::load";
       result.append(match[1]);
       return result;
     }
@@ -466,7 +466,7 @@ std::string Asm::utility() const {
     std::regex st_pattern(R"(tcgen05\.st\.sync\.aligned\.([^.]+)\.x\d+\.b32)");
     std::smatch match;
     if (std::regex_match(code, match, st_pattern)) {
-      std::string result = "tmem::store";
+      std::string result = "tcgen05::store";
       result.append(match[1]);
       return result;
     }
diff --git a/tests/cpp/test_tmem.cpp b/tests/cpp/test_tmem.cpp
@@ -290,7 +290,7 @@ TEST_F(TMemTestCompileOnly, SetTMemDimSepPosNonTMem) {
 // But in the TMem load/store's loop domain, Ix (the ID parallelized on TIDx)
 // have extent 32. Then we will generate code like:
 //   if (threadIdx.x < 32) {
-//     tmem::load
+//     tcgen05::load
 //   }
 // For threadIdx.y == 0, it is correct. But for threadIdx.y == 1, it is wrong
 // because we are using the thread id 33-65 for the load, which is not a warp.
@@ -342,7 +342,7 @@ TEST_F(TMemTestCompileOnly, WrongStride) {
 // map is [TIDy, TIDx] = [2, 33], but in the TMem load/store's loop domain,
 // we have Iy{1}, Ix{32}. the generated code will be like:
 //   if (threadIdx.x < 32 && threadIdx.y < 1) {
-//     tmem::load
+//     tcgen05::load
 //   }
 // This is valid because we are using a whole warp for the load.
 TEST_F(TMemTest, InexactParallelType) {