Enable BFloat16 in stmatrix #3633

Merged: 3 commits, Dec 23, 2024
8 changes: 4 additions & 4 deletions csrc/device_lower/pass/index.cpp
@@ -1686,8 +1686,8 @@ Val* hardCodedIndexGenerationForStMatrix(
   Val* out_index = nullptr;
 
   NVF_ERROR(
-      ldst->out()->dtype() == DataType::Half,
-      "we only support half type in stmatrix");
+      dataTypeSize(ldst->out()->dtype()) == 2,
+      "we only support 16-bit types in stmatrix");
 
   NVF_ERROR(ldst->out()->isA<TensorView>());
   TensorView* out_tv = ldst->out()->as<TensorView>();
@@ -1959,8 +1959,8 @@ Val* hardCodedIndexGenerationForStMatrixSwizzle(
       "size not currently supported for stmatrix");
 
   NVF_ERROR(
-      ldst->out()->dtype() == DataType::Half,
-      "we only support half type in stmatrix");
+      dataTypeSize(ldst->out()->dtype()) == 2,
+      "we only support 16-bit types in stmatrix");
 
   NVF_ERROR(ldst->out()->isA<TensorView>());
   TensorView* out_tv = ldst->out()->as<TensorView>();
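The two hunks above relax the same guard in both index-generation paths. A size check suffices because the PTX stmatrix instruction is typed .b16: it moves raw 16-bit lanes and never interprets them as fp16 or bf16. A minimal CUDA C++ sketch of that property (the helper name and the x1 variant are illustrative, not nvFuser code):

    #include <cuda_bf16.h>
    #include <cuda_fp16.h>
    #include <cstdint>

    // Both 16-bit float formats occupy the same two bytes in registers and
    // shared memory, which is all the relaxed checks above now require.
    static_assert(
        sizeof(__half) == 2 && sizeof(__nv_bfloat16) == 2,
        "stmatrix handles any 16-bit element");

    // Warp-level store of one 8x8 tile of 16-bit elements to shared memory.
    // 'smem' is a shared-space address (e.g. from __cvta_generic_to_shared);
    // 'v' packs two 16-bit lanes. Note the .b16 operand type: the
    // instruction moves raw bits, so fp16 vs. bf16 never enters the ISA.
    __device__ inline void stmatrix_m8n8_x1(uint32_t smem, uint32_t v) {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
      asm volatile("stmatrix.sync.aligned.m8n8.x1.shared.b16 [%0], {%1};"
                   :
                   : "r"(smem), "r"(v));
    #endif
    }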
6 changes: 3 additions & 3 deletions csrc/scheduler/hopper_multi_matmul.cpp
@@ -521,9 +521,9 @@ void HopperMultipleMatmulScheduler::scheduleEpilogue() {
 
       // Set LoadStoreOp
       // TODO: extend support when mma is not cast to half
-      NVF_ERROR(
-          dc->dtype() == DataType::Half,
-          "We support smem_epilogue on hopper only when the output of mma is cast to half");
+      NVF_CHECK(
+          dataTypeSize(dc->dtype()) == 2,
+          "We support use_smem_epilogue on Hopper only when the output is 16-bit");
 
       d_smem->definition()->as<LoadStoreOp>()->setOpType(
           LoadStoreOpType::StMatrix);
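Besides the relaxed dtype test, this hunk also swaps NVF_ERROR for NVF_CHECK. Assuming nvFuser follows the convention of PyTorch's TORCH_INTERNAL_ASSERT / TORCH_CHECK pair (an assumption worth verifying against nvFuser's exception helpers), the difference is attribution: an ERROR means the compiler broke an internal invariant, while a CHECK rejects a user-reachable configuration, here requesting use_smem_epilogue with a non-16-bit output. A stand-in sketch of that split:

    #include <stdexcept>

    // Stand-ins only; the real NVF_ERROR / NVF_CHECK are nvFuser macros.
    struct InternalError : std::logic_error {  // a bug inside the compiler
      using std::logic_error::logic_error;
    };
    struct UnsupportedInput : std::runtime_error {  // valid but unsupported
      using std::runtime_error::runtime_error;
    };

    #define SKETCH_ERROR(cond, msg) \
      do {                          \
        if (!(cond))                \
          throw InternalError(msg); \
      } while (0)

    #define SKETCH_CHECK(cond, msg)    \
      do {                             \
        if (!(cond))                   \
          throw UnsupportedInput(msg); \
      } while (0)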
5 changes: 3 additions & 2 deletions csrc/scheduler/mma_utils.cpp
@@ -1311,8 +1311,9 @@ void scheduleStMatrixForMmaOutput(
       ((tile_m == 16 && tile_n == 16) || (tile_m == 16 && tile_n == 8)),
       "We only support 16x16 and 16x16 stmatrix now");
 
-  NVF_ERROR(
-      tv->dtype() == DataType::Half, "we only support half type in stmatrix");
+  NVF_CHECK(
+      dataTypeSize(tv->dtype()) == 2,
+      "we only support 16-bit types in stmatrix");
 
   // [M, N] -> [128(TIDx), N/8 , 2 , 2]
   auto s =
14 changes: 9 additions & 5 deletions tests/cpp/test_memory.cpp
@@ -2811,7 +2811,7 @@ TEST_P(LdMatrixTest, Regular) {
 
 // We get shapes M and N from MmaMacrao. The vector of ints are
 // the tile_m and tile_n factors (8x8, 16x8 and 16x16).
-using StMatrixTestParams = std::tuple<MmaMacro, std::vector<int>>;
+using StMatrixTestParams = std::tuple<MmaMacro, std::vector<int>, DataType>;
 
 class StMatrixTest : public NVFuserFixtureParamTest<StMatrixTestParams> {
  protected:
@@ -2829,6 +2829,7 @@ TEST_P(StMatrixTest, Regular) {
 
   auto macro = std::get<0>(GetParam());
   auto tile_sizes = std::get<1>(GetParam());
+  auto dtype = std::get<2>(GetParam());
   auto sizeM = getM(macro);
   auto sizeN = getN(macro);
   int64_t tile_m = tile_sizes.at(0);
@@ -2843,7 +2844,7 @@ TEST_P(StMatrixTest, Regular) {
   fusion.manage("st_matrix_m", sizeM);
   fusion.manage("st_matrix_n", sizeN);
 
-  auto tv0 = makeContigConcreteTensor({sizeM, sizeN}, DataType::Half);
+  auto tv0 = makeContigConcreteTensor({sizeM, sizeN}, dtype);
   fusion.addInput(tv0);
   // tv0 (global) -> tv1 (registers)
   auto tv1 = set(tv0);
@@ -2871,7 +2872,8 @@ TEST_P(StMatrixTest, Regular) {
   tv3->split(0, 32);
   tv3->axis(1)->parallelize(ParallelType::TIDx);
 
-  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  auto options =
+      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
   auto t0 = at::randn({sizeM, sizeN}, options);
 
   KernelExecutor ke;
@@ -2886,13 +2888,14 @@ std::string testNameStMatrixTest(
   std::ostringstream os;
   auto macro = std::get<0>(info.param);
   auto tile_sizes = std::get<1>(info.param);
+  auto dtype = std::get<2>(info.param);
   auto sizeM = getM(macro);
   auto sizeN = getN(macro);
   auto tile_m = tile_sizes.at(0);
   auto tile_n = tile_sizes.at(1);
 
   os << "m_" << sizeM << "_n_" << sizeN << "_tile_m_" << tile_m << "_tile_n_"
-     << tile_n;
+     << tile_n << "_" << mma_utils::dtypeToChar(dtype);
   return os.str();
 }

@@ -2904,7 +2907,8 @@ INSTANTIATE_TEST_SUITE_P(
         testing::Values(
             // tile_m, tile_n
            std::vector<int>{16, 8},
-            std::vector<int>{16, 16})),
+            std::vector<int>{16, 16}),
+        testing::Values(DataType::Half, DataType::BFloat16)),
     testNameStMatrixTest);
 
 TEST_P(LdMatrixTest, Transpose) {
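The test_memory.cpp changes thread the new DataType axis through the parameter tuple, the test body, the name generator, and the instantiation. A self-contained gtest sketch of that Combine-plus-name-generator pattern, using illustrative stand-in types rather than nvFuser's:

    #include <gtest/gtest.h>

    #include <string>
    #include <tuple>

    enum class Dtype { Half, BFloat16 };

    using Params = std::tuple<int, Dtype>;  // tile size x element type

    class TileDtypeTest : public testing::TestWithParam<Params> {};

    TEST_P(TileDtypeTest, ElementSize) {
      auto [tile, dtype] = GetParam();
      // Every dtype on this axis is 16-bit, mirroring the stmatrix checks.
      int bytes = (dtype == Dtype::Half || dtype == Dtype::BFloat16) ? 2 : 4;
      EXPECT_EQ(bytes, 2);
      EXPECT_GT(tile, 0);
    }

    INSTANTIATE_TEST_SUITE_P(
        All,
        TileDtypeTest,
        testing::Combine(
            testing::Values(8, 16),
            testing::Values(Dtype::Half, Dtype::BFloat16)),
        // Encode every axis in the generated name, as testNameStMatrixTest
        // does, so single instantiations stay --gtest_filter-able.
        [](const testing::TestParamInfo<Params>& info) {
          auto [tile, dtype] = info.param;
          return "tile_" + std::to_string(tile) +
              (dtype == Dtype::Half ? "_H" : "_BF");
        });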
11 changes: 6 additions & 5 deletions tests/cpp/test_mma.cpp
@@ -545,11 +545,12 @@ TEST_P(HopperRSStmatrix, SingleTileWithTMALoadStoreStMatrix) {
 
   auto cg_outputs = ke.run({inputs.first, inputs.second});
   auto tref = atMatmul(
-      inputs.first.squeeze().to(at::kFloat),
-      inputs.second.squeeze().to(at::kFloat),
-      layout);
+                  inputs.first.squeeze().to(at::kFloat),
+                  inputs.second.squeeze().to(at::kFloat),
+                  layout)
+                  .to(data_type_to_aten(dtype));
 
-  EXPECT_TRUE(at::allclose(cg_outputs[0], tref.to(at::kHalf), 1e-1, 1e-1));
+  EXPECT_TRUE(at::allclose(cg_outputs[0], tref, 1e-1, 1e-1));
 }

std::string testNameHopperRS(
Expand All @@ -569,7 +570,7 @@ INSTANTIATE_TEST_SUITE_P(
HopperRSStmatrix,
testing::Combine(
kAllHopperMacros,
testing::Values(DataType::Half),
testing::Values(DataType::Half, DataType::BFloat16),
testing::Values(MmaLayout::TN, MmaLayout::TT),
kAllSmemSwizzleModes,
testing::Values(
Expand Down
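The reference-tensor change in this file follows the usual recipe for validating low-precision matmuls: build the reference with fp32 accumulation, then cast it down to the dtype under test so both sides of the comparison are rounded identically (at::allclose also requires matching dtypes). A standalone ATen sketch of the pattern, with an illustrative helper name:

    #include <ATen/ATen.h>

    // True if 'out' (fp16 or bf16) matches an fp32-accumulated reference
    // after that reference is rounded to the same dtype. Loose tolerances
    // reflect the ~3 (fp16) and ~2 (bf16) decimal digits these formats keep.
    bool matchesLowPrecisionMatmul(
        const at::Tensor& a,
        const at::Tensor& b,
        const at::Tensor& out,
        at::ScalarType dtype) {
      auto ref = at::matmul(a.to(at::kFloat), b.to(at::kFloat)).to(dtype);
      return at::allclose(out, ref, /*rtol=*/1e-1, /*atol=*/1e-1);
    }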