Add atomic_add and BlockStripedReduce to bfloat162 #1653

Open · wants to merge 2 commits into base: main
Changes from all commits
31 changes: 31 additions & 0 deletions include/cutlass/block_striped.h
@@ -263,5 +263,36 @@ struct BlockStripedReduce<BlockThreads, ArrayT, half_t> :
};


/// Utility for performing block-striped access (load, store, reduce) of trivially-copyable,
/// statically-sized array types to global memory.
/// (Specialization for bfloat16_t. Uses __nv_bfloat162 vectorized-reduction.)
template <
int BlockThreads,
typename ArrayT>
struct BlockStripedReduce<BlockThreads, ArrayT, bfloat16_t> :
BlockStriped<
BlockThreads,
ArrayT,
__nv_bfloat162>
{
  static_assert(BlockStripedReduce::kStripes % 2 == 0, "Array of bfloat16_t must have an even length");

/// Reduce
CUTLASS_DEVICE
static void reduce(ArrayT *ptr, const ArrayT &data, int thread_idx)
{
cutlass::atomic_add<__nv_bfloat162> reduce;
__nv_bfloat162 *access_output = reinterpret_cast<__nv_bfloat162*>(ptr);
const __nv_bfloat162 *access_data = reinterpret_cast<const __nv_bfloat162*>(&data);

CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < BlockStripedReduce::kStripes; ++i)
{
reduce(access_output + (BlockThreads * i) + thread_idx, access_data[i]);
}
}
};


} // namespace cutlass
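
A quick usage sketch (not part of the diff) of how the new specialization could be exercised: each thread of a 128-thread block atomically accumulates an 8-element bfloat16_t fragment into a block-striped global workspace. The kernel name, fragment width, and block size are illustrative assumptions; the workspace is assumed to hold one fragment per thread and to be zero-initialized before launch, and the target must be sm_90 or the underlying atomic_add compiles to a no-op.

```cpp
#include "cutlass/array.h"
#include "cutlass/block_striped.h"

constexpr int kBlockThreads = 128;

// 8 bfloat16_t elements -> kStripes = 4 __nv_bfloat162 accesses per thread
// (even, as the static_assert requires).
using FragmentT = cutlass::Array<cutlass::bfloat16_t, 8>;

// Illustrative kernel: `workspace` must hold kBlockThreads fragments,
// zero-initialized before launch.
__global__ void accumulate_partials(FragmentT *workspace)
{
  FragmentT frag;
  frag.fill(cutlass::bfloat16_t(1.0f));  // each thread contributes all-ones

  // Atomically adds this thread's fragment into the striped workspace,
  // two bfloat16_t values per red.bf16x2 instruction.
  cutlass::BlockStripedReduce<kBlockThreads, FragmentT, cutlass::bfloat16_t>::reduce(
      workspace, frag, threadIdx.x);
}
```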

17 changes: 17 additions & 0 deletions include/cutlass/functional.h
@@ -626,6 +626,23 @@ struct atomic_add<half2>
}
};

template<>
struct atomic_add<__nv_bfloat162>
{
CUTLASS_DEVICE
  void operator()(__nv_bfloat162 *ptr, const __nv_bfloat162 &data)
{
#if !defined(__CUDA_ARCH__) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 900))
CUTLASS_UNUSED(ptr);
CUTLASS_UNUSED(data);
#else
// Vector-2 bf16 atomic reduction requires .target sm_90 or higher
uint32_t word = reinterpret_cast<const uint32_t&>(data);
asm volatile ("red.gpu.global.add.noftz.bf16x2 [%0], %1;\n" : : "l"(ptr), "r"(word));
#endif // (__CUDA_ARCH__ >= 900)
}
};

template <typename T>
using red [[deprecated("use atomic_add instead")]] = atomic_add<T>;
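
For completeness, a minimal sketch (again, not part of the diff) of invoking the new functor directly. Assumptions: `buf` points to a zero-initialized __nv_bfloat162 in global memory and the kernel is compiled for sm_90; on older targets the operator is a no-op, as guarded above. The kernel name is illustrative.

```cpp
#include <cuda_bf16.h>
#include "cutlass/functional.h"

__global__ void add_pairs(__nv_bfloat162 *buf)
{
  cutlass::atomic_add<__nv_bfloat162> adder;

  // Pack (1.0, 2.0) into a bf16x2 value and reduce it into *buf; every
  // participating thread issues one atomic red.bf16x2.
  __nv_bfloat162 value = __floats2bfloat162_rn(1.0f, 2.0f);
  adder(buf, value);
}
```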
