Limit unrolling of all circular buffered loops to depth equal to prefetch (#3627)

Currently, for dynamic shapes with circular buffered loops, we unroll the following loops to different depths:
- epilogue: nominally `stages - 1`, but often emitted as a bare `#pragma unroll`, probably due to the use of `ensureStaticIndexing` in the indexing pass, since this loop always has constant extent.
- main loop: unrolled as `#pragma unroll stages`.
- prologue: fully unrolled with `#pragma unroll`, similar to the epilogue.

This PR instead unrolls each of these loops explicitly with `#pragma unroll <prefetch>`, where `prefetch` is the circular buffering prefetch distance, which is usually set to `stages - 1`.
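
For illustration, here is a hand-written sketch of the intended structure, assuming `stages = 4` and `prefetch = stages - 1 = 3`. This is not actual nvFuser-generated code; the function, tile handling, and names are simplified stand-ins, but it shows how all three loop stages now carry the same bounded hint:

```c++
// Sketch only (assumes num_tiles >= kPrefetch): every circular-buffer loop
// stage gets "#pragma unroll 3" rather than a bare "#pragma unroll".
constexpr int kStages = 4;
constexpr int kPrefetch = kStages - 1;  // circular buffering prefetch distance

__device__ void circular_buffered_copy_compute(
    const float* in,
    float* out,
    int num_tiles) {
  float buffer[kStages];  // stand-in for the shared-memory circular buffer

  // Prologue: issue the first `prefetch` loads.
  #pragma unroll 3
  for (int i = 0; i < kPrefetch; ++i) {
    buffer[i % kStages] = in[i];
  }

  // Main loop: load `prefetch` tiles ahead while computing the current tile.
  #pragma unroll 3
  for (int i = 0; i < num_tiles - kPrefetch; ++i) {
    buffer[(i + kPrefetch) % kStages] = in[i + kPrefetch];
    out[i] = 2.0f * buffer[i % kStages];
  }

  // Epilogue: drain the last `prefetch` tiles already in the buffer.
  #pragma unroll 3
  for (int i = num_tiles - kPrefetch; i < num_tiles; ++i) {
    out[i] = 2.0f * buffer[i % kStages];
  }
}
```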

### Motivation

When using static shapes, as in the Fusions we receive from Thunder, I noticed that our matmul main loops are requested to be fully unrolled (although the compiler likely does not actually unroll them fully). For example, I have seen this:
```c++
  #pragma unroll
  for(nvfuser_index_t i68 = 0; i68 < 160; ++i68)
```
This particular kernel took 35 _seconds_ to compile. After this change, we instead generate the following:
```c++
  #pragma unroll 3
  for(nvfuser_index_t i68 = 0; i68 < 160; ++i68)
```
and the compile time is under 400 ms with no change to kernel runtime.
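
For reference on the pragma semantics themselves (independent of nvFuser): when the trip count is a compile-time constant, a bare `#pragma unroll` asks the compiler to unroll the loop completely, while `#pragma unroll N` limits the unroll factor to `N`. A minimal standalone pair of toy kernels, with illustrative names, that can be compiled side by side to compare compile times:

```c++
// Toy kernels (not nvFuser output) contrasting the two unrolling hints.
__global__ void axpy_full_unroll(const float* x, float* y, float a) {
  // Bare hint: requests complete unrolling of all 160 iterations.
  #pragma unroll
  for (int i = 0; i < 160; ++i) {
    y[i] = a * x[i] + y[i];
  }
}

__global__ void axpy_capped_unroll(const float* x, float* y, float a) {
  // Bounded hint: unroll by a factor of 3, keeping the generated code small.
  #pragma unroll 3
  for (int i = 0; i < 160; ++i) {
    y[i] = a * x[i] + y[i];
  }
}
```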
jacobhinkle authored Dec 24, 2024
1 parent 6143a6b commit e214d37
Showing 3 changed files with 26 additions and 19 deletions.
25 changes: 15 additions & 10 deletions csrc/codegen.cpp
```diff
@@ -3026,17 +3026,22 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
   } else {
     step_code << gen_index << " += " << gen_step;
   }
-  if (loop->isUnrolled()) {
-    indent() << "#pragma unroll\n";
-  } else if (
-      loop->circularBufferLoopStage() == CircularBufferLoopStage::Epilog) {
-    indent() << "#pragma unroll " << loop->circularBufferLoopStageDepth() - 1
-             << "\n";
-  } else if (
-      loop->circularBufferLoopStage() !=
+  if (loop->circularBufferLoopStage() !=
       CircularBufferLoopStage::NotApplicable) {
-    indent() << "#pragma unroll " << loop->circularBufferLoopStageDepth()
-             << "\n";
+    // NOTE: requireUnroll is sometimes called on a circular-buffered matmul
+    // loops when static shapes are used. To avoid hinting that the compiler
+    // should maximally unroll such loops leading to very long compiles, we
+    // handle that case explicitly here and ignore loop->isUnrolled().
+    //
+    // Unroll "prefetch" many circular buffered loops regardless of buffer
+    // stage (prologue, main, or epilogue)
+    int64_t prefetch = kernel_->summary()
+                           .circular_buffer_info
+                           .getCircularBufferOptionsFor(loop->iter_domain())
+                           .prefetch;
+    indent() << "#pragma unroll " << prefetch << "\n";
+  } else if (loop->isUnrolled()) {
+    indent() << "#pragma unroll\n";
   } else {
     indent() << "#pragma unroll 1\n";
   }
```
6 changes: 4 additions & 2 deletions csrc/device_lower/analysis/circular_buffer.cpp
```diff
@@ -232,9 +232,11 @@ IterDomain* CircularBufferInfo::getCircularBufferAxis(
 
 const CircularBufferOptions& CircularBufferInfo::getCircularBufferOptionsFor(
     IterDomain* circular_buffer_axis) const {
-  auto concrete_id = lower_utils::getConcreteLoopID(circular_buffer_axis);
+  if (GpuLower::hasCurrent()) {
+    circular_buffer_axis = lower_utils::getConcreteLoopID(circular_buffer_axis);
+  }
 
-  auto maybe_depth_it = circular_buffer_options_.find(concrete_id);
+  auto maybe_depth_it = circular_buffer_options_.find(circular_buffer_axis);
 
   NVF_ERROR(
       maybe_depth_it != circular_buffer_options_.end(),
```
14 changes: 7 additions & 7 deletions tests/cpp/test_loop_rotation.cpp
```diff
@@ -307,7 +307,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
 nvfuser_index_t i0;
 i0 = 4LL * T0.alloc_stride[0LL];
 float T1[15LL];
-#pragma unroll
+#pragma unroll 4
 for(nvfuser_index_t i1 = 0LL; i1 < 4LL; ++i1) {
 nvfuser_index_t i2;
 i2 = 3LL * i1;
@@ -335,7 +335,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
 = T1[i6];
 }
 NVFUSER_UPDATE_MAGIC_ZERO;
-#pragma unroll 5
+#pragma unroll 4
 for(nvfuser_index_t i7 = 0LL; i7 < T0.logical_size[0LL]; ++i7) {
 nvfuser_index_t i8;
 i8 = 4LL + i7;
@@ -433,7 +433,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
 = T0[(T0.alloc_stride[1LL] * (i3 + nvfuser_zero))];
 }
 NVFUSER_UPDATE_MAGIC_ZERO;
-#pragma unroll
+#pragma unroll 4
 for(nvfuser_index_t i4 = 0LL; i4 < 4LL; ++i4) {
 nvfuser_index_t i5;
 i5 = 3LL + (3LL * i4);
@@ -474,7 +474,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
 = T1[i8];
 }
 NVFUSER_UPDATE_MAGIC_ZERO;
-#pragma unroll 5
+#pragma unroll 4
 for(nvfuser_index_t i9 = 0LL; i9 < T0.logical_size[0LL]; ++i9) {
 nvfuser_index_t i10;
 i10 = 3LL * i9;
@@ -572,7 +572,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
 i0 = toSmem(T4);
 float* ptr1;
 ptr1 = T0.data + (4LL * T0.alloc_stride[0LL]);
-#pragma unroll
+#pragma unroll 4
 for(nvfuser_index_t i2 = 0LL; i2 < 4LL; ++i2) {
 float* ptr3;
 ptr3 = T0.data + (T0.alloc_stride[0LL] * i2);
@@ -602,7 +602,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
 float T1[2LL];
 T1[0LL]
 = T4[0LL];
-#pragma unroll 5
+#pragma unroll 4
 for(nvfuser_index_t i7 = 0LL; i7 < T0.logical_size[0LL]; ++i7) {
 float* ptr8;
 ptr8 = ptr1 + (T0.alloc_stride[0LL] * i7);
@@ -633,7 +633,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
 }
 NVFUSER_UPDATE_MAGIC_ZERO;
 asm volatile("cp.async.commit_group;\n");
-#pragma unroll
+#pragma unroll 1
 for(nvfuser_index_t i14 = 0LL; i14 < 2LL; ++i14) {
 T1[((1LL + i14) % 2LL)]
 = T4[(i11 + i14)];
```
