Limit unrolling of all circular buffered loops to depth equal to prefetch (#3627)

Currently, for dynamic shapes with circular buffered loops, we unroll the following loops to different depths:
- epilogue: nominally `stages - 1`, but often emitted as a bare `#pragma unroll`, probably due to the use of `ensureStaticIndexing` in the indexing pass, since this loop always has constant extent.
- main loop: unrolled as `#pragma unroll stages`.
- prologue: fully unrolled with `#pragma unroll`, similar to the epilogue.

This PR instead unrolls each of these loops explicitly with `#pragma unroll <prefetch>`, where `prefetch` is the circular buffering prefetch distance, which is usually set to `stages - 1`.
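
For illustration, here is a hand-written sketch of the intended structure, assuming `stages = 4` and `prefetch = stages - 1 = 3`. This is not actual nvFuser-generated code; the function, tile handling, and names are simplified stand-ins, but it shows how all three loop stages now carry the same bounded hint:

```c++
// Sketch only (assumes num_tiles >= kPrefetch): every circular-buffer loop
// stage gets "#pragma unroll 3" rather than a bare "#pragma unroll".
constexpr int kStages = 4;
constexpr int kPrefetch = kStages - 1;  // circular buffering prefetch distance

__device__ void circular_buffered_copy_compute(
    const float* in,
    float* out,
    int num_tiles) {
  float buffer[kStages];  // stand-in for the shared-memory circular buffer

  // Prologue: issue the first `prefetch` loads.
  #pragma unroll 3
  for (int i = 0; i < kPrefetch; ++i) {
    buffer[i % kStages] = in[i];
  }

  // Main loop: load `prefetch` tiles ahead while computing the current tile.
  #pragma unroll 3
  for (int i = 0; i < num_tiles - kPrefetch; ++i) {
    buffer[(i + kPrefetch) % kStages] = in[i + kPrefetch];
    out[i] = 2.0f * buffer[i % kStages];
  }

  // Epilogue: drain the last `prefetch` tiles already in the buffer.
  #pragma unroll 3
  for (int i = num_tiles - kPrefetch; i < num_tiles; ++i) {
    out[i] = 2.0f * buffer[i % kStages];
  }
}
```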

### Motivation

When using static shapes, as in the Fusions we receive from Thunder, I noticed that our matmul main loops are requested to be fully unrolled (although the compiler likely does not actually unroll them fully). For example, I have seen this:
```c++
  #pragma unroll
  for(nvfuser_index_t i68 = 0; i68 < 160; ++i68)
```
This particular kernel took 35 _seconds_ to compile. After this change, we instead generate the following:
```c++
  #pragma unroll 3
  for(nvfuser_index_t i68 = 0; i68 < 160; ++i68)
```
and the compile time is under 400 ms with no change to kernel runtime.
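
For reference on the pragma semantics themselves (independent of nvFuser): when the trip count is a compile-time constant, a bare `#pragma unroll` asks the compiler to unroll the loop completely, while `#pragma unroll N` limits the unroll factor to `N`. A minimal standalone pair of toy kernels, with illustrative names, that can be compiled side by side to compare compile times:

```c++
// Toy kernels (not nvFuser output) contrasting the two unrolling hints.
__global__ void axpy_full_unroll(const float* x, float* y, float a) {
  // Bare hint: requests complete unrolling of all 160 iterations.
  #pragma unroll
  for (int i = 0; i < 160; ++i) {
    y[i] = a * x[i] + y[i];
  }
}

__global__ void axpy_capped_unroll(const float* x, float* y, float a) {
  // Bounded hint: unroll by a factor of 3, keeping the generated code small.
  #pragma unroll 3
  for (int i = 0; i < 160; ++i) {
    y[i] = a * x[i] + y[i];
  }
}
```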
jacobhinkle authored Dec 24, 2024
1 parent 6143a6b commit e214d37
Showing 3 changed files with 26 additions and 19 deletions.
25 changes: 15 additions & 10 deletions csrc/codegen.cpp
```diff
@@ -3026,17 +3026,22 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
   } else {
     step_code << gen_index << " += " << gen_step;
   }
-  if (loop->isUnrolled()) {
-    indent() << "#pragma unroll\n";
-  } else if (
-      loop->circularBufferLoopStage() == CircularBufferLoopStage::Epilog) {
-    indent() << "#pragma unroll " << loop->circularBufferLoopStageDepth() - 1
-             << "\n";
-  } else if (
-      loop->circularBufferLoopStage() !=
+  if (loop->circularBufferLoopStage() !=
       CircularBufferLoopStage::NotApplicable) {
-    indent() << "#pragma unroll " << loop->circularBufferLoopStageDepth()
-             << "\n";
+    // NOTE: requireUnroll is sometimes called on a circular-buffered matmul
+    // loops when static shapes are used. To avoid hinting that the compiler
+    // should maximally unroll such loops leading to very long compiles, we
+    // handle that case explicitly here and ignore loop->isUnrolled().
+    //
+    // Unroll "prefetch" many circular buffered loops regardless of buffer
+    // stage (prologue, main, or epilogue)
+    int64_t prefetch = kernel_->summary()
+                           .circular_buffer_info
+                           .getCircularBufferOptionsFor(loop->iter_domain())
+                           .prefetch;
+    indent() << "#pragma unroll " << prefetch << "\n";
+  } else if (loop->isUnrolled()) {
+    indent() << "#pragma unroll\n";
   } else {
     indent() << "#pragma unroll 1\n";
   }
```
6 changes: 4 additions & 2 deletions csrc/device_lower/analysis/circular_buffer.cpp
```diff
@@ -232,9 +232,11 @@ IterDomain* CircularBufferInfo::getCircularBufferAxis(
 
 const CircularBufferOptions& CircularBufferInfo::getCircularBufferOptionsFor(
     IterDomain* circular_buffer_axis) const {
-  auto concrete_id = lower_utils::getConcreteLoopID(circular_buffer_axis);
+  if (GpuLower::hasCurrent()) {
+    circular_buffer_axis = lower_utils::getConcreteLoopID(circular_buffer_axis);
+  }
 
-  auto maybe_depth_it = circular_buffer_options_.find(concrete_id);
+  auto maybe_depth_it = circular_buffer_options_.find(circular_buffer_axis);
 
   NVF_ERROR(
       maybe_depth_it != circular_buffer_options_.end(),
```
14 changes: 7 additions & 7 deletions tests/cpp/test_loop_rotation.cpp
```diff
@@ -307,7 +307,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
 nvfuser_index_t i0;
 i0 = 4LL * T0.alloc_stride[0LL];
 float T1[15LL];
-#pragma unroll
+#pragma unroll 4
 for(nvfuser_index_t i1 = 0LL; i1 < 4LL; ++i1) {
 nvfuser_index_t i2;
 i2 = 3LL * i1;
@@ -335,7 +335,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
 = T1[i6];
 }
 NVFUSER_UPDATE_MAGIC_ZERO;
-#pragma unroll 5
+#pragma unroll 4
 for(nvfuser_index_t i7 = 0LL; i7 < T0.logical_size[0LL]; ++i7) {
 nvfuser_index_t i8;
 i8 = 4LL + i7;
@@ -433,7 +433,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
 = T0[(T0.alloc_stride[1LL] * (i3 + nvfuser_zero))];
 }
 NVFUSER_UPDATE_MAGIC_ZERO;
-#pragma unroll
+#pragma unroll 4
 for(nvfuser_index_t i4 = 0LL; i4 < 4LL; ++i4) {
 nvfuser_index_t i5;
 i5 = 3LL + (3LL * i4);
@@ -474,7 +474,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
 = T1[i8];
 }
 NVFUSER_UPDATE_MAGIC_ZERO;
-#pragma unroll 5
+#pragma unroll 4
 for(nvfuser_index_t i9 = 0LL; i9 < T0.logical_size[0LL]; ++i9) {
 nvfuser_index_t i10;
 i10 = 3LL * i9;
@@ -572,7 +572,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
 i0 = toSmem(T4);
 float* ptr1;
 ptr1 = T0.data + (4LL * T0.alloc_stride[0LL]);
-#pragma unroll
+#pragma unroll 4
 for(nvfuser_index_t i2 = 0LL; i2 < 4LL; ++i2) {
 float* ptr3;
 ptr3 = T0.data + (T0.alloc_stride[0LL] * i2);
@@ -602,7 +602,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
 float T1[2LL];
 T1[0LL]
 = T4[0LL];
-#pragma unroll 5
+#pragma unroll 4
 for(nvfuser_index_t i7 = 0LL; i7 < T0.logical_size[0LL]; ++i7) {
 float* ptr8;
 ptr8 = ptr1 + (T0.alloc_stride[0LL] * i7);
@@ -633,7 +633,7 @@ __global__ void CUDAGeneratedKernel(Tensor<float, 2, 2> T0, Tensor<float, 2, 2>
 }
 NVFUSER_UPDATE_MAGIC_ZERO;
 asm volatile("cp.async.commit_group;\n");
-#pragma unroll
+#pragma unroll 1
 for(nvfuser_index_t i14 = 0LL; i14 < 2LL; ++i14) {
 T1[((1LL + i14) % 2LL)]
 = T4[(i11 + i14)];
```
