diff --git a/__tmp_kernel_none_f0_c0_r0_g0.cu b/__tmp_kernel_none_f0_c0_r0_g0.cu index 4d7cd525a32..ea10ce2335c 100644 --- a/__tmp_kernel_none_f0_c0_r0_g0.cu +++ b/__tmp_kernel_none_f0_c0_r0_g0.cu @@ -11344,6 +11344,7 @@ __global__ void __cluster_dims__(2, 1, 1) nvfuser_none_f0_c0_r0_g0(Tensor<__half } __syncthreads(); if ((((nvfuser_index_t)threadIdx.y) == 2)) { + asm volatile("{setmaxnreg.dec.sync.aligned.u32 56; \n\t}"); if ((Hopper::electSync(4294967295U) && b16)) { #pragma unroll 4 for(nvfuser_index_t i24 = 0; i24 < i3; ++i24) { @@ -11368,11 +11369,13 @@ __global__ void __cluster_dims__(2, 1, 1) nvfuser_none_f0_c0_r0_g0(Tensor<__half } } } + return; } else { #pragma unroll for(nvfuser_index_t i31 = 0; i31 < 4; ++i31) { mbarrier::arrive(toSmem((&T8[(i31 + 4LL)]))); } + asm volatile("{setmaxnreg.inc.sync.aligned.u32 224; \n\t}"); #pragma unroll 4 for(nvfuser_index_t i32 = 0; i32 < i3; ++i32) { nvfuser_index_t i33; diff --git a/report3.nsys-rep b/report3.nsys-rep new file mode 100644 index 00000000000..1ef9a36451c Binary files /dev/null and b/report3.nsys-rep differ diff --git a/report3.sqlite b/report3.sqlite new file mode 100644 index 00000000000..dff2c9517bc Binary files /dev/null and b/report3.sqlite differ