Skip to content

Commit 128479b

Browse files
committed
add default warp specialization
1 parent 69ef8c9 commit 128479b

File tree

3 files changed

+6
-4
lines changed

3 files changed

+6
-4
lines changed

csrc/device_lower/pass/allocation.cpp

+1-1
Original file line number | Diff line number | Diff line change
@@ -601,7 +601,7 @@ class AllocationInserter : public kir::ExprMutator {
601601
// generic-async proxy fence and wgmma fence before each mma
602602
// instruction. For this case, we need to insert these fences
603603
// after the initialization of the accumulator, so that the
604-
// inilization is visible to the async proxy.
604+
// initialization is visible to the async proxy.
605605
// When all inputs are guarded by mbarrier, we will insert these
606606
// fences before each mma instruction, so there is no need to
607607
// insert them after the initialization of the accumulator here.

csrc/device_lower/pass/insert_syncs.cpp

+1-1
Original file line number | Diff line number | Diff line change
@@ -782,7 +782,7 @@ class ReadAfterWriteSyncs : public kir::ExprMutator {
782782
}
783783
};
784784

785-
// Insert wait expressions for WAR harzard for async operations such as wgmma
785+
// Insert wait expressions for WAR hazard for async operations such as wgmma
786786
// and tma store. To do so, we find the structure like the following example:
787787
// for 1
788788
// for 2

csrc/scheduler/hopper_multi_matmul.cpp

+4-2
Original file line number | Diff line number | Diff line change
@@ -633,15 +633,17 @@ void HopperMultipleMatmulScheduler::setUpCircularBuffering() {
633633
/*prefetch_distance=*/
634634
params_->circular_buffer_options.smem_circular_buffer_stage -
635635
params_->circular_buffer_options
636-
.smem_circular_buffer_prefetch_gap);
636+
.smem_circular_buffer_prefetch_gap,
637+
WarpSpecialized(ParallelType::TIDy));
637638
}
638639
for (TensorView* bcw_smem : bcw_smems_) {
639640
bcw_smem->circularBuffer(
640641
params_->circular_buffer_options.smem_circular_buffer_stage,
641642
/*prefetch_distance=*/
642643
params_->circular_buffer_options.smem_circular_buffer_stage -
643644
params_->circular_buffer_options
644-
.smem_circular_buffer_prefetch_gap);
645+
.smem_circular_buffer_prefetch_gap,
646+
WarpSpecialized(ParallelType::TIDy));
645647
}
646648
}
647649

0 commit comments

Comments
 (0)