NVIDIA · wujingyue · Nov 26, 2024 · Dec 19, 2024
diff --git a/csrc/scheduler/ampere_multi_matmul.cpp b/csrc/scheduler/ampere_multi_matmul.cpp
@@ -489,7 +489,8 @@ void AmpereMultipleMatmulScheduler::cacheInputsAndOutputs() {
   scheduler_utils::clearMemorySpace(fusion_);
 
   // Cache inputs
-  scheduler_utils::cacheInputs(fusion_, /*unroll=*/true);
+  scheduler_utils::cacheInputs(
+      fusion_, /*unroll=*/true, /*propagate_allocation=*/true);
 
   // Cache and fork outputs
   cached_outputs_ =

diff --git a/csrc/scheduler/hopper_multi_matmul.cpp b/csrc/scheduler/hopper_multi_matmul.cpp
@@ -101,7 +101,8 @@ void HopperMultipleMatmulScheduler::cacheInputsAndOutputs() {
   scheduler_utils::clearMemorySpace(fusion_);
 
   // Cache inputs
-  scheduler_utils::cacheInputs(fusion_, /*unroll=*/true);
+  scheduler_utils::cacheInputs(
+      fusion_, /*unroll=*/true, /*propagate_allocation=*/true);
 
   // Cache and fork outputs
   scheduler_utils::cacheAndForkOutputs(fusion_, /*unroll=*/true);

diff --git a/csrc/scheduler/utils.cpp b/csrc/scheduler/utils.cpp
@@ -1187,7 +1187,10 @@ void clearMemorySpace(Fusion* fusion) {
 
 // Returns cached after tensors of the fusion inputs if unrolled. Otherwise
 // return empty vector.
-std::vector<TensorView*> cacheInputs(Fusion* fusion, bool unroll) {
+std::vector<TensorView*> cacheInputs(
+    Fusion* fusion,
+    bool unroll,
+    bool propagate_allocation) {
   if (!unroll) {
     return {};
   }
@@ -1224,10 +1227,10 @@ std::vector<TensorView*> cacheInputs(Fusion* fusion, bool unroll) {
     }
 
     auto cached_tv = tv->cacheAfter(
         /*op_type=*/LoadStoreOpType::Set,
         /*cache_op=*/CacheOp::Unspecified,
-        /*propagate_allocation_domain=*/true,
+        /*propagate_allocation_domain=*/propagate_allocation,
         /*cached_uses=*/cached_uses);
     cached_inputs.emplace_back(cached_tv);
   }
   return cached_inputs;

diff --git a/csrc/scheduler/utils.h b/csrc/scheduler/utils.h
@@ -334,7 +334,14 @@ NVF_API void clearMemorySpace(Fusion* fusion);
 
 // Returns cached after tensors of the fusion inputs if unrolled. Otherwise
 // return empty vector.
-NVF_API std::vector<TensorView*> cacheInputs(Fusion* fusion, bool unroll);
+//
+// `propagate_allocation` is forwarded to `cacheAfter` as its
+// `propagate_allocation_domain` argument. It defaults to false; callers that
+// need it pass true explicitly.
+NVF_API std::vector<TensorView*> cacheInputs(
+    Fusion* fusion,
+    bool unroll,
+    bool propagate_allocation = false);
 
 // Returns the pairs of <cache of each fusion output, corresponding output> for
 // all outputs.

diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp
@@ -1426,11 +1426,12 @@ TEST_F(AllocationDomainTest, InputAllocationIsSplit_Concrete) {
   fusion->addInput(in);
   fusion->addOutput(out);
 
-  // Ideally, loop should stay the same as logical because a fusion input comes
-  // from outside and isn't generated by a loop in the containing kernel (cf.
-  // #3479).
   in->split(0, 2);
   in->setAllocationDomain(in->getLoopDomain(), true);
+  // The split above exists only to build the allocation domain. Reset loop to
+  // logical: a fusion input comes from outside and isn't generated by a loop
+  // in the containing kernel (cf. #3479).
+  in->setLoopDomain(in->getLogicalDomain());
 
   FusionExecutorCache executor_cache(std::move(fusion));
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA);