From 6685991198a3a6bbe0054b502b2644a6510ad823 Mon Sep 17 00:00:00 2001
From: Jingyue Wu <wujingyue@gmail.com>
Date: Tue, 26 Nov 2024 10:44:41 -0800
Subject: [PATCH 1/2] Add a repro for #3479

---
 tests/cpp/test_allocation_domain.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp
index bff62bb98e1..a726dd6b262 100644
--- a/tests/cpp/test_allocation_domain.cpp
+++ b/tests/cpp/test_allocation_domain.cpp
@@ -1426,11 +1426,9 @@ TEST_F(AllocationDomainTest, InputAllocationIsSplit_Concrete) {
   fusion->addInput(in);
   fusion->addOutput(out);
 
-  // Ideally, loop should stay the same as logical because a fusion input comes
-  // from outside and isn't generated by a loop in the containing kernel (cf.
-  // #3479).
   in->split(0, 2);
   in->setAllocationDomain(in->getLoopDomain(), true);
+  in->setLoopDomain(in->getLogicalDomain()); // loop == logical, cf. #3479
 
   FusionExecutorCache executor_cache(std::move(fusion));
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA);

From 0d8697adac74d0a016665d8f61d747190de09052 Mon Sep 17 00:00:00 2001
From: Jingyue Wu <wujingyue@gmail.com>
Date: Thu, 19 Dec 2024 11:57:53 -0800
Subject: [PATCH 2/2] cacheInputs propagates allocation only for matmul
 schedulers.

---
 csrc/scheduler/ampere_multi_matmul.cpp |  3 ++-
 csrc/scheduler/hopper_multi_matmul.cpp |  3 ++-
 csrc/scheduler/utils.cpp               | 13 ++++++++-----
 csrc/scheduler/utils.h                 |  5 ++++-
 4 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/csrc/scheduler/ampere_multi_matmul.cpp b/csrc/scheduler/ampere_multi_matmul.cpp
index d582e9e9a10..60a237006a4 100644
--- a/csrc/scheduler/ampere_multi_matmul.cpp
+++ b/csrc/scheduler/ampere_multi_matmul.cpp
@@ -489,7 +489,8 @@ void AmpereMultipleMatmulScheduler::cacheInputsAndOutputs() {
   scheduler_utils::clearMemorySpace(fusion_);
 
   // Cache inputs
-  scheduler_utils::cacheInputs(fusion_, /*unroll=*/true);
+  scheduler_utils::cacheInputs(
+      fusion_, /*unroll=*/true, /*propagate_allocation=*/true);
 
   // Cache and fork outputs
   cached_outputs_ =
diff --git a/csrc/scheduler/hopper_multi_matmul.cpp b/csrc/scheduler/hopper_multi_matmul.cpp
index cedc7d262d5..714681d045c 100644
--- a/csrc/scheduler/hopper_multi_matmul.cpp
+++ b/csrc/scheduler/hopper_multi_matmul.cpp
@@ -101,7 +101,8 @@ void HopperMultipleMatmulScheduler::cacheInputsAndOutputs() {
   scheduler_utils::clearMemorySpace(fusion_);
 
   // Cache inputs
-  scheduler_utils::cacheInputs(fusion_, /*unroll=*/true);
+  scheduler_utils::cacheInputs(
+      fusion_, /*unroll=*/true, /*propagate_allocation=*/true);
 
   // Cache and fork outputs
   scheduler_utils::cacheAndForkOutputs(fusion_, /*unroll=*/true);
diff --git a/csrc/scheduler/utils.cpp b/csrc/scheduler/utils.cpp
index 79920ec96c7..5f7953e9352 100644
--- a/csrc/scheduler/utils.cpp
+++ b/csrc/scheduler/utils.cpp
@@ -1187,7 +1187,10 @@ void clearMemorySpace(Fusion* fusion) {
 
 // Returns cached after tensors of the fusion inputs if unrolled. Otherwise
 // return empty vector.
-std::vector<TensorView*> cacheInputs(Fusion* fusion, bool unroll) {
+std::vector<TensorView*> cacheInputs(
+    Fusion* fusion,
+    bool unroll,
+    bool propagate_allocation) {
   if (!unroll) {
     return {};
   }
@@ -1224,10 +1227,10 @@ std::vector<TensorView*> cacheInputs(Fusion* fusion, bool unroll) {
     }
 
     auto cached_tv = tv->cacheAfter(
-        /*op_type=*/LoadStoreOpType::Set,
-        /*cache_op=*/CacheOp::Unspecified,
-        /*propagate_allocation_domain=*/true,
-        /*cached_uses=*/cached_uses);
+        /*op_type=*/LoadStoreOpType::Set,
+        /*cache_op=*/CacheOp::Unspecified,
+        /*propagate_allocation_domain=*/propagate_allocation,
+        /*cached_uses=*/cached_uses);
     cached_inputs.emplace_back(cached_tv);
   }
   return cached_inputs;
diff --git a/csrc/scheduler/utils.h b/csrc/scheduler/utils.h
index 29f7f12efc6..9aa60378be6 100644
--- a/csrc/scheduler/utils.h
+++ b/csrc/scheduler/utils.h
@@ -334,7 +334,10 @@ NVF_API void clearMemorySpace(Fusion* fusion);
 
 // Returns cached after tensors of the fusion inputs if unrolled. Otherwise
 // return empty vector.
-NVF_API std::vector<TensorView*> cacheInputs(Fusion* fusion, bool unroll);
+NVF_API std::vector<TensorView*> cacheInputs(
+    Fusion* fusion,
+    bool unroll,
+    bool propagate_allocation = false); // forwarded to cacheAfter()
 
 // Returns the pairs of <cache of each fusion output, corresponding output> for
 // all outputs.