From 6685991198a3a6bbe0054b502b2644a6510ad823 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Tue, 26 Nov 2024 10:44:41 -0800 Subject: [PATCH 1/2] Add a repro for #3479 --- tests/cpp/test_allocation_domain.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp index bff62bb98e1..a726dd6b262 100644 --- a/tests/cpp/test_allocation_domain.cpp +++ b/tests/cpp/test_allocation_domain.cpp @@ -1426,11 +1426,9 @@ TEST_F(AllocationDomainTest, InputAllocationIsSplit_Concrete) { fusion->addInput(in); fusion->addOutput(out); - // Ideally, loop should stay the same as logical because a fusion input comes - // from outside and isn't generated by a loop in the containing kernel (cf. - // #3479). in->split(0, 2); in->setAllocationDomain(in->getLoopDomain(), true); + in->setLoopDomain(in->getLogicalDomain()); FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA); From 0d8697adac74d0a016665d8f61d747190de09052 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Thu, 19 Dec 2024 11:57:53 -0800 Subject: [PATCH 2/2] cacheInputs propagates allocation only for matmul schedulers. --- csrc/scheduler/ampere_multi_matmul.cpp | 3 ++- csrc/scheduler/hopper_multi_matmul.cpp | 3 ++- csrc/scheduler/utils.cpp | 13 ++++++++----- csrc/scheduler/utils.h | 5 ++++- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/csrc/scheduler/ampere_multi_matmul.cpp b/csrc/scheduler/ampere_multi_matmul.cpp index d582e9e9a10..60a237006a4 100644 --- a/csrc/scheduler/ampere_multi_matmul.cpp +++ b/csrc/scheduler/ampere_multi_matmul.cpp @@ -489,7 +489,8 @@ void AmpereMultipleMatmulScheduler::cacheInputsAndOutputs() { scheduler_utils::clearMemorySpace(fusion_); // Cache inputs - scheduler_utils::cacheInputs(fusion_, /*unroll=*/true); + scheduler_utils::cacheInputs( + fusion_, /*unroll=*/true, /*propagate_allocation=*/true); // Cache and fork outputs cached_outputs_ = diff --git a/csrc/scheduler/hopper_multi_matmul.cpp b/csrc/scheduler/hopper_multi_matmul.cpp index cedc7d262d5..714681d045c 100644 --- a/csrc/scheduler/hopper_multi_matmul.cpp +++ b/csrc/scheduler/hopper_multi_matmul.cpp @@ -101,7 +101,8 @@ void HopperMultipleMatmulScheduler::cacheInputsAndOutputs() { scheduler_utils::clearMemorySpace(fusion_); // Cache inputs - scheduler_utils::cacheInputs(fusion_, /*unroll=*/true); + scheduler_utils::cacheInputs( + fusion_, /*unroll=*/true, /*propagate_allocation=*/true); // Cache and fork outputs scheduler_utils::cacheAndForkOutputs(fusion_, /*unroll=*/true); diff --git a/csrc/scheduler/utils.cpp b/csrc/scheduler/utils.cpp index 79920ec96c7..5f7953e9352 100644 --- a/csrc/scheduler/utils.cpp +++ b/csrc/scheduler/utils.cpp @@ -1187,7 +1187,10 @@ void clearMemorySpace(Fusion* fusion) { // Returns cached after tensors of the fusion inputs if unrolled. Otherwise // return empty vector. -std::vector cacheInputs(Fusion* fusion, bool unroll) { +std::vector cacheInputs( + Fusion* fusion, + bool unroll, + bool propagate_allocation) { if (!unroll) { return {}; } @@ -1224,10 +1227,10 @@ std::vector cacheInputs(Fusion* fusion, bool unroll) { } auto cached_tv = tv->cacheAfter( - /*op_type=*/LoadStoreOpType::Set, - /*cache_op=*/CacheOp::Unspecified, - /*propagate_allocation_domain=*/true, - /*cached_uses=*/cached_uses); + LoadStoreOpType::Set, + CacheOp::Unspecified, + propagate_allocation, + cached_uses); cached_inputs.emplace_back(cached_tv); } return cached_inputs; diff --git a/csrc/scheduler/utils.h b/csrc/scheduler/utils.h index 29f7f12efc6..9aa60378be6 100644 --- a/csrc/scheduler/utils.h +++ b/csrc/scheduler/utils.h @@ -334,7 +334,10 @@ NVF_API void clearMemorySpace(Fusion* fusion); // Returns cached after tensors of the fusion inputs if unrolled. Otherwise // return empty vector. -NVF_API std::vector cacheInputs(Fusion* fusion, bool unroll); +NVF_API std::vector cacheInputs( + Fusion* fusion, + bool unroll, + bool propagate_allocation = false); // Returns the pairs of for // all outputs.