Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cacheInputs propagates allocation only for matmul schedulers. #3621

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion csrc/scheduler/ampere_multi_matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,8 @@ void AmpereMultipleMatmulScheduler::cacheInputsAndOutputs() {
scheduler_utils::clearMemorySpace(fusion_);

// Cache inputs
scheduler_utils::cacheInputs(fusion_, /*unroll=*/true);
scheduler_utils::cacheInputs(
fusion_, /*unroll=*/true, /*propagate_allocation=*/true);

// Cache and fork outputs
cached_outputs_ =
Expand Down
3 changes: 2 additions & 1 deletion csrc/scheduler/hopper_multi_matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,8 @@ void HopperMultipleMatmulScheduler::cacheInputsAndOutputs() {
scheduler_utils::clearMemorySpace(fusion_);

// Cache inputs
scheduler_utils::cacheInputs(fusion_, /*unroll=*/true);
scheduler_utils::cacheInputs(
fusion_, /*unroll=*/true, /*propagate_allocation=*/true);

// Cache and fork outputs
scheduler_utils::cacheAndForkOutputs(fusion_, /*unroll=*/true);
Expand Down
13 changes: 8 additions & 5 deletions csrc/scheduler/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1187,7 +1187,10 @@ void clearMemorySpace(Fusion* fusion) {

// Returns the cacheAfter tensors of the fusion inputs if unroll is true.
// Otherwise returns an empty vector.
std::vector<TensorView*> cacheInputs(Fusion* fusion, bool unroll) {
std::vector<TensorView*> cacheInputs(
Fusion* fusion,
bool unroll,
bool propagate_allocation) {
if (!unroll) {
return {};
}
Expand Down Expand Up @@ -1224,10 +1227,10 @@ std::vector<TensorView*> cacheInputs(Fusion* fusion, bool unroll) {
}

auto cached_tv = tv->cacheAfter(
/*op_type=*/LoadStoreOpType::Set,
/*cache_op=*/CacheOp::Unspecified,
/*propagate_allocation_domain=*/true,
/*cached_uses=*/cached_uses);
LoadStoreOpType::Set,
CacheOp::Unspecified,
propagate_allocation,
cached_uses);
cached_inputs.emplace_back(cached_tv);
}
return cached_inputs;
Expand Down
5 changes: 4 additions & 1 deletion csrc/scheduler/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,10 @@ NVF_API void clearMemorySpace(Fusion* fusion);

// Returns the cacheAfter tensors of the fusion inputs if unroll is true.
// Otherwise returns an empty vector.
NVF_API std::vector<TensorView*> cacheInputs(Fusion* fusion, bool unroll);
NVF_API std::vector<TensorView*> cacheInputs(
Fusion* fusion,
bool unroll,
bool propagate_allocation = false);

// Returns the pairs of <cache of each fusion output, corresponding output> for
// all outputs.
Expand Down
4 changes: 1 addition & 3 deletions tests/cpp/test_allocation_domain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1426,11 +1426,9 @@ TEST_F(AllocationDomainTest, InputAllocationIsSplit_Concrete) {
fusion->addInput(in);
fusion->addOutput(out);

// Ideally, loop should stay the same as logical because a fusion input comes
// from outside and isn't generated by a loop in the containing kernel (cf.
// #3479).
in->split(0, 2);
in->setAllocationDomain(in->getLoopDomain(), true);
in->setLoopDomain(in->getLogicalDomain());

FusionExecutorCache executor_cache(std::move(fusion));
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA);
Expand Down
Loading