From 58e1514e18463d6681b02a0b5e8137e319ab3e9d Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Mon, 25 Nov 2024 19:49:50 -0800 Subject: [PATCH] eraseInputDistinctRootDomains supports general logical-to-allocation transforms (#3458) This is a spin-off from #3444. The current code assumes that logical-to-allocation has to be a permutation. This assumption will no longer hold with #2563, so this PR tries to extend eraseInputDistinctRootDomains to support more general transforms. Non-permutation transforms can also occur in single-GPU fusions, although less commonly. The tests added in this PR are for single-GPU because #3444 hasn't landed. #3444 will add some multi-GPU tests. --- csrc/dynamic_transform.cpp | 6 --- csrc/fusion_segmenter.cpp | 70 ++++++++++++++++++---------- csrc/ir/nodes.cpp | 2 +- tests/cpp/test_allocation_domain.cpp | 53 ++++++++++++++++++++- 4 files changed, 98 insertions(+), 33 deletions(-) diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp index 49afbac974e..24404db8d65 100644 --- a/csrc/dynamic_transform.cpp +++ b/csrc/dynamic_transform.cpp @@ -1048,12 +1048,6 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) { // check the root to logical transforms to be sure we have concretized any // intermediate IterDomains. - // At this point, there should be no expr beyond rfactor root - NVF_ERROR( - tv->getLoopDomain() == tv->getLogicalDomain(), - "Invalid tensor: ", - tv->toString()); - // If it has an root domain, the IterTypes of the logical // IDs may need to be updated as well. Traverse the rfactor exprs // and mutate the IterTypes of output IDs if symbolic. 
diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp index 17f5227b066..506a2e81987 100644 --- a/csrc/fusion_segmenter.cpp +++ b/csrc/fusion_segmenter.cpp @@ -5,6 +5,9 @@ * SPDX-License-Identifier: BSD-3-Clause */ // clang-format on +#include +#include + #include #include #include @@ -20,9 +23,7 @@ #include #include #include -#include - -#include +#include namespace nvfuser { @@ -1860,35 +1861,56 @@ void eraseInputDistinctRootDomains(Fusion* fusion) { } } - NVF_ERROR(new_logical_domain.size() == tv->domain()->contiguity().size()); TensorDomain* new_td = nullptr; - if (tv->domain()->hasAllocation()) { // we need to reorder the logical domain into allocation domain // consistently with the mapping from the old TensorView logical domain to // its allocation domain - const auto& alloc = tv->getAllocationDomain(); - NVF_ERROR( - alloc.size() == logical.size(), - "size between logical and alloc doesn't match"); - const auto rank = alloc.size(); - std::vector stride_order(rank, -1); - for (auto i : c10::irange(rank)) { - bool found_match = false; - for (auto j : c10::irange(rank)) { - if (alloc[i] == logical[j]) { - stride_order[j] = static_cast(rank - 1 - i); - found_match = true; - break; - } - } + std::unordered_map old_to_new; + for (const auto i : c10::irange(logical.size())) { + old_to_new.emplace(logical[i], new_logical_domain[i]); + } + + ReplayTransformations replay(tv->getAllocationDomain(), old_to_new); + // Without this, + // https://github.com/NVIDIA/Fuser/blob/e613929a6c21b3095c8817b01b8f177096a26e60/csrc/transform_iter.cpp#L299 + // tries to look for root IDs in the map, which shouldn't exist because + // the whole purpose of this function is to remove the root domain. + replay.setErrorOnFailure(false); + // We don't need replay.setReplayRFactor(true). The new root is the same + // as the new logical so there aren't any expressions between them. 
+ + std::vector new_alloc; + new_alloc.reserve(tv->getAllocationDomain().size()); + for (IterDomain* alloc_id : tv->getAllocationDomain()) { + new_alloc.push_back(replay.getReplay().at(alloc_id)); + } + + std::vector new_loop; + if (tv->getLoopDomain() == tv->getAllocationDomain()) { + new_loop = new_alloc; + } else { NVF_ERROR( - found_match, - "cannot match IterDomain between allocation domain to logical domain"); + tv->getLoopDomain() == tv->getLogicalDomain(), + tv, + " has an unexpected loop domain:\n", + tv->domain()->toString(0, /*loop_only=*/false)); + + new_loop = new_logical_domain; } + new_td = IrBuilder::create( - new_logical_domain, stride_order, tv->domain()->contiguity()); + /*root_domain=*/std::vector(), + new_logical_domain, + new_alloc, + new_loop, + tv->domain()->contiguity()); } else { + NVF_ERROR( + tv->getLoopDomain() == tv->getLogicalDomain(), + tv, + " has an unexpected loop domain:\n", + tv->domain()->toString(0, /*loop_only=*/false)); new_td = IrBuilder::create( new_logical_domain, tv->domain()->contiguity()); } @@ -1909,7 +1931,7 @@ void eraseInputDistinctRootDomains(Fusion* fusion) { /*root_domain=*/std::vector{}, /*logical_domain=*/new_logical, /*allocation=*/TensorDomain::noReductions(new_td->allocation()), - /*loop_domain=*/new_logical, + /*loop_domain=*/TensorDomain::noReductions(new_td->loop()), /*contiguity=*/no_red_contiguity); } else { new_td = IrBuilder::create( diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp index a74d32b6d67..c93c4980e85 100644 --- a/csrc/ir/nodes.cpp +++ b/csrc/ir/nodes.cpp @@ -3251,7 +3251,7 @@ std::string TensorDomain::toString(const int indent_size, const bool loop_only) } ss << "," << std::endl; indent(ss, indent_size + 1) - << "rfactor=[ " << toDelimitedString(logical()) << " ]"; + << "logical=[ " << toDelimitedString(logical()) << " ]"; if (!allocation_domain_.empty()) { ss << "," << std::endl; indent(ss, indent_size + 1) diff --git a/tests/cpp/test_allocation_domain.cpp 
b/tests/cpp/test_allocation_domain.cpp index 55ac0ee99d4..a4c6829d122 100644 --- a/tests/cpp/test_allocation_domain.cpp +++ b/tests/cpp/test_allocation_domain.cpp @@ -1384,17 +1384,20 @@ TEST_F(AllocationDomainTest, ReductionVectorization) { } TEST_F(AllocationDomainTest, ClearReductionIterDomainsPatch) { - auto fusion = std::make_unique(); - FusionGuard fg(fusion.get()); + Fusion fusion; + FusionGuard fg(&fusion); + auto tv0 = TensorViewBuilder() .ndims(3) .shape({-1, 1, -1}) .contiguity({true, std::nullopt, true}) .build(); auto tv1 = sum(tv0, {2}); + tv1->setAllocationDomain( {tv1->axis(1), tv1->axis(2), tv1->axis(0)}, {std::nullopt, std::nullopt, true}); + // copy entries from old domain for validation later std::vector logical_copy = tv1->getLogicalDomain(); std::vector alloc_copy = tv1->getAllocationDomain(); @@ -1414,4 +1417,50 @@ TEST_F(AllocationDomainTest, ClearReductionIterDomainsPatch) { tv1->getContiguity(), ElementsAre(contig_copy[0], contig_copy[2])); } +TEST_F(AllocationDomainTest, InputAllocationIsSplit_Concrete) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + TensorView* in = makeContigConcreteTensor({6}); + TensorView* out = set(in); + fusion->addInput(in); + fusion->addOutput(out); + + in->split(0, 2); + in->setAllocationDomain(in->getLoopDomain(), true); + + FusionExecutorCache executor_cache(std::move(fusion)); + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA); + at::Tensor in_tensor = at::randn({6}, options); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); +} + +// The test fails as is. The symbolic IterDomains in loop/allocation are not +// concretized. I tried to change DynamicTransformConcretizer::mutate to grab +// all expressions between root and allocation but still couldn't get it to +// work. 
+TEST_F(AllocationDomainTest, DISABLED_InputAllocationIsSplit_Symbolic) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + TensorView* in = makeContigTensor(1); + TensorView* out = set(in); + fusion->addInput(in); + fusion->addOutput(out); + + in->split(0, 2); + in->setAllocationDomain(in->getLoopDomain(), true); + + FusionExecutorCache executor_cache(std::move(fusion)); + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA); + at::Tensor in_tensor = at::randn({6}, options); + auto out_tensors = executor_cache.runFusionWithInputs({in_tensor}); + + testValidate( + executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__); +} + } // namespace nvfuser