Skip to content

Commit

Permalink
eraseInputDistinctRootDomains supports general logical-to-allocation …
Browse files Browse the repository at this point in the history
…transforms (#3458)

This is a spin-off from #3444. 

The current code assumes that logical-to-allocation has to be a
permutation. This assumption won't hold any more with #2563. So this PR
tries to extend eraseInputDistinctRootDomains to support more general
transforms.

This can also happen in single-GPU fusions, although it is less common. The
tests added in this PR are single-GPU because #3444 hasn't landed; #3444 will
add some multi-GPU tests.
  • Loading branch information
wujingyue authored Nov 26, 2024
1 parent c4a0335 commit 58e1514
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 33 deletions.
6 changes: 0 additions & 6 deletions csrc/dynamic_transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1048,12 +1048,6 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) {
// check the root to logical transforms to be sure we have concretized any
// intermediate IterDomains.

// At this point, there should be no expr beyond rfactor root
NVF_ERROR(
tv->getLoopDomain() == tv->getLogicalDomain(),
"Invalid tensor: ",
tv->toString());

// If it has an root domain, the IterTypes of the logical
// IDs may need to be updated as well. Traverse the rfactor exprs
// and mutate the IterTypes of output IDs if symbolic.
Expand Down
70 changes: 46 additions & 24 deletions csrc/fusion_segmenter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#include <algorithm>
#include <sstream>

#include <debug.h>
#include <fusion.h>
#include <fusion_segmenter.h>
Expand All @@ -20,9 +23,7 @@
#include <options.h>
#include <scheduler/debug_utils.h>
#include <scheduler/normalization_utils.h>
#include <algorithm>

#include <sstream>
#include <transform_iter.h>

namespace nvfuser {

Expand Down Expand Up @@ -1860,35 +1861,56 @@ void eraseInputDistinctRootDomains(Fusion* fusion) {
}
}

NVF_ERROR(new_logical_domain.size() == tv->domain()->contiguity().size());
TensorDomain* new_td = nullptr;

if (tv->domain()->hasAllocation()) {
// we need to reorder the logical domain into allocation domain
// consistently with the mapping from the old TensorView logical domain to
// its allocation domain
const auto& alloc = tv->getAllocationDomain();
NVF_ERROR(
alloc.size() == logical.size(),
"size between logical and alloc doesn't match");
const auto rank = alloc.size();
std::vector<int64_t> stride_order(rank, -1);
for (auto i : c10::irange(rank)) {
bool found_match = false;
for (auto j : c10::irange(rank)) {
if (alloc[i] == logical[j]) {
stride_order[j] = static_cast<int64_t>(rank - 1 - i);
found_match = true;
break;
}
}
std::unordered_map<IterDomain*, IterDomain*> old_to_new;
for (const auto i : c10::irange(logical.size())) {
old_to_new.emplace(logical[i], new_logical_domain[i]);
}

ReplayTransformations replay(tv->getAllocationDomain(), old_to_new);
// Without this,
// https://github.com/NVIDIA/Fuser/blob/e613929a6c21b3095c8817b01b8f177096a26e60/csrc/transform_iter.cpp#L299
// tries to look for root IDs in the map, which shouldn't exist because
// the whole purpose of this function is to remove the root domain.
replay.setErrorOnFailure(false);
// We don't need replay.setReplayRFactor(true). The new root is the same
// as the new logical so there aren't any expressions between them.

std::vector<IterDomain*> new_alloc;
new_alloc.reserve(tv->getAllocationDomain().size());
for (IterDomain* alloc_id : tv->getAllocationDomain()) {
new_alloc.push_back(replay.getReplay().at(alloc_id));
}

std::vector<IterDomain*> new_loop;
if (tv->getLoopDomain() == tv->getAllocationDomain()) {
new_loop = new_alloc;
} else {
NVF_ERROR(
found_match,
"cannot match IterDomain between allocation domain to logical domain");
tv->getLoopDomain() == tv->getLogicalDomain(),
tv,
" has an unexpected loop domain:\n",
tv->domain()->toString(0, /*loop_only=*/false));

new_loop = new_logical_domain;
}

new_td = IrBuilder::create<TensorDomain>(
new_logical_domain, stride_order, tv->domain()->contiguity());
/*root_domain=*/std::vector<IterDomain*>(),
new_logical_domain,
new_alloc,
new_loop,
tv->domain()->contiguity());
} else {
NVF_ERROR(
tv->getLoopDomain() == tv->getLogicalDomain(),
tv,
" has an unexpected loop domain:\n",
tv->domain()->toString(0, /*loop_only=*/false));
new_td = IrBuilder::create<TensorDomain>(
new_logical_domain, tv->domain()->contiguity());
}
Expand All @@ -1909,7 +1931,7 @@ void eraseInputDistinctRootDomains(Fusion* fusion) {
/*root_domain=*/std::vector<IterDomain*>{},
/*logical_domain=*/new_logical,
/*allocation=*/TensorDomain::noReductions(new_td->allocation()),
/*loop_domain=*/new_logical,
/*loop_domain=*/TensorDomain::noReductions(new_td->loop()),
/*contiguity=*/no_red_contiguity);
} else {
new_td = IrBuilder::create<TensorDomain>(
Expand Down
2 changes: 1 addition & 1 deletion csrc/ir/nodes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3251,7 +3251,7 @@ std::string TensorDomain::toString(const int indent_size, const bool loop_only)
}
ss << "," << std::endl;
indent(ss, indent_size + 1)
<< "rfactor=[ " << toDelimitedString(logical()) << " ]";
<< "logical=[ " << toDelimitedString(logical()) << " ]";
if (!allocation_domain_.empty()) {
ss << "," << std::endl;
indent(ss, indent_size + 1)
Expand Down
53 changes: 51 additions & 2 deletions tests/cpp/test_allocation_domain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1384,17 +1384,20 @@ TEST_F(AllocationDomainTest, ReductionVectorization) {
}

TEST_F(AllocationDomainTest, ClearReductionIterDomainsPatch) {
auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());
Fusion fusion;
FusionGuard fg(&fusion);

auto tv0 = TensorViewBuilder()
.ndims(3)
.shape({-1, 1, -1})
.contiguity({true, std::nullopt, true})
.build();
auto tv1 = sum(tv0, {2});

tv1->setAllocationDomain(
{tv1->axis(1), tv1->axis(2), tv1->axis(0)},
{std::nullopt, std::nullopt, true});

// copy entries from old domain for validation later
std::vector<IterDomain*> logical_copy = tv1->getLogicalDomain();
std::vector<IterDomain*> alloc_copy = tv1->getAllocationDomain();
Expand All @@ -1414,4 +1417,50 @@ TEST_F(AllocationDomainTest, ClearReductionIterDomainsPatch) {
tv1->getContiguity(), ElementsAre(contig_copy[0], contig_copy[2]));
}

// Checks that a fusion input whose allocation domain is produced by splitting
// its logical domain (i.e. a non-permutation logical-to-allocation transform)
// is segmented and executed correctly when the extent is concrete.
TEST_F(AllocationDomainTest, InputAllocationIsSplit_Concrete) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard guard(fusion.get());

  TensorView* tv_in = makeContigConcreteTensor({6});
  TensorView* tv_out = set(tv_in);
  fusion->addInput(tv_in);
  fusion->addOutput(tv_out);

  // Split the sole axis so the allocation domain is no longer a permutation
  // of the logical domain.
  tv_in->split(0, 2);
  tv_in->setAllocationDomain(tv_in->getLoopDomain(), true);

  FusionExecutorCache executor_cache(std::move(fusion));
  at::Tensor in_tensor = at::randn(
      {6}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA));
  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});

  testValidate(
      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
}

// NOTE(review): disabled because the test fails as is — the symbolic
// IterDomains in the loop/allocation domains are not concretized. Changing
// DynamicTransformConcretizer::mutate to walk all expressions between root
// and allocation was attempted but did not make it work.
TEST_F(AllocationDomainTest, DISABLED_InputAllocationIsSplit_Symbolic) {
  auto fusion = std::make_unique<Fusion>();
  FusionGuard guard(fusion.get());

  // Same scenario as the _Concrete variant, but the input extent is symbolic.
  TensorView* tv_in = makeContigTensor(1);
  TensorView* tv_out = set(tv_in);
  fusion->addInput(tv_in);
  fusion->addOutput(tv_out);

  // Split the sole axis so the allocation domain is no longer a permutation
  // of the logical domain.
  tv_in->split(0, 2);
  tv_in->setAllocationDomain(tv_in->getLoopDomain(), true);

  FusionExecutorCache executor_cache(std::move(fusion));
  at::Tensor in_tensor = at::randn(
      {6}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA));
  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});

  testValidate(
      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
}

} // namespace nvfuser

0 comments on commit 58e1514

Please sign in to comment.