From 58e1514e18463d6681b02a0b5e8137e319ab3e9d Mon Sep 17 00:00:00 2001
From: Jingyue Wu <wujingyue@gmail.com>
Date: Mon, 25 Nov 2024 19:49:50 -0800
Subject: [PATCH] eraseInputDistinctRootDomains supports general
 logical-to-allocation transforms (#3458)

This is a spin-off from #3444.

The current code assumes that the logical-to-allocation transform has to
be a permutation. This assumption won't hold anymore with #2563, so this
PR extends eraseInputDistinctRootDomains to support more general
transforms.

Non-permutation transforms can also occur on a single GPU, although
less commonly. The tests added in this PR are single-GPU because #3444
hasn't landed yet; #3444 will add some multi-GPU tests.
---
 csrc/dynamic_transform.cpp           |  6 ---
 csrc/fusion_segmenter.cpp            | 70 ++++++++++++++++++----------
 csrc/ir/nodes.cpp                    |  2 +-
 tests/cpp/test_allocation_domain.cpp | 53 ++++++++++++++++++++-
 4 files changed, 98 insertions(+), 33 deletions(-)
diff --git a/csrc/dynamic_transform.cpp b/csrc/dynamic_transform.cpp
index 49afbac974e..24404db8d65 100644
--- a/csrc/dynamic_transform.cpp
+++ b/csrc/dynamic_transform.cpp
@@ -1048,12 +1048,6 @@ void DynamicTransformConcretizer::mutate(TensorView* tv) {
   // check the root to logical transforms to be sure we have concretized any
   // intermediate IterDomains.
 
-  // At this point, there should be no expr beyond rfactor root
-  NVF_ERROR(
-      tv->getLoopDomain() == tv->getLogicalDomain(),
-      "Invalid tensor: ",
-      tv->toString());
-
   // If it has an root domain, the IterTypes of the logical
   // IDs may need to be updated as well. Traverse the rfactor exprs
   // and mutate the IterTypes of output IDs if symbolic.
diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index 17f5227b066..506a2e81987 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -5,6 +5,9 @@
  * SPDX-License-Identifier: BSD-3-Clause
  */
 // clang-format on
+#include <algorithm>
+#include <sstream>
+
 #include <debug.h>
 #include <fusion.h>
 #include <fusion_segmenter.h>
@@ -20,9 +23,7 @@
 #include <options.h>
 #include <scheduler/debug_utils.h>
 #include <scheduler/normalization_utils.h>
-#include <algorithm>
-
-#include <sstream>
+#include <transform_iter.h>
 
 namespace nvfuser {
 
@@ -1860,35 +1861,56 @@ void eraseInputDistinctRootDomains(Fusion* fusion) {
       }
     }
 
-    NVF_ERROR(new_logical_domain.size() == tv->domain()->contiguity().size());
     TensorDomain* new_td = nullptr;
-
     if (tv->domain()->hasAllocation()) {
       // we need to reorder the logical domain into allocation domain
       // consistently with the mapping from the old TensorView logical domain to
       // its allocation domain
-      const auto& alloc = tv->getAllocationDomain();
-      NVF_ERROR(
-          alloc.size() == logical.size(),
-          "size between logical and alloc doesn't match");
-      const auto rank = alloc.size();
-      std::vector<int64_t> stride_order(rank, -1);
-      for (auto i : c10::irange(rank)) {
-        bool found_match = false;
-        for (auto j : c10::irange(rank)) {
-          if (alloc[i] == logical[j]) {
-            stride_order[j] = static_cast<int64_t>(rank - 1 - i);
-            found_match = true;
-            break;
-          }
-        }
+      std::unordered_map<IterDomain*, IterDomain*> old_to_new;
+      for (const auto i : c10::irange(logical.size())) {
+        old_to_new.emplace(logical[i], new_logical_domain[i]);
+      }
+
+      ReplayTransformations replay(tv->getAllocationDomain(), old_to_new);
+      // Without this,
+      // https://github.com/NVIDIA/Fuser/blob/e613929a6c21b3095c8817b01b8f177096a26e60/csrc/transform_iter.cpp#L299
+      // tries to look for root IDs in the map, which shouldn't exist because
+      // the whole purpose of this function is to remove the root domain.
+      replay.setErrorOnFailure(false);
+      // We don't need replay.setReplayRFactor(true). The new root is the same
+      // as the new logical so there aren't any expressions between them.
+
+      std::vector<IterDomain*> new_alloc;
+      new_alloc.reserve(tv->getAllocationDomain().size());
+      for (IterDomain* alloc_id : tv->getAllocationDomain()) {
+        new_alloc.push_back(replay.getReplay().at(alloc_id));
+      }
+
+      std::vector<IterDomain*> new_loop;
+      if (tv->getLoopDomain() == tv->getAllocationDomain()) {
+        new_loop = new_alloc;
+      } else {
         NVF_ERROR(
-            found_match,
-            "cannot match IterDomain between allocation domain to logical domain");
+            tv->getLoopDomain() == tv->getLogicalDomain(),
+            tv,
+            " has an unexpected loop domain:\n",
+            tv->domain()->toString(0, /*loop_only=*/false));
+
+        new_loop = new_logical_domain;
       }
+
       new_td = IrBuilder::create<TensorDomain>(
-          new_logical_domain, stride_order, tv->domain()->contiguity());
+          /*root_domain=*/std::vector<IterDomain*>(),
+          new_logical_domain,
+          new_alloc,
+          new_loop,
+          tv->domain()->contiguity());
     } else {
+      NVF_ERROR(
+          tv->getLoopDomain() == tv->getLogicalDomain(),
+          tv,
+          " has an unexpected loop domain:\n",
+          tv->domain()->toString(0, /*loop_only=*/false));
       new_td = IrBuilder::create<TensorDomain>(
           new_logical_domain, tv->domain()->contiguity());
     }
@@ -1909,7 +1931,7 @@ void eraseInputDistinctRootDomains(Fusion* fusion) {
             /*root_domain=*/std::vector<IterDomain*>{},
             /*logical_domain=*/new_logical,
             /*allocation=*/TensorDomain::noReductions(new_td->allocation()),
-            /*loop_domain=*/new_logical,
+            /*loop_domain=*/TensorDomain::noReductions(new_td->loop()),
             /*contiguity=*/no_red_contiguity);
       } else {
         new_td = IrBuilder::create<TensorDomain>(
diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp
index a74d32b6d67..c93c4980e85 100644
--- a/csrc/ir/nodes.cpp
+++ b/csrc/ir/nodes.cpp
@@ -3251,7 +3251,7 @@ std::string TensorDomain::toString(const int indent_size, const bool loop_only)
     }
     ss << "," << std::endl;
     indent(ss, indent_size + 1)
-        << "rfactor=[ " << toDelimitedString(logical()) << " ]";
+        << "logical=[ " << toDelimitedString(logical()) << " ]";
     if (!allocation_domain_.empty()) {
       ss << "," << std::endl;
       indent(ss, indent_size + 1)
diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp
index 55ac0ee99d4..a4c6829d122 100644
--- a/tests/cpp/test_allocation_domain.cpp
+++ b/tests/cpp/test_allocation_domain.cpp
@@ -1384,17 +1384,20 @@ TEST_F(AllocationDomainTest, ReductionVectorization) {
 }
 
 TEST_F(AllocationDomainTest, ClearReductionIterDomainsPatch) {
-  auto fusion = std::make_unique<Fusion>();
-  FusionGuard fg(fusion.get());
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
   auto tv0 = TensorViewBuilder()
                  .ndims(3)
                  .shape({-1, 1, -1})
                  .contiguity({true, std::nullopt, true})
                  .build();
   auto tv1 = sum(tv0, {2});
+
   tv1->setAllocationDomain(
       {tv1->axis(1), tv1->axis(2), tv1->axis(0)},
       {std::nullopt, std::nullopt, true});
+
   // copy entries from old domain for validation later
   std::vector<IterDomain*> logical_copy = tv1->getLogicalDomain();
   std::vector<IterDomain*> alloc_copy = tv1->getAllocationDomain();
@@ -1414,4 +1417,50 @@ TEST_F(AllocationDomainTest, ClearReductionIterDomainsPatch) {
       tv1->getContiguity(), ElementsAre(contig_copy[0], contig_copy[2]));
 }
 
+TEST_F(AllocationDomainTest, InputAllocationIsSplit_Concrete) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  TensorView* in = makeContigConcreteTensor({6});
+  TensorView* out = set(in);
+  fusion->addInput(in);
+  fusion->addOutput(out);
+
+  in->split(0, 2);
+  in->setAllocationDomain(in->getLoopDomain(), true);
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA);
+  at::Tensor in_tensor = at::randn({6}, options);
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+}
+
+// The test fails as is. The symbolic IterDomains in loop/allocation are not
+// concretized. I tried to change DynamicTransformConcretizer::mutate to grab
+// all expressions between root and allocation but still couldn't get it to
+// work.
+TEST_F(AllocationDomainTest, DISABLED_InputAllocationIsSplit_Symbolic) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  TensorView* in = makeContigTensor(1);
+  TensorView* out = set(in);
+  fusion->addInput(in);
+  fusion->addOutput(out);
+
+  in->split(0, 2);
+  in->setAllocationDomain(in->getLoopDomain(), true);
+
+  FusionExecutorCache executor_cache(std::move(fusion));
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA);
+  at::Tensor in_tensor = at::randn({6}, options);
+  auto out_tensors = executor_cache.runFusionWithInputs({in_tensor});
+
+  testValidate(
+      executor_cache.fusion(), out_tensors, {in_tensor}, __LINE__, __FILE__);
+}
+
 } // namespace nvfuser