From e72f9e55b8e5f339ed03c3b7fe8e69aad60a741a Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Thu, 19 Dec 2024 20:19:09 -0800
Subject: [PATCH] Allow fusion of multiple exclusive resize ops (#3611)

Followup to #3556. Currently, the resize scheduler only accepts a single
slice or pad. This PR allows fusing multiple resize-based ops as long as
they don't conflict. Please see the
[comment](https://github.com/NVIDIA/Fuser/pull/3611/files#diff-b066c49d399243d3be36a44f1221490b9a2f50e41074feab836bc9bb6ee71180R25-R100)
for `getNonExclusiveResizeInfo`. In this PR, if there's a conflict, the
fusion is simply rejected. A followup PR will address this limitation by
replicating computations.
---
 csrc/scheduler/resize.cpp             |  41 +--
 csrc/scheduler/tools/resize_utils.cpp | 106 ++++++++
 csrc/scheduler/tools/resize_utils.h   |  79 ++++++
 tests/cpp/test_gpu3.cpp               |   2 -
 tests/cpp/test_resize.cpp             | 361 +++++++++++++++++++++++++-
 5 files changed, 543 insertions(+), 46 deletions(-)

diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp
index fc96bd3db67..194087b90e8 100644
--- a/csrc/scheduler/resize.cpp
+++ b/csrc/scheduler/resize.cpp
@@ -71,40 +71,19 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
   IdModel id_model(fusion, /*build_graphs=*/false);
   const auto& broadcast_graph = id_model.buildBroadcastGraph();

-  // For now, only a single resize op is allowed to exist.
   auto resize_based_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
-  if (resize_based_tensor_ops.size() != 1) {
-    scheduler_debug_utils::canScheduleRejectReason(
-        schedulerType(), "Only a single resize op is allowed.");
-    return false;
-  }
-  auto resize_out_tv =
-      resize_based_tensor_ops.at(0)->output(0)->as<TensorView>();
-
-  auto all_dep_vals = DependencyCheck::getAllValsBetween(
-      {fusion->inputs().begin(), fusion->inputs().end()}, {resize_out_tv});
-  for (auto tv : ir_utils::filterByType<TensorView>(all_dep_vals)) {
-    if (tv == resize_out_tv) {
-      continue;
-    }
-    if (tv->isFusionOutput()) {
-      scheduler_debug_utils::canScheduleRejectReason(
-          schedulerType(),
-          "Dependency to fusion output not allowed: ",
-          tv->toString());
-      return false;
-    }
-    for (auto consumer_of_tv : ir_utils::consumerTvsOf(tv)) {
-      if (std::find(all_dep_vals.begin(), all_dep_vals.end(), consumer_of_tv) ==
-          all_dep_vals.end()) {
-        scheduler_debug_utils::canScheduleRejectReason(
-            schedulerType(),
-            "Resize inputs must be exclusively consumed by resize: ",
-            consumer_of_tv->toString());
-        return false;
-      }
+  if (auto non_exclusive_resizes = scheduler_tools::getNonExclusiveResizeInfo(
+          resize_based_tensor_ops, id_model.idGraph(IdMappingMode::EXACT));
+      !non_exclusive_resizes.empty()) {
+    std::stringstream msg;
+    msg << "Propagation of resizes would affect fusion outputs.";
+    for (const auto& [tv, resize_ids] : non_exclusive_resizes) {
+      msg << " Resize input tv: " << tv->toString()
+          << ", resize input ID groups: " << nvfuser::toString(resize_ids);
     }
+    scheduler_debug_utils::canScheduleRejectReason(schedulerType(), msg.str());
+    return false;
   }

   // Slicing of or to a broadcast ID is not allowed yet.
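To make the new rejection path concrete, here is a minimal sketch of a fusion the check above now rejects (condensed from the `PropagateMultipleSlicesToInputs2` test added in this patch; only the `tv*` names and shapes are illustrative): two horizontal slices share the producer `tv1`, so propagating either resize to `tv1` would corrupt the other branch.

```cpp
// Sketch only; mirrors the test code added below in this patch.
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr;
FusionGuard fg(fusion_ptr.get());

auto tv0 = makeConcreteTensor({-1, 100});
fusion.addInput(tv0);
auto tv1 = sin(tv0);
// First slice of tv1 along axis 1.
auto tv2 = slice(
    tv1,
    {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
     {IrBuilder::create<Val>(1L), tv1->getLogicalDomain().at(1)->extent()}});
// Second, conflicting slice of the same tv1 axis with a different offset.
auto tv3 = slice(
    tv1,
    {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
     {IrBuilder::create<Val>(2L), tv1->getLogicalDomain().at(1)->extent()}});
fusion.addOutput(sin(tv2));
fusion.addOutput(sin(tv3));

// getNonExclusiveResizeInfo flags tv1 for both slices, so
// canScheduleCompileTime rejects the fusion with the message above
// instead of asserting on the op count as before.
```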
diff --git a/csrc/scheduler/tools/resize_utils.cpp b/csrc/scheduler/tools/resize_utils.cpp
index cc914e5684b..c812812b905 100644
--- a/csrc/scheduler/tools/resize_utils.cpp
+++ b/csrc/scheduler/tools/resize_utils.cpp
@@ -66,5 +66,111 @@ void propagateResizeToInputs(Expr* resize_tensor_op) {
   }
 }

+std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
+    const std::vector<Expr*>& ordered_resize_tensor_ops,
+    const ValGraph& exact_graph) {
+  NVF_ERROR(!ordered_resize_tensor_ops.empty());
+  Fusion* fusion = ordered_resize_tensor_ops[0]->fusion();
+
+  std::unordered_map<TensorView*, ValGroups> non_exclusive_resizes;
+
+  std::unordered_set<Val*> inputs{
+      fusion->inputs().begin(), fusion->inputs().end()};
+
+  auto get_root_to_logical_resizes =
+      [&exact_graph](TensorView* tv) -> ValGroups {
+    // This should only be used for outputs of resize-based ops,
+    // so it should always have a root domain.
+    NVF_ERROR(tv->hasRoot());
+    auto out_tv_root_to_logical_exprs = DependencyCheck::getAllExprsBetween(
+        {tv->getRootDomain().begin(), tv->getRootDomain().end()},
+        {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()});
+    ValGroups resize_inp_ids;
+    for (auto resize :
+         ir_utils::filterByType<Resize>(out_tv_root_to_logical_exprs)) {
+      resize_inp_ids.pushBack(exact_graph.toGroup(resize->in()));
+    }
+    return resize_inp_ids;
+  };
+
+  // Traverse the ops in a topological order
+  for (Expr* resize_tensor_op : ordered_resize_tensor_ops) {
+    auto inp_tv = dynamic_cast<TensorView*>(resize_tensor_op->inputs().at(0));
+    auto out_tv = dynamic_cast<TensorView*>(resize_tensor_op->outputs().at(0));
+
+    ValGroups resize_inp_ids = get_root_to_logical_resizes(out_tv);
+    NVF_ERROR(!resize_inp_ids.empty());
+
+    auto dep_vals =
+        DependencyCheck::getAllValsBetween(inputs, std::vector<Val*>{inp_tv});
+
+    // For each tensor that inp_tv depends on, check if the resize op
+    // is considered non-exclusive with respect to the tensor. That
+    // is, if propagation of the resize may result in externally
+    // visible changes through the tensor, the resize is considered
+    // non-exclusive.
+    for (auto dep_tv : ir_utils::filterByType<TensorView>(dep_vals)) {
+      bool maybe_non_exclusive = dep_tv->isFusionOutput();
+
+      if (!maybe_non_exclusive) {
+        // If a dependent tv has a consumer that inp_tv does not
+        // depend on, propagation of the resize would escape to
+        // outputs, which needs to be avoided.
+        for (auto consumer_tv : ir_utils::consumerTvsOf(dep_tv)) {
+          // We are interested in whether resized IDs are used by
+          // tensors other than out_tv
+          if (consumer_tv != out_tv &&
+              std::find(dep_vals.begin(), dep_vals.end(), consumer_tv) ==
+                  dep_vals.end()) {
+            maybe_non_exclusive = true;
+            break;
+          }
+        }
+      }
+
+      if (!maybe_non_exclusive) {
+        continue;
+      }
+
+      // dep_tv is either a fusion output or has a consumer outside
+      // of the dependency set of the resized tensor. Propagating the
+      // resize to dep_tv should be avoided. However, if the dep_tv
+      // iter domain that corresponds to the resized ID is a broadcast
+      // or there's no such ID, it should still be safe to consider
+      // the resize op exclusive as there's no iter domain to resize.
+      // For a concrete example, see
+      // ResizeSchedulerTest.PropagateMultipleSlicesToInputs4.
+      const auto inp_tv_logical_groups =
+          exact_graph.toGroups(inp_tv->getLogicalDomain());
+      const auto dep_tv_logical_groups =
+          exact_graph.toGroups(dep_tv->getLogicalDomain());
+      auto vals_between = getValsBetween(
+          {inp_tv_logical_groups.begin(), inp_tv_logical_groups.end()},
+          {dep_tv_logical_groups.begin(), dep_tv_logical_groups.end()},
+          exact_graph);
+
+      for (const ValGroup& resize_inp_id : resize_inp_ids) {
+        if (std::find(
+                vals_between.begin(), vals_between.end(), resize_inp_id) ==
+            vals_between.end()) {
+          // This resize can be ignored as there's no corresponding ID
+          // in the dep tv
+          continue;
+        }
+
+        // This resize input ID is not exclusively used
+        non_exclusive_resizes[inp_tv].pushBack(resize_inp_id);
+      }
+    }
+
+    // Analysis of exclusiveness up to inp_tv is now done. Subsequent
+    // resize-based tensor ops do not need to check the same section
+    // of the fusion and can start from out_tv.
+    inputs.insert(out_tv);
+  }
+
+  return non_exclusive_resizes;
+}
+
 } // namespace scheduler_tools
 } // namespace nvfuser
diff --git a/csrc/scheduler/tools/resize_utils.h b/csrc/scheduler/tools/resize_utils.h
index cf03083ad4f..7b19062d6de 100644
--- a/csrc/scheduler/tools/resize_utils.h
+++ b/csrc/scheduler/tools/resize_utils.h
@@ -7,9 +7,12 @@
 // clang-format on
 #pragma once

+#include <unordered_map>
+
 namespace nvfuser {

 class Expr;
+class TensorView;

 namespace scheduler_tools {

@@ -19,5 +22,81 @@ namespace scheduler_tools {
 // fusion inputs are skipped as their loop domains don't matter.
 void propagateResizeToInputs(Expr* resize_op);

+// Given a topologically ordered list of resize-based tensor ops such
+// as slice and pad, check if they can be propagated to fusion inputs
+// exclusively without causing any visible side effect. For example,
+// if a tensor is sliced and also is used to produce an output without
+// the slicing, the slice is considered non-exclusive as the slice
+// input has another visible consumer. Propagating the resize of the
+// slice to the slice input is invalid since the output computed from
+// the slice input depends on the full iteration space.
+//
+// For example, consider the following case:
+//
+// t0 = makeSymbolicTensor(1)
+// fusion.addInput(t0)
+// t1 = t0 + 1
+// t2 = t1[1:10]
+// t3 = t1 + 1
+// fusion.addOutput(t2)
+// fusion.addOutput(t3)
+//
+// In this case, propagating the resize op of the slice would alter
+// t1, which would in turn affect t3, which is a fusion output. Since
+// the change would be visible due to the change of t3, this resize op
+// is considered non-exclusive.
+//
+// Consider a slightly different case as shown below:
+//
+// t0 = makeSymbolicTensor(1)
+// fusion.addInput(t0)
+// t1 = t0[1:10]
+// t2 = t0 + 1
+// fusion.addOutput(t1)
+// fusion.addOutput(t2)
+//
+// Note that the slice is done directly with the fusion input. Since
+// we do not propagate resize ops to fusion inputs, this might seem
+// exclusive. However, it is still considered non-exclusive because
+// the actual scheduling inserts a cache after t0, which is indeed
+// scheduled and could cause a visible side effect if the resize were
+// propagated.
+//
+// Another source of non-exclusiveness is dependent fusion outputs.
+// For example, if a slice input depends on a fusion output,
+// propagation would alter the fusion output. 
Consider a case like:
+//
+// t0 = makeSymbolicTensor(1)
+// fusion.addInput(t0)
+// t1 = t0 + 1
+// t2 = t1[1:10] // slice
+// fusion.addOutput(t1)
+// fusion.addOutput(t2)
+//
+// If the resize op for the slice is propagated to t1, only the
+// section of [1:10] would be computed. Since that would change a
+// fusion output, the resize op is considered non-exclusive.
+//
+// When there's a chain of resize-based ops, for example:
+//
+// t0 = makeSymbolicTensor(1)
+// fusion.addInput(t0)
+// t1 = t0 + 1
+// t2 = t1[1:10]
+// t3 = t2[2:5]
+// t4 = t1 + 1
+// fusion.addOutput(t3)
+// fusion.addOutput(t4)
+//
+// Here, only the first slice is flagged: the second slice is not
+// reported as non-exclusive as long as the first slice is already
+// considered non-exclusive. This will be important when resolving
+// the non-exclusiveness by replication.
+//
+// The function returns a map from tensors that are input to
+// non-exclusive ops to their resize input ID groups. This map will be
+// used to resolve the non-exclusiveness by replication.
+std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
+    const std::vector<Expr*>& ordered_resize_tensor_ops,
+    const ValGraph& exact_graph);
+
 } // namespace scheduler_tools
 } // namespace nvfuser
diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp
index 66087eab2f5..76d45f6de4c 100644
--- a/tests/cpp/test_gpu3.cpp
+++ b/tests/cpp/test_gpu3.cpp
@@ -9249,8 +9249,6 @@ TEST_F(NVFuserTest, AllIdsMultipleDependencies) {
   tv1->split(0, 4);
   tv1->split(0, 8);

-  fusion.print();
-
   auto all_ids = tv1->domain()->allIDs();

   auto split2 = tv1->axis(0)->definition()->as<Split>();
diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp
index 4db2c141dd7..beffa0fcf98 100644
--- a/tests/cpp/test_resize.cpp
+++ b/tests/cpp/test_resize.cpp
@@ -4282,7 +4282,7 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputsWithReshape2) {
   }
 }

-TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs) {
+TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs1) {
   auto fusion_ptr = std::make_unique<Fusion>();
   Fusion& fusion = *fusion_ptr;
   FusionGuard fg(fusion_ptr.get());
@@ -4368,7 +4368,12 @@
     auto outputs = ke.run(inputs);
     testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
   } else {
-    GTEST_SKIP() << "Scheduling not yet supported";
+    // Make sure all slices are detected as exclusive
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_TRUE(non_exclusive_resize_info.empty());

     FusionExecutorCache executor_cache(std::move(fusion_ptr));
     auto out_tensors = executor_cache.runFusionWithInputs(inputs);
@@ -4387,6 +4392,296 @@
   }
 }

+// Two horizontal slices, both of which slice the same iter domain.
+TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs2) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({-1, 100});
+
+  EnableOptionsGuard enable_options_guard;
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = sin(tv0);
+
+  auto tv2 = slice(
+      tv1,
+      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(1L), tv1->getLogicalDomain().at(1)->extent()}});
+
+  auto tv3 = sin(tv2);
+
+  auto tv4 = sin(tv1);
+
+  auto tv5 = slice(
+      tv4,
+      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(2L), tv1->getLogicalDomain().at(1)->extent()}});
+
+  auto tv6 = sin(tv5);
+
+  fusion.addOutput(tv3);
+  fusion.addOutput(tv6);
+
+  IdModel id_model(&fusion, /*build_graphs=*/false);
+  const auto& exact_graph = id_model.buildExactGraph();
+
+  auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+
+  // tv1 is the input of the first slice, which is not exclusive as
+  // tv1 is also a producer of tv4.
+  EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
+  EXPECT_EQ(
+      non_exclusive_resize_info.at(tv1),
+      exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+
+  // Similarly, tv4 is the input of the second slice, which is not
+  // exclusive as tv1 is also a producer of tv2.
+  EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
+  EXPECT_EQ(
+      non_exclusive_resize_info.at(tv4),
+      exact_graph.toGroups(std::vector<IterDomain*>{tv4->axis(1)}));
+}
+
+// Non-exclusive slice due to a dependency to a fusion output
+TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs3) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({-1, 100});
+
+  EnableOptionsGuard enable_options_guard;
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = makeConcreteTensor({-1});
+  fusion.addInput(tv1);
+
+  auto tv2 = sin(tv0);
+
+  fusion.addOutput(tv2);
+
+  auto tv3 = add(tv2, broadcast(tv1, {false, true}));
+
+  auto tv4 = slice(
+      tv3,
+      {{fusion.zeroVal(), tv3->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(1L), tv3->getLogicalDomain().at(1)->extent()}});
+
+  auto tv5 = sin(tv4);
+
+  fusion.addOutput(tv5);
+
+  IdModel id_model(&fusion, /*build_graphs=*/false);
+  const auto& exact_graph = id_model.buildExactGraph();
+
+  auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+
+  // tv3 is the input of the slice, which is not exclusive as tv3
+  // depends on tv2, which is a fusion output.
+  EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
+  EXPECT_EQ(
+      non_exclusive_resize_info.at(tv3),
+      exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+}
+
+// Slice input tensor depends on a fusion output, but the slice is
+// still considered exclusive as the fusion output has no
+// corresponding ID for the sliced ID. More specifically, tv2 is a
+// fusion output that the input of the slice depends on. However, the
+// resize is done for the second axis of tv3, for which tv2 has no
+// corresponding ID. In this case, it should be safe to do the
+// propagation of the resize.
+//
+// Note that scheduling is not yet supported due to the dependency
+// from the slice input ID to the broadcast ID.
+TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs4) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({-1, 100});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = makeConcreteTensor({shape[0]});
+  fusion.addInput(tv1);
+
+  auto tv2 = sin(tv1);
+
+  fusion.addOutput(tv2);
+
+  auto tv3 = add(tv0, broadcast(tv2, {false, true}));
+
+  auto tv4 = slice(
+      tv3,
+      {{fusion.zeroVal(), tv3->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(1L), tv3->getLogicalDomain().at(1)->extent()}});
+
+  auto tv5 = sin(tv4);
+
+  fusion.addOutput(tv5);
+
+  IdModel id_model(&fusion, /*build_graphs=*/false);
+  const auto& exact_graph = id_model.buildExactGraph();
+
+  auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+
+  EXPECT_TRUE(non_exclusive_resize_info.empty());
+}
+
+// Testing chained slices, which should be considered exclusive
+TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs5) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({-1, 100});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = sin(tv0);
+
+  auto tv2 = slice(
+      tv1,
+      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(1L), tv1->getLogicalDomain().at(1)->extent()}});
+
+  auto tv3 = slice(
+      tv2,
+      {{fusion.zeroVal(), tv2->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(3L), tv2->getLogicalDomain().at(1)->extent()}});
+
+  auto tv4 = sin(tv3);
+
+  fusion.addOutput(tv4);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({16, 100}, options);
+  std::vector<c10::IValue> inputs({t0});
+
+  const bool use_scheduler = GetParam();
+
+  if (!use_scheduler) {
+    scheduler_tools::propagateResizeToInputs(tv2->definition());
+    scheduler_tools::propagateResizeToInputs(tv3->definition());
+    auto ref_tv = tv4;
+
+    // Fusion should have a uniform loop domain
+    checkLoopDomainEquivalence(ref_tv);
+
+    // Schedule the reference
+    ref_tv->flatten();
+    // For TIDx
+    ref_tv->split(0, 128);
+    // For BIDx
+    ref_tv->split(0, 4);
+
+    scheduler_tools::scheduleLoopDomainsLike(
+        fusion.allTvs(), ref_tv->getLoopDomain());
+
+    // Fusion should still have a uniform loop domain
+    checkLoopDomainEquivalence(ref_tv);
+
+    inlineMost();
+
+    // All tensors, except for fusion inputs, should be fully inlined
+    for (auto tv : fusion.allTvs()) {
+      if (tv->isFusionInput()) {
+        continue;
+      }
+      EXPECT_EQ(tv->getComputeAtPosition(), tv->nDims());
+    }
+
+    ref_tv->axis(-1)->parallelize(ParallelType::TIDx);
+    ref_tv->axis(-2)->parallelize(ParallelType::BIDx);
+
+    KernelExecutor ke;
+    ke.compile(&fusion, inputs);
+    auto outputs = ke.run(inputs);
+    testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
+  } else {
+    // The two slices do not conflict
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_TRUE(non_exclusive_resize_info.empty());
+
+    FusionExecutorCache executor_cache(std::move(fusion_ptr));
+    auto out_tensors = executor_cache.runFusionWithInputs(inputs);
+    
testValidate(
+        executor_cache.fusion(), out_tensors, inputs, __LINE__, __FILE__);
+    FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
+    EXPECT_FALSE(runtime->isSegmented());
+    const auto& heuristic_param =
+        runtime->schedulerHeuristics()->heuristicsList().front();
+    EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
+    Fusion* scheduled_fusion =
+        dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
+            ->fusion();
+    checkLoopDomainEquivalence(
+        scheduled_fusion->outputs().at(0)->as<TensorView>());
+  }
+}
+
+// Testing chained slices. The first slice is considered
+// non-exclusive, but the following slice should not be.
+TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs6) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({-1, 100});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = sin(tv0);
+
+  auto tv2 = slice(
+      tv1,
+      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(1L), tv1->getLogicalDomain().at(1)->extent()}});
+
+  auto tv3 = slice(
+      tv2,
+      {{fusion.zeroVal(), tv2->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(3L), tv2->getLogicalDomain().at(1)->extent()}});
+
+  auto tv4 = sin(tv3);
+  fusion.addOutput(tv4);
+
+  auto tv5 = sin(tv1);
+  fusion.addOutput(tv5);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({16, 100}, options);
+  std::vector<c10::IValue> inputs({t0});
+
+  // Only the first slice should be flagged as non-exclusive
+  IdModel id_model(&fusion, /*build_graphs=*/false);
+  const auto& exact_graph = id_model.buildExactGraph();
+  auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+  EXPECT_EQ(non_exclusive_resize_info.size(), 1);
+  EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
+  EXPECT_EQ(
+      non_exclusive_resize_info.at(tv1),
+      exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+}
+
 // RoPE-like rotation pattern
 TEST_P(ResizeSchedulerTest, SliceRotateCat) {
   auto fusion_ptr = std::make_unique<Fusion>();
   Fusion& fusion = *fusion_ptr;
   FusionGuard fg(fusion_ptr.get());
@@ -4451,16 +4746,6 @@ TEST_P(ResizeSchedulerTest, SliceRotateCat) {
   // For BIDx
   ref_tv->split(0, 4);

-  {
-    IdModel id_model(&fusion, false);
-    id_model.buildExactGraph();
-    std::ofstream ofs("exact_graph.dot", std::ofstream::trunc);
-    auto dot_string =
-        id_model.idGraph(IdMappingMode::EXACT).toGraphvizDotGraph();
-    ofs << dot_string;
-    ofs.close();
-  }
-
   scheduler_tools::scheduleLoopDomainsLike(
       fusion.allTvs(), ref_tv->getLoopDomain(), /*update_mode=*/true);

@@ -4485,6 +4770,26 @@
     auto outputs = ke.run(inputs);
     testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
   } else {
+    // tv1 is not considered exclusive as tv0 is also a producer of
+    // tv3, and similarly tv3 is not exclusive. While the common
+    // producer, tv0, is a fusion input and thus isn't itself
+    // scheduled, a cache is inserted for it, which is indeed
+    // scheduled, so the two slices do conflict.
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
+    EXPECT_EQ(
+        non_exclusive_resize_info.at(tv1),
+        exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+    EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
+    EXPECT_EQ(
+        non_exclusive_resize_info.at(tv3),
+        exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+    // These two entries should be all the info map has.
+    EXPECT_EQ(non_exclusive_resize_info.size(), 2);
+
     GTEST_SKIP() << "Scheduling not yet supported";

     FusionExecutorCache executor_cache(std::move(fusion_ptr));
@@ -4605,6 +4910,26 @@ TEST_P(ResizeSchedulerTest, SliceRotateCatResidual) {
     auto outputs = ke.run(inputs);
     testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
   } else {
+    // tv1 is not considered exclusive as tv0 is also a producer of
+    // tv3, and similarly tv3 is not exclusive. While the common
+    // producer, tv0, is a fusion input and thus isn't itself
+    // scheduled, a cache is inserted for it, which is indeed
+    // scheduled, so the two slices do conflict.
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
+    EXPECT_EQ(
+        non_exclusive_resize_info.at(tv1),
+        exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+    EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
+    EXPECT_EQ(
+        non_exclusive_resize_info.at(tv3),
+        exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+    // These two entries should be all the info map has.
+    EXPECT_EQ(non_exclusive_resize_info.size(), 2);
+
     GTEST_SKIP() << "Scheduling not yet supported";

     FusionExecutorCache executor_cache(std::move(fusion_ptr));
@@ -4691,6 +5016,12 @@ TEST_P(ResizeSchedulerTest, PropagatePadToInputs) {
     auto outputs = ke.run(inputs);
     testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
   } else {
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_TRUE(non_exclusive_resize_info.empty());
+
     FusionExecutorCache executor_cache(std::move(fusion_ptr));
     auto out_tensors = executor_cache.runFusionWithInputs(inputs);
     testValidate(
@@ -4787,7 +5118,11 @@ TEST_P(ResizeSchedulerTest, PropagateCatToInputs) {
     auto outputs = ke.run(inputs);
     testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
   } else {
-    GTEST_SKIP() << "Scheduling not yet supported";
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_TRUE(non_exclusive_resize_info.empty());

     FusionExecutorCache executor_cache(std::move(fusion_ptr));
     auto out_tensors = executor_cache.runFusionWithInputs(inputs);
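For reference, the manual-scheduling (`use_scheduler == false`) branches of the tests above all follow the same recipe. The following condensed sketch shows how the new utility fits together with `propagateResizeToInputs`; it assumes a fusion whose resize ops are all exclusive, and the choice of the first output as the reference tensor is illustrative, not part of the patch.

```cpp
// Condensed sketch of the manual scheduling flow used in the tests above.
IdModel id_model(&fusion, /*build_graphs=*/false);
const auto& exact_graph = id_model.buildExactGraph();

// Bail out (or, per the followup PR, replicate) if any resize is
// non-exclusive.
auto info = scheduler_tools::getNonExclusiveResizeInfo(
    ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
NVF_ERROR(info.empty(), "Conflicting resize ops");

// Propagate each resize op to the fusion inputs, in topological order.
for (Expr* op : ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion)) {
  scheduler_tools::propagateResizeToInputs(op);
}

// Schedule a reference tensor (assumed here to be the first output)
// and mirror its loop domain across all tensors.
TensorView* ref_tv = fusion.outputs().at(0)->as<TensorView>();
ref_tv->flatten();
ref_tv->split(0, 128); // For TIDx
ref_tv->split(0, 4); // For BIDx
scheduler_tools::scheduleLoopDomainsLike(
    fusion.allTvs(), ref_tv->getLoopDomain());
inlineMost();
ref_tv->axis(-1)->parallelize(ParallelType::TIDx);
ref_tv->axis(-2)->parallelize(ParallelType::BIDx);
```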