From 1a370e7ec3c8a7c2fd00e8d8ed60f01e75dc7614 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Tue, 17 Dec 2024 21:18:32 -0800
Subject: [PATCH 1/9] Exclusiveness analysis

---
 csrc/scheduler/resize.cpp             |  36 +--
 csrc/scheduler/tools/resize_utils.cpp | 106 ++++++++
 csrc/scheduler/tools/resize_utils.h   |  79 ++++++
 tests/cpp/test_gpu3.cpp               |   2 -
 tests/cpp/test_resize.cpp             | 356 +++++++++++++++++++++++++-
 5 files changed, 533 insertions(+), 46 deletions(-)

diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp
index fc96bd3db67..ce1513f47af 100644
--- a/csrc/scheduler/resize.cpp
+++ b/csrc/scheduler/resize.cpp
@@ -71,42 +71,16 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
   IdModel id_model(fusion, /*build_graphs=*/false);
   const auto& broadcast_graph = id_model.buildBroadcastGraph();
 
-  // For now, only a single resize op is allowed to exist.
   auto resize_based_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
-  if (resize_based_tensor_ops.size() != 1) {
+
+  if (auto non_exclusive_resizes = scheduler_tools::getNonExclusiveResizeInfo(
+          resize_based_tensor_ops, id_model.idGraph(IdMappingMode::EXACT));
+      !non_exclusive_resizes.empty()) {
     scheduler_debug_utils::canScheduleRejectReason(
-        schedulerType(), "Only a single resize op is allowed.");
+        schedulerType(), "Not exclusively consumed.");
     return false;
   }
 
-  auto resize_out_tv =
-      resize_based_tensor_ops.at(0)->output(0)->as<TensorView>();
-
-  auto all_dep_vals = DependencyCheck::getAllValsBetween(
-      {fusion->inputs().begin(), fusion->inputs().end()}, {resize_out_tv});
-  for (auto tv : ir_utils::filterByType<TensorView>(all_dep_vals)) {
-    if (tv == resize_out_tv) {
-      continue;
-    }
-    if (tv->isFusionOutput()) {
-      scheduler_debug_utils::canScheduleRejectReason(
-          schedulerType(),
-          "Dependency to fusion output not allowed: ",
-          tv->toString());
-      return false;
-    }
-    for (auto consumer_of_tv : ir_utils::consumerTvsOf(tv)) {
-      if (std::find(all_dep_vals.begin(), all_dep_vals.end(), consumer_of_tv) ==
-          all_dep_vals.end()) {
-        scheduler_debug_utils::canScheduleRejectReason(
-            schedulerType(),
-            "Resize inputs must be exclusively consumed by resize: ",
-            consumer_of_tv->toString());
-        return false;
-      }
-    }
-  }
-
   // Slicing of or to a broadcast ID is not allowed yet.
   for (auto tensor_op : resize_based_tensor_ops) {
     TensorView* out_tv = tensor_op->output(0)->as<TensorView>();

diff --git a/csrc/scheduler/tools/resize_utils.cpp b/csrc/scheduler/tools/resize_utils.cpp
index cc914e5684b..f1206f99676 100644
--- a/csrc/scheduler/tools/resize_utils.cpp
+++ b/csrc/scheduler/tools/resize_utils.cpp
@@ -66,5 +66,111 @@ void propagateResizeToInputs(Expr* resize_tensor_op) {
   }
 }
 
+std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
+    const std::vector<Expr*>& ordered_resize_tensor_ops,
+    const ValGraph& exact_graph) {
+  NVF_ERROR(!ordered_resize_tensor_ops.empty());
+  Fusion* fusion = ordered_resize_tensor_ops[0]->fusion();
+
+  std::unordered_map<TensorView*, ValGroups> non_exclusive_resizes;
+
+  std::unordered_set<Val*> inputs{
+      fusion->inputs().begin(), fusion->inputs().end()};
+
+  auto get_root_to_logical_resizes =
+      [&exact_graph](TensorView* tv) -> ValGroups {
+    auto out_tv_root_to_logical_exprs = DependencyCheck::getAllExprsBetween(
+        {tv->getRootDomain().begin(), tv->getRootDomain().end()},
+        {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()});
+    ValGroups resize_inp_ids;
+    for (auto resize :
+         ir_utils::filterByType<Resize>(out_tv_root_to_logical_exprs)) {
+      resize_inp_ids.pushBack(exact_graph.toGroup(resize->in()));
+    }
+    return resize_inp_ids;
+  };
+
+  // Traverse the ops in a topological order
+  for (Expr* resize_tensor_op : ordered_resize_tensor_ops) {
+    auto inp_tv = dynamic_cast<TensorView*>(resize_tensor_op->inputs().at(0));
+    auto out_tv = dynamic_cast<TensorView*>(resize_tensor_op->outputs().at(0));
+
+    ValGroups resize_inp_ids = get_root_to_logical_resizes(out_tv);
+    NVF_ERROR(!resize_inp_ids.empty());
+
+    auto dep_vals =
+        DependencyCheck::getAllValsBetween(inputs, std::vector<Val*>{inp_tv});
+
+    // For each tensor that inp_tv depends on, check if the resize op
+    // is considered non-exclusive with respect to the tensor. That
+    // is, if propagation of the resize may result in externally
+    // visible changes through the tensor, the resize is considered
+    // non-exclusive.
+    for (auto dep_tv : ir_utils::filterByType<TensorView>(dep_vals)) {
+      bool maybe_non_exclusive = false;
+
+      if (dep_tv->isFusionOutput()) {
+        maybe_non_exclusive = true;
+      }
+
+      if (!maybe_non_exclusive) {
+        // If a dependent tv has a consumer that inp_tv does not
+        // depend on, propagation of resize would escape to outputs,
+        // which needs to be avoided.
+        for (auto consumer_tv : ir_utils::consumerTvsOf(dep_tv)) {
+          // We are interested in if resized IDs are used by other tensors
+          // than out_tv
+          if (consumer_tv != out_tv &&
+              std::find(dep_vals.begin(), dep_vals.end(), consumer_tv) ==
+                  dep_vals.end()) {
+            maybe_non_exclusive = true;
+            break;
+          }
+        }
+      }
+
+      if (!maybe_non_exclusive) {
+        continue;
+      }
+
+      // dep_tv potentially is either a fusion output or it has a
+      // consumer outside of the dependency set to the resized
+      // tensor. Propagating the resize to dep_tv should be
+      // avoided. However, if the dep_tv iter domain that corresponds
+      // to the resized ID is a broadcast or there's no such ID, it
+      // should still be safe to consider the resize op exclusive as
+      // there's no iter domain to resize.
+      const auto inp_tv_logical_groups =
+          exact_graph.toGroups(inp_tv->getLogicalDomain());
+      const auto dep_tv_logical_groups =
+          exact_graph.toGroups(dep_tv->getLogicalDomain());
+      auto vals_between = getValsBetween(
+          {inp_tv_logical_groups.begin(), inp_tv_logical_groups.end()},
+          {dep_tv_logical_groups.begin(), dep_tv_logical_groups.end()},
+          exact_graph);
+
+      for (const ValGroup& resize_inp_id : resize_inp_ids) {
+        if (std::find(
+                vals_between.begin(), vals_between.end(), resize_inp_id) ==
+            vals_between.end()) {
+          // This resize can be ignored as there's no corresponding ID
+          // in the dep tv
+          continue;
+        }
+
+        // This resize input ID is not exclusively used
+        non_exclusive_resizes[inp_tv].pushBack(resize_inp_id);
+      }
+    }
+
+    // Analysis of exclusiveness until in_tv is done. Following
+    // resize-based tensor ops do not need to check the same section
+    // of the fusion and can start from out_tv.
+    inputs.insert(out_tv);
+  }
+
+  return non_exclusive_resizes;
+}
+
 } // namespace scheduler_tools
 } // namespace nvfuser

diff --git a/csrc/scheduler/tools/resize_utils.h b/csrc/scheduler/tools/resize_utils.h
index cf03083ad4f..1245f166c65 100644
--- a/csrc/scheduler/tools/resize_utils.h
+++ b/csrc/scheduler/tools/resize_utils.h
@@ -7,9 +7,12 @@
 // clang-format on
 #pragma once
 
+#include <val_graph.h>
+
 namespace nvfuser {
 
 class Expr;
+class TensorView;
 
 namespace scheduler_tools {
 
@@ -19,5 +22,81 @@ namespace scheduler_tools {
 // fusion inputs are skipped as their loop domains don't matter.
 void propagateResizeToInputs(Expr* resize_op);
 
+// Given a topologically ordered list of resize-based tensor ops such
+// as slice and pad, check if they can be propagated to fusion inputs
+// exclusively without causing any visible side effect. For example,
+// if a tensor is sliced and also is used to produce an output without
+// the slicing, the slice is considered non exclusive as the slice
+// input has another visible consumer. Propagating the resize of the
+// slice to the slice input is invalid since the output computed from
+// the slice input depends on the full iteration space.
+//
+// For example, consider the following case:
+//
+// t0 = makeSymbolicTensor(1)
+// fusion.addInput(t0)
+// t1 = t0 + 1
+// t2 = t1[1:10]
+// t3 = t1 + 1
+// fusion.addOutput(t2)
+// fusion.addOutput(t3)
+//
+// In this case, propagating the resize op of the slice would alter t1,
+// which would in turn affect t3, which is a fusion output. Since the
+// change would be visible due to the change of t3, this resie op is
+// considered non-exclusive.
+//
+// Consider a slightly different case as shown below:
+//
+// t0 = makeSymbolicTensor(1)
+// fusion.addInput(t0)
+// t1 = t0[1:10]
+// t2 = t0 + 1
+// fusion.addOutput(t1)
+// fusion.addOutput(t2)
+//
+// Note that the slice is directly done with the fusion input. Since
+// we do not propagate resize ops to fusion inputs, this can be
+// considered exclusive. However, this is also considered
+// non-exclusive since the actual scheduling inserts a cache after t0,
+// which can cause a visible side effect if the resize is propagated.
+//
+// Another non-exclusivess comes from dependent fusion outputs. For
+// example, if a slice input depends on a fusion output, propation
+// would alter the fusion output. Consider a case like:
+//
+// t0 = makeSymbolicTensor(1)
+// fusion.addInput(t0)
+// t1 = t0 + 1
+// t2 = t1[1:10] // slice
+// fusion.addOutput(t1)
+// fusion.addOutput(t2)
+//
+// If the resize op for the slice is propagated to t1, only the
+// section of [1:10] would be computed. Since that would change a
+// fusion output, the resize op is considered non-exclusive.
+//
+// When there's a chain of resize-based ops, for example:
+//
+// t0 = makeSymbolicTensor(1)
+// fusion.addInput(t0)
+// t1 = t0 + 1
+// t2 = t1[1:10]
+// t3 = t2[2:5]
+// t4 = t1 + 1
+// fusion.addOutput(t3)
+// fusion.addOutput(t4)
+//
+// We do not consider the second slice as non-exclusive as
+// long as the first slice is considered non-exclusive. This will be
+// important when resolving the non-exclusiveness by replication.
+//
+// The function returns a map from tensors that are input to
+// non-exclusive ops to their resize input ID groups. This map will be
+// used to resolve the non-exclusiveness by replication.
+std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
+    const std::vector<Expr*>& ordered_resize_tensor_ops,
+    const ValGraph& exact_graph);
+
 } // namespace scheduler_tools
 } // namespace nvfuser

diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp
index 66087eab2f5..76d45f6de4c 100644
--- a/tests/cpp/test_gpu3.cpp
+++ b/tests/cpp/test_gpu3.cpp
@@ -9249,8 +9249,6 @@ TEST_F(NVFuserTest, AllIdsMultipleDependencies) {
   tv1->split(0, 4);
   tv1->split(0, 8);
 
-  fusion.print();
-
   auto all_ids = tv1->domain()->allIDs();
 
   auto split2 = tv1->axis(0)->definition()->as<Split>();

diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp
index fb6c74512f2..1c3ca839baa 100644
--- a/tests/cpp/test_resize.cpp
+++ b/tests/cpp/test_resize.cpp
@@ -4282,7 +4282,7 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputsWithReshape2) {
   }
 }
 
-TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs) {
+TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs1) {
   auto fusion_ptr = std::make_unique<Fusion>();
   Fusion& fusion = *fusion_ptr;
   FusionGuard fg(fusion_ptr.get());
@@ -4368,7 +4368,251 @@ TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs) {
     auto outputs = ke.run(inputs);
     testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
   } else {
-    GTEST_SKIP() << "Scheduling not yet supported";
+    // Make sure all slices are detected as exclusive
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_TRUE(non_exclusive_resize_info.empty());
+
+    FusionExecutorCache executor_cache(std::move(fusion_ptr));
+    auto out_tensors = executor_cache.runFusionWithInputs(inputs);
+    testValidate(
+        executor_cache.fusion(), out_tensors, inputs, __LINE__, __FILE__);
+    FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
+    EXPECT_FALSE(runtime->isSegmented());
+    const auto& heuristic_param =
+        runtime->schedulerHeuristics()->heuristicsList().front();
+    EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
+    Fusion* scheduled_fusion =
+        dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
+            ->fusion();
+    checkLoopDomainEquivalence(
+        scheduled_fusion->outputs().at(0)->as<TensorView>());
+  }
+}
+
+// Two horizontal slices, both of which slice the same iter domain.
+TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs2) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({-1, 100});
+
+  EnableOptionsGuard enable_options_guard;
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = sin(tv0);
+
+  auto tv2 = slice(
+      tv1,
+      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(1L), tv1->getLogicalDomain().at(1)->extent()}});
+
+  auto tv3 = sin(tv2);
+
+  auto tv4 = sin(tv1);
+
+  auto tv5 = slice(
+      tv4,
+      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(2L), tv1->getLogicalDomain().at(1)->extent()}});
+
+  auto tv6 = sin(tv5);
+
+  fusion.addOutput(tv3);
+  fusion.addOutput(tv6);
+
+  IdModel id_model(&fusion, /*build_graphs=*/false);
+  const auto& exact_graph = id_model.buildExactGraph();
+
+  auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+
+  // tv1 is the input of the first slice, which is not exclusive as
+  // tv1 is also a producer of tv4.
+  EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
+  EXPECT_EQ(
+      non_exclusive_resize_info.at(tv1),
+      exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+
+  // Similarly, tv4 is the input of the second slice, which is not exclusive as
+  // tv1 is also a producer of tv2.
+  EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
+  EXPECT_EQ(
+      non_exclusive_resize_info.at(tv4),
+      exact_graph.toGroups(std::vector<IterDomain*>{tv4->axis(1)}));
+}
+
+// Non-exclusive slice due to a dependency to a fusion output
+TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs3) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({-1, 100});
+
+  EnableOptionsGuard enable_options_guard;
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = makeConcreteTensor({-1});
+  fusion.addInput(tv1);
+
+  auto tv2 = sin(tv0);
+
+  fusion.addOutput(tv2);
+
+  auto tv3 = add(tv2, broadcast(tv1, {false, true}));
+
+  auto tv4 = slice(
+      tv3,
+      {{fusion.zeroVal(), tv3->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(1L), tv3->getLogicalDomain().at(1)->extent()}});
+
+  auto tv5 = sin(tv4);
+
+  fusion.addOutput(tv5);
+
+  IdModel id_model(&fusion, /*build_graphs=*/false);
+  const auto& exact_graph = id_model.buildExactGraph();
+
+  auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+
+  // tv3 is the input of the slice, which is not exclusive as
+  // tv3 depends on tv2, which is a fusion output
+  EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
+  EXPECT_EQ(
+      non_exclusive_resize_info.at(tv3),
+      exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+}
+
+// Slice input tensor depends on a fusion output, but the slice is
+// still considered exclusive as the fusion output has no
+// corresponding ID for the sliced ID. Note that scheduling is not yet
+// supported due to the existence of the dependency from the slice input
+// ID to the broadcast ID.
+TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs4) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({-1, 100});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = makeConcreteTensor({shape[0]});
+  fusion.addInput(tv1);
+
+  auto tv2 = sin(tv1);
+
+  fusion.addOutput(tv2);
+
+  auto tv3 = add(tv0, broadcast(tv2, {false, true}));
+
+  auto tv4 = slice(
+      tv3,
+      {{fusion.zeroVal(), tv3->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(1L), tv3->getLogicalDomain().at(1)->extent()}});
+
+  auto tv5 = sin(tv4);
+
+  fusion.addOutput(tv5);
+
+  IdModel id_model(&fusion, /*build_graphs=*/false);
+  const auto& exact_graph = id_model.buildExactGraph();
+
+  auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+
+  EXPECT_TRUE(non_exclusive_resize_info.empty());
+}
+
+// Testing chained slices. Should be considered exclusive
+TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs5) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({-1, 100});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = sin(tv0);
+
+  auto tv2 = slice(
+      tv1,
+      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(1L), tv1->getLogicalDomain().at(1)->extent()}});
+
+  auto tv3 = slice(
+      tv2,
+      {{fusion.zeroVal(), tv2->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(3L), tv2->getLogicalDomain().at(1)->extent()}});
+
+  auto tv4 = sin(tv3);
+
+  fusion.addOutput(tv4);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({16, 100}, options);
+  std::vector<c10::IValue> inputs({t0});
+
+  const bool use_scheduler = GetParam();
+
+  if (!use_scheduler) {
+    scheduler_tools::propagateResizeToInputs(tv2->definition());
+    scheduler_tools::propagateResizeToInputs(tv3->definition());
+    auto ref_tv = tv4;
+
+    // Fusion should have a uniform loop domain
+    checkLoopDomainEquivalence(ref_tv);
+
+    // Schedule the reference
+    ref_tv->flatten();
+    // For TIDx
+    ref_tv->split(0, 128);
+    // For BIDx
+    ref_tv->split(0, 4);
+
+    scheduler_tools::scheduleLoopDomainsLike(
+        fusion.allTvs(), ref_tv->getLoopDomain());
+
+    // Fusion should still have a uniform loop domain
+    checkLoopDomainEquivalence(ref_tv);
+
+    inlineMost();
+
+    // All tensors, except for fusion inputs, should be fully inlined
+    for (auto tv : fusion.allTvs()) {
+      if (tv->isFusionInput()) {
+        continue;
+      }
+      EXPECT_EQ(tv->getComputeAtPosition(), tv->nDims());
+    }
+
+    ref_tv->axis(-1)->parallelize(ParallelType::TIDx);
+    ref_tv->axis(-2)->parallelize(ParallelType::BIDx);
+
+    KernelExecutor ke;
+    ke.compile(&fusion, inputs);
+    auto outputs = ke.run(inputs);
+    testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
+  } else {
+    // The two slices do not conflict
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_TRUE(non_exclusive_resize_info.empty());
 
     FusionExecutorCache executor_cache(std::move(fusion_ptr));
     auto out_tensors = executor_cache.runFusionWithInputs(inputs);
@@ -4387,6 +4631,52 @@ TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs) {
   }
 }
 
+// Testing chained slices. The first slice is considered
+// non-exclusive, but the following slice should not be.
+TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs6) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({-1, 100});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = sin(tv0);
+
+  auto tv2 = slice(
+      tv1,
+      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(1L), tv1->getLogicalDomain().at(1)->extent()}});
+
+  auto tv3 = slice(
+      tv2,
+      {{fusion.zeroVal(), tv2->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(3L), tv2->getLogicalDomain().at(1)->extent()}});
+
+  auto tv4 = sin(tv3);
+  fusion.addOutput(tv4);
+
+  auto tv5 = sin(tv1);
+  fusion.addOutput(tv5);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({16, 100}, options);
+  std::vector<c10::IValue> inputs({t0});
+
+  // The two slices do not conflict
+  IdModel id_model(&fusion, /*build_graphs=*/false);
+  const auto& exact_graph = id_model.buildExactGraph();
+  auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+  EXPECT_EQ(non_exclusive_resize_info.size(), 1);
+  EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
+  EXPECT_EQ(
+      non_exclusive_resize_info.at(tv1),
+      exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+}
+
 // RoPE-like rotation pattern
 TEST_P(ResizeSchedulerTest, SliceRotateCat) {
   auto fusion_ptr = std::make_unique<Fusion>();
@@ -4451,16 +4741,6 @@ TEST_P(ResizeSchedulerTest, SliceRotateCat) {
   // For BIDx
   ref_tv->split(0, 4);
 
-  {
-    IdModel id_model(&fusion, false);
-    id_model.buildExactGraph();
-    std::ofstream ofs("exact_graph.dot", std::ofstream::trunc);
-    auto dot_string =
-        id_model.idGraph(IdMappingMode::EXACT).toGraphvizDotGraph();
-    ofs << dot_string;
-    ofs.close();
-  }
-
   scheduler_tools::scheduleLoopDomainsLike(
       fusion.allTvs(), ref_tv->getLoopDomain(), /*update_mode=*/true);
 
@@ -4485,6 +4765,26 @@ TEST_P(ResizeSchedulerTest, SliceRotateCat) {
     auto outputs = ke.run(inputs);
     testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
   } else {
+    // tv1 is not considered exclusive as tv0 is also a producer of
+    // tv3. Same for tv3. While the common input, tv0, is a fusion
+    // input and is not itself scheduled, a cache tensor is inserted
+    // and is indeed scheduled, so the two slices do conflict.
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
+    EXPECT_EQ(
+        non_exclusive_resize_info.at(tv1),
+        exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+    EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
+    EXPECT_EQ(
+        non_exclusive_resize_info.at(tv3),
+        exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+    // These two entries should be all the info map has.
+    EXPECT_EQ(non_exclusive_resize_info.size(), 2);
+
     GTEST_SKIP() << "Scheduling not yet supported";
 
     FusionExecutorCache executor_cache(std::move(fusion_ptr));
@@ -4605,6 +4905,26 @@ TEST_P(ResizeSchedulerTest, SliceRotateCatResidual) {
     auto outputs = ke.run(inputs);
     testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
   } else {
+    // tv1 is not considered exclusive as tv0 is also a producer of
+    // tv3. Same for tv3. While the common input, tv0, is a fusion
+    // input and is not itself scheduled, a cache tensor is inserted
+    // and is indeed scheduled, so the two slices do conflict.
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
+    EXPECT_EQ(
+        non_exclusive_resize_info.at(tv1),
+        exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+    EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
+    EXPECT_EQ(
+        non_exclusive_resize_info.at(tv3),
+        exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+    // These two entries should be all the info map has.
+    EXPECT_EQ(non_exclusive_resize_info.size(), 2);
+
     GTEST_SKIP() << "Scheduling not yet supported";
 
     FusionExecutorCache executor_cache(std::move(fusion_ptr));
@@ -4691,6 +5011,12 @@ TEST_P(ResizeSchedulerTest, PropagatePadToInputs) {
     auto outputs = ke.run(inputs);
     testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
   } else {
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_TRUE(non_exclusive_resize_info.empty());
+
     FusionExecutorCache executor_cache(std::move(fusion_ptr));
     auto out_tensors = executor_cache.runFusionWithInputs(inputs);
     testValidate(
@@ -4787,7 +5113,11 @@ TEST_P(ResizeSchedulerTest, PropagateCatToInputs) {
     auto outputs = ke.run(inputs);
     testValidate(&fusion, outputs, inputs, __LINE__, __FILE__);
   } else {
-    GTEST_SKIP() << "Scheduling not yet supported";
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_TRUE(non_exclusive_resize_info.empty());
 
     FusionExecutorCache executor_cache(std::move(fusion_ptr));
     auto out_tensors = executor_cache.runFusionWithInputs(inputs);

From ac5a1bc2f2fc551197f65444e0108d9915c00280 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Wed, 18 Dec 2024 08:48:30 -0800
Subject: [PATCH 2/9] cleanup

---
 csrc/scheduler/resize.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp
index ce1513f47af..194087b90e8 100644
--- a/csrc/scheduler/resize.cpp
+++ b/csrc/scheduler/resize.cpp
@@ -76,8 +76,13 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
   if (auto non_exclusive_resizes = scheduler_tools::getNonExclusiveResizeInfo(
           resize_based_tensor_ops, id_model.idGraph(IdMappingMode::EXACT));
       !non_exclusive_resizes.empty()) {
-    scheduler_debug_utils::canScheduleRejectReason(
-        schedulerType(), "Not exclusively consumed.");
+    std::stringstream msg;
+    msg << "Propagation of resizes would affect fusion outputs.";
+    for (const auto& [tv, resize_ids] : non_exclusive_resizes) {
+      msg << " Resize input tv: " << tv->toString()
+          << ", resize input ID groups: " << nvfuser::toString(resize_ids);
+    }
+    scheduler_debug_utils::canScheduleRejectReason(schedulerType(), msg.str());
     return false;
   }
 
From 8b8c708d5dcbf06770810346deedc76801e04295 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Thu, 19 Dec 2024 12:12:43 -0800
Subject: [PATCH 3/9] cleanup

---
 csrc/scheduler/tools/resize_utils.cpp | 9 ++++-----
 csrc/scheduler/tools/resize_utils.h   | 6 +++---
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/csrc/scheduler/tools/resize_utils.cpp b/csrc/scheduler/tools/resize_utils.cpp
index f1206f99676..4771a66cf31 100644
--- a/csrc/scheduler/tools/resize_utils.cpp
+++ b/csrc/scheduler/tools/resize_utils.cpp
@@ -79,6 +79,9 @@ std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
 
   auto get_root_to_logical_resizes =
       [&exact_graph](TensorView* tv) -> ValGroups {
+    // This should be only used for outputs of resize-based ops,
+    // so it should always have a root domain.
+    NVF_ERROR(tv->hasRoot());
     auto out_tv_root_to_logical_exprs = DependencyCheck::getAllExprsBetween(
         {tv->getRootDomain().begin(), tv->getRootDomain().end()},
         {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()});
@@ -107,11 +110,7 @@ std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
     // visible changes through the tensor, the resize is considered
     // non-exclusive.
     for (auto dep_tv : ir_utils::filterByType<TensorView>(dep_vals)) {
-      bool maybe_non_exclusive = false;
-
-      if (dep_tv->isFusionOutput()) {
-        maybe_non_exclusive = true;
-      }
+      bool maybe_non_exclusive = dep_tv->isFusionOutput();
 
       if (!maybe_non_exclusive) {
         // If a dependent tv has a consumer that inp_tv does not

diff --git a/csrc/scheduler/tools/resize_utils.h b/csrc/scheduler/tools/resize_utils.h
index 1245f166c65..7b19062d6de 100644
--- a/csrc/scheduler/tools/resize_utils.h
+++ b/csrc/scheduler/tools/resize_utils.h
@@ -43,7 +43,7 @@ void propagateResizeToInputs(Expr* resize_op);
 //
 // In this case, propagating the resize op of the slice would alter t1,
 // which would in turn affect t3, which is a fusion output. Since the
-// change would be visible due to the change of t3, this resie op is
+// change would be visible due to the change of t3, this resize op is
 // considered non-exclusive.
 //
 // Consider a slightly different case as shown below:
@@ -61,8 +61,8 @@ void propagateResizeToInputs(Expr* resize_op);
 // non-exclusive since the actual scheduling inserts a cache after t0,
 // which can cause a visible side effect if the resize is propagated.
 //
-// Another non-exclusivess comes from dependent fusion outputs. For
-// example, if a slice input depends on a fusion output, propation
+// Another non-exclusiveness comes from dependent fusion outputs. For
+// example, if a slice input depends on a fusion output, propagation
 // would alter the fusion output. Consider a case like:
 //
 // t0 = makeSymbolicTensor(1)

From d364442808cce87532d24fe13b5b755a6d2ee25b Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Thu, 19 Dec 2024 12:26:32 -0800
Subject: [PATCH 4/9] PR feedback

---
 csrc/scheduler/tools/resize_utils.cpp |  3 ++-
 tests/cpp/test_resize.cpp             | 11 ++++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/csrc/scheduler/tools/resize_utils.cpp b/csrc/scheduler/tools/resize_utils.cpp
index 4771a66cf31..c812812b905 100644
--- a/csrc/scheduler/tools/resize_utils.cpp
+++ b/csrc/scheduler/tools/resize_utils.cpp
@@ -138,7 +138,8 @@ std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
       // avoided. However, if the dep_tv iter domain that corresponds
       // to the resized ID is a broadcast or there's no such ID, it
       // should still be safe to consider the resize op exclusive as
-      // there's no iter domain to resize.
+      // there's no iter domain to resize. For a concrete example, see
+      // ResizeSchedulerTest.PropagateMultipleSlicesToInputs4.
       const auto inp_tv_logical_groups =
           exact_graph.toGroups(inp_tv->getLogicalDomain());
       const auto dep_tv_logical_groups =
           exact_graph.toGroups(dep_tv->getLogicalDomain());

diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp
index 1c3ca839baa..f0a35dee3a7 100644
--- a/tests/cpp/test_resize.cpp
+++ b/tests/cpp/test_resize.cpp
@@ -4496,9 +4496,14 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs3) {
 
 // Slice input tensor depends on a fusion output, but the slice is
 // still considered exclusive as the fusion output has no
-// corresponding ID for the sliced ID. Note that scheduling is not yet
-// supported due to the existence of the dependency from the slice input
-// ID to the broadcast ID.
+// corresponding ID for the sliced ID. More specifically, tv2 is a
+// fusion output and has a dependency to the input of the
+// slice. However, the resize is done for the second axis of tv3,
+// for which tv2 has no corresponding ID. In this case, it should be
+// safe to do the propagation of the resize.
+//
+// Note that scheduling is not yet supported due to the existence of
+// the dependency from the slice input ID to the broadcast ID.
 TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs4) {

From 7380a40b45003b5b42336479f2ca6e61245c249b Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Thu, 19 Dec 2024 12:54:09 -0800
Subject: [PATCH 5/9] Resolve conflicts by recomputation

---
 csrc/scheduler/resize.cpp             |  61 ++++++--
 csrc/scheduler/tools/resize_utils.cpp |  14 +-
 csrc/scheduler/tools/resize_utils.h   |  16 +-
 tests/cpp/test_resize.cpp             | 208 ++++++++++++++++++++------
 4 files changed, 235 insertions(+), 64 deletions(-)

diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp
index 194087b90e8..dece9f79238 100644
--- a/csrc/scheduler/resize.cpp
+++ b/csrc/scheduler/resize.cpp
@@ -73,19 +73,6 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
 
   auto resize_based_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
 
-  if (auto non_exclusive_resizes = scheduler_tools::getNonExclusiveResizeInfo(
-          resize_based_tensor_ops, id_model.idGraph(IdMappingMode::EXACT));
-      !non_exclusive_resizes.empty()) {
-    std::stringstream msg;
-    msg << "Propagation of resizes would affect fusion outputs.";
-    for (const auto& [tv, resize_ids] : non_exclusive_resizes) {
-      msg << " Resize input tv: " << tv->toString()
-          << ", resize input ID groups: " << nvfuser::toString(resize_ids);
-    }
-    scheduler_debug_utils::canScheduleRejectReason(schedulerType(), msg.str());
-    return false;
-  }
-
   // Slicing of or to a broadcast ID is not allowed yet.
@@ -133,6 +120,30 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
     return false;
   }
 
+  for (auto out_tv : ir_utils::filterByType<TensorView>(fusion->outputs())) {
+    if (out_tv == ref_tv) {
+      continue;
+    }
+    auto exprs = ValGraphBFS::getExprGroupsBetween(
+                     broadcast_graph,
+                     broadcast_graph.toGroups(ref_tv->getLogicalDomain()),
+                     broadcast_graph.toGroups(out_tv->getLogicalDomain()),
+                     /*require_all_to_visited=*/false)
+                     .first;
+    for (const auto& [expr_g, dir] : exprs) {
+      if (expr_g->front()->isA<Resize>()) {
+        std::stringstream msg;
+        msg << "Resize between reference and output not allowed.";
+        msg << " Reference: " << ref_tv->toString()
+            << ". Output: " << out_tv->toString()
+            << ". Resize: " << expr_g->front()->toString();
+        scheduler_debug_utils::canScheduleRejectReason(
+            schedulerType(), msg.str());
+        return false;
+      }
+    }
+  }
+
   // Disable the scheduler if there's a squeeze op. The loop option
   // may also need to be enabled in that case, but that option is not
   // turned on automatically yet.
@@ -163,6 +174,21 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) {
   scheduler_utils::cacheInputs(fusion, true);
   scheduler_utils::cacheAndForkOutputs(fusion, true);
 
+  auto resize_based_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
+
+  IdModel id_model(fusion, /*build_graphs=*/false);
+  const auto& exact_graph = id_model.buildExactGraph();
+
+  // Replicate resize inputs if necessary to avoid conflicting propagations
+  for (const auto& [out_tv, exlusivity_info] :
+       scheduler_tools::getNonExclusiveResizeInfo(
+           resize_based_tensor_ops, exact_graph)) {
+    auto resize_based_op = out_tv->definition();
+    auto inp_tv = resize_based_op->input(0)->as<TensorView>();
+    auto inp_tv_copy = RecomputeTv::recompute(inp_tv);
+    ir_utils::replaceValInExprInputs(resize_based_op, inp_tv, inp_tv_copy);
+  }
+
   for (auto expr : fusion->exprs()) {
     if (!expr->isOneOf<SliceOp, PadOp>()) {
       continue;
@@ -186,9 +212,14 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) {
   ref_tv->axis(-1)->parallelize(ParallelType::TIDx);
   ref_tv->axis(-2)->parallelize(ParallelType::BIDx);
 
-  // Propagate the reference to the other tensors
+  // Propagate the reference to the other tensors. Note that the
+  // update flag is enabled to work around the resize propagation
+  // issue. This may not work if there's a tensor that is reshaped
+  // from the reference tensor, but that should not be the case as the
+  // reference is picked by the same routine used for the pointwise
+  // scheduler.
   scheduler_tools::scheduleLoopDomainsLike(
-      fusion->allTvs(), ref_tv->getLoopDomain());
+      fusion->allTvs(), ref_tv->getLoopDomain(), true);
 
   inlineMost();

diff --git a/csrc/scheduler/tools/resize_utils.cpp b/csrc/scheduler/tools/resize_utils.cpp
index c812812b905..26ec36fa8ec 100644
--- a/csrc/scheduler/tools/resize_utils.cpp
+++ b/csrc/scheduler/tools/resize_utils.cpp
@@ -66,13 +66,13 @@ void propagateResizeToInputs(Expr* resize_tensor_op) {
   }
 }
 
-std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
+std::unordered_map<TensorView*, ResizeExclusivityInfo> getNonExclusiveResizeInfo(
     const std::vector<Expr*>& ordered_resize_tensor_ops,
     const ValGraph& exact_graph) {
   NVF_ERROR(!ordered_resize_tensor_ops.empty());
   Fusion* fusion = ordered_resize_tensor_ops[0]->fusion();
 
-  std::unordered_map<TensorView*, ValGroups> non_exclusive_resizes;
+  std::unordered_map<TensorView*, ResizeExclusivityInfo> non_exclusive_resizes;
 
   std::unordered_set<Val*> inputs{
       fusion->inputs().begin(), fusion->inputs().end()};
@@ -98,6 +98,8 @@ std::unordered_map<TensorView*, ResizeExclusivityInfo> getNonExclusiveResizeInfo(
     auto inp_tv = dynamic_cast<TensorView*>(resize_tensor_op->inputs().at(0));
     auto out_tv = dynamic_cast<TensorView*>(resize_tensor_op->outputs().at(0));
 
+    ResizeExclusivityInfo info;
+
     ValGroups resize_inp_ids = get_root_to_logical_resizes(out_tv);
     NVF_ERROR(!resize_inp_ids.empty());
 
@@ -159,10 +161,16 @@ std::unordered_map<TensorView*, ResizeExclusivityInfo> getNonExclusiveResizeInfo(
       }
 
       // This resize input ID is not exclusively used
-      non_exclusive_resizes[inp_tv].pushBack(resize_inp_id);
+      info.shared_tvs.push_back(dep_tv);
+      info.resized_ids.pushBack(resize_inp_id);
      }
    }
 
+    if (!info.shared_tvs.empty()) {
+      NVF_ERROR(non_exclusive_resizes.emplace(out_tv, info).second);
+    }
+
     // Analysis of exclusiveness until in_tv is done. Following
     // resize-based tensor ops do not need to check the same section
     // of the fusion and can start from out_tv.
     inputs.insert(out_tv);
   }

diff --git a/csrc/scheduler/tools/resize_utils.h b/csrc/scheduler/tools/resize_utils.h
index 7b19062d6de..b3704bb7326 100644
--- a/csrc/scheduler/tools/resize_utils.h
+++ b/csrc/scheduler/tools/resize_utils.h
@@ -94,7 +94,21 @@ void propagateResizeToInputs(Expr* resize_op);
 // The function returns a map from tensors that are input to
 // non-exclusive ops to their resize input ID groups. This map will be
 // used to resolve the non-exclusiveness by replication.
-std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
+struct ResizeExclusivityInfo {
+  std::vector<TensorView*> shared_tvs;
+  ValGroups resized_ids;
+
+  bool operator==(const ResizeExclusivityInfo& other) const {
+    return shared_tvs == other.shared_tvs && resized_ids == other.resized_ids;
+  }
+
+  bool operator!=(const ResizeExclusivityInfo& other) const {
+    return !(*this == other);
+  }
+};
+
+std::unordered_map<TensorView*, ResizeExclusivityInfo> getNonExclusiveResizeInfo(
     const std::vector<Expr*>& ordered_resize_tensor_ops,
     const ValGraph& exact_graph);

diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp
index f0a35dee3a7..74e44c31841 100644
--- a/tests/cpp/test_resize.cpp
+++ b/tests/cpp/test_resize.cpp
@@ -4427,25 +4427,83 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs2) {
   fusion.addOutput(tv3);
   fusion.addOutput(tv6);
 
-  IdModel id_model(&fusion, /*build_graphs=*/false);
-  const auto& exact_graph = id_model.buildExactGraph();
+  {
+    IdModel id_model(&fusion, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
 
-  auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
-      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+    EXPECT_EQ(non_exclusive_resize_info.size(), 2);
 
-  // tv1 is the input of the first slice, which is not exclusive as
-  // tv1 is also a producer of tv4.
-  EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
-  EXPECT_EQ(
-      non_exclusive_resize_info.at(tv1),
-      exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+    // tv2 is the output of the first slice, which is not exclusive as
+    // tv1 is also a producer of tv4.
+    EXPECT_EQ(non_exclusive_resize_info.count(tv2), 1);
+    scheduler_tools::ResizeExclusivityInfo tv2_info{
+        {tv1}, exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv2), tv2_info);
+
+    // Similarly, tv5 is the output of the second slice, which is not exclusive
+    // as tv1 is also a producer of tv2.
+    EXPECT_EQ(non_exclusive_resize_info.count(tv5), 1);
+    scheduler_tools::ResizeExclusivityInfo tv5_info{
+        {tv1}, exact_graph.toGroups(std::vector<IterDomain*>{tv4->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv5), tv5_info);
+  }
 
-  // Similarly, tv4 is the input of the second slice, which is not exclusive as
-  // tv1 is also a producer of tv2.
-  EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
-  EXPECT_EQ(
-      non_exclusive_resize_info.at(tv4),
-      exact_graph.toGroups(std::vector<IterDomain*>{tv4->axis(1)}));
+  // Test replication-based mitigation of conflicts
+  {
+    Fusion fusion_copy = fusion;
+    FusionGuard fg(&fusion_copy);
+
+    auto tv0 = fusion_copy.inputs().at(0)->as<TensorView>();
+    auto tv2 =
+        fusion_copy.outputs().at(0)->definition()->input(0)->as<TensorView>();
+    auto slice = dynamic_cast<SliceOp*>(tv2->definition());
+    ASSERT_NE(slice, nullptr);
+    auto tv1 = slice->input(0)->as<TensorView>();
+    auto tv5 =
+        fusion_copy.outputs().at(1)->definition()->input(0)->as<TensorView>();
+    auto tv4 = tv5->definition()->input(0)->as<TensorView>();
+
+    // Replicate tv1 for tv2
+    auto private_copy = RecomputeTv::recompute(tv1);
+    ir_utils::replaceValInExprInputs(slice, tv1, private_copy);
+
+    // The two slices should still be reported as non-exclusive but they
+    // both are shared at the fusion input.
+    IdModel id_model(&fusion_copy, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion_copy), exact_graph);
+    EXPECT_EQ(non_exclusive_resize_info.size(), 2);
+    EXPECT_EQ(non_exclusive_resize_info.count(tv2), 1);
+    scheduler_tools::ResizeExclusivityInfo tv2_info{
+        {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv0->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv2), tv2_info);
+
+    EXPECT_EQ(non_exclusive_resize_info.count(tv5), 1);
+    scheduler_tools::ResizeExclusivityInfo tv5_info{
+        {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv4->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv5), tv5_info);
+  }
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({16, 100}, options);
+  std::vector<c10::IValue> inputs({t0});
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto out_tensors = executor_cache.runFusionWithInputs(inputs);
+  testValidate(
+      executor_cache.fusion(), out_tensors, inputs, __LINE__, __FILE__);
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
+  EXPECT_FALSE(runtime->isSegmented());
+  const auto& heuristic_param =
+      runtime->schedulerHeuristics()->heuristicsList().front();
+  EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
+  Fusion* scheduled_fusion =
+      dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())->fusion();
+  checkLoopDomainEquivalence(
+      scheduled_fusion->outputs().at(0)->as<TensorView>());
 }
 
 // Non-exclusive slice due to a dependency to a fusion output
@@ -4486,12 +4544,57 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs3) {
   auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
       ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
 
-  // tv3 is the input of the slice, which is not exclusive as
+  // tv4 is the input of the slice, which is not exclusive as
   // tv3 depends on tv2, which is a fusion output
-  EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
-  EXPECT_EQ(
-      non_exclusive_resize_info.at(tv3),
-      exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+  EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
+  scheduler_tools::ResizeExclusivityInfo tv4_info{
+      {tv2}, exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv4), tv4_info);
+
+  // Test replication-based mitigation of conflicts
+  {
+    Fusion fusion_copy = fusion;
+    FusionGuard fg(&fusion_copy);
+
+    auto tv0 = fusion_copy.inputs().at(0)->as<TensorView>();
+    auto tv5 = fusion_copy.outputs().at(1)->as<TensorView>();
+    auto tv4 = tv5->definition()->input(0)->as<TensorView>();
+    auto tv3 =
+        tv4->definition()->input(0)->as<TensorView>();
+
+    auto private_copy = RecomputeTv::recompute(tv3);
+    ir_utils::replaceValInExprInputs(tv4->definition(), tv3, private_copy);
+
+    IdModel id_model(&fusion_copy, /*build_graphs=*/false);
+    const auto& exact_graph = id_model.buildExactGraph();
+    auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+        ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion_copy), exact_graph);
+    EXPECT_EQ(non_exclusive_resize_info.size(), 1);
+    EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
+    scheduler_tools::ResizeExclusivityInfo tv4_info{
+        {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv0->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv4), tv4_info);
+  }
+
+  GTEST_SKIP() << "Scheduling not yet supported due to broadcast";
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({16, 100}, options);
+  auto t1 = at::randn({16}, options);
+  std::vector<c10::IValue> inputs({t0, t1});
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto out_tensors = executor_cache.runFusionWithInputs(inputs);
+  testValidate(
+      executor_cache.fusion(), out_tensors, inputs, __LINE__, __FILE__);
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
+  EXPECT_FALSE(runtime->isSegmented());
+  const auto& heuristic_param =
+      runtime->schedulerHeuristics()->heuristicsList().front();
+  EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
+  Fusion* scheduled_fusion =
+      dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())->fusion();
+  checkLoopDomainEquivalence(
+      scheduled_fusion->outputs().at(0)->as<TensorView>());
 }
 
 // Slice input tensor depends on a fusion output, but the slice is
@@ -4676,10 +4779,29 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs6) {
   auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
       ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
   EXPECT_EQ(non_exclusive_resize_info.size(), 1);
-  EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
-  EXPECT_EQ(
-      non_exclusive_resize_info.at(tv1),
-      exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
+  EXPECT_EQ(non_exclusive_resize_info.count(tv2), 1);
+  scheduler_tools::ResizeExclusivityInfo tv2_info{
+      {tv1}, exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv2), tv2_info);
+
+  // When scheduled, since the shape of tv4 is different from the
+  // shape of tv5, this fusion is segmented. One segment is a resize
+  // segment consisting of the tv2 and tv3 slices. Another is a pointwise
+  // segment for tv5.
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto out_tensors = executor_cache.runFusionWithInputs(inputs);
+  testValidate(
+      executor_cache.fusion(), out_tensors, inputs, __LINE__, __FILE__);
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
+  const auto& heuristic_list = runtime->schedulerHeuristics()->heuristicsList();
+  EXPECT_EQ(heuristic_list.size(), 2);
+  // They should be a combination of a resize scheduler and a pointwise
+  // scheduler
+  EXPECT_TRUE(
+      (heuristic_list[0]->scheduler_type == SchedulerType::PointWise &&
+       heuristic_list[1]->scheduler_type == SchedulerType::Resize) ||
+      (heuristic_list[0]->scheduler_type == SchedulerType::Resize &&
+       heuristic_list[1]->scheduler_type == SchedulerType::PointWise));
 }
 
 // RoPE-like rotation pattern
@@ -4901,17 +5023,17 @@ TEST_P(ResizeSchedulerTest, SliceRotateCat) {
     IdModel id_model(&fusion, /*build_graphs=*/false);
     const auto& exact_graph = id_model.buildExactGraph();
     auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
         ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
-    EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
-    EXPECT_EQ(
-        non_exclusive_resize_info.at(tv1),
-        exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
-    EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
-    EXPECT_EQ(
-        non_exclusive_resize_info.at(tv3),
-        exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+    EXPECT_EQ(non_exclusive_resize_info.count(tv2), 1);
+    scheduler_tools::ResizeExclusivityInfo tv2_info{
+        {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv2), tv2_info);
+    EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
+    scheduler_tools::ResizeExclusivityInfo tv4_info{
+        {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv4), tv4_info);
     // These two entries should be all the info map has.
     EXPECT_EQ(non_exclusive_resize_info.size(), 2);
 
-    GTEST_SKIP() << "Scheduling not yet supported";
-
     FusionExecutorCache executor_cache(std::move(fusion_ptr));
     auto out_tensors = executor_cache.runFusionWithInputs(inputs);
     testValidate(
@@ -5039,17 +5161,17 @@ TEST_P(ResizeSchedulerTest, SliceRotateCatResidual) {
     IdModel id_model(&fusion, /*build_graphs=*/false);
     const auto& exact_graph = id_model.buildExactGraph();
     auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
         ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
-    EXPECT_EQ(non_exclusive_resize_info.count(tv1), 1);
-    EXPECT_EQ(
-        non_exclusive_resize_info.at(tv1),
-        exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)}));
-    EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
-    EXPECT_EQ(
-        non_exclusive_resize_info.at(tv3),
-        exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)}));
+    EXPECT_EQ(non_exclusive_resize_info.count(tv2), 1);
+    scheduler_tools::ResizeExclusivityInfo tv2_info{
+        {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv2), tv2_info);
+    EXPECT_EQ(non_exclusive_resize_info.count(tv4), 1);
+    scheduler_tools::ResizeExclusivityInfo tv4_info{
+        {tv0}, exact_graph.toGroups(std::vector<IterDomain*>{tv3->axis(1)})};
+    EXPECT_EQ(non_exclusive_resize_info.at(tv4), tv4_info);
    // These two entries should be all the info map has.
    EXPECT_EQ(non_exclusive_resize_info.size(), 2);
 
-    GTEST_SKIP() << "Scheduling not yet supported";
-
     FusionExecutorCache executor_cache(std::move(fusion_ptr));
     auto out_tensors = executor_cache.runFusionWithInputs(inputs);
     testValidate(

From 9631958ef7c0119f356492b0768a6891218c9acf Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Thu, 19 Dec 2024 21:11:31 -0800
Subject: [PATCH 6/9] test fix

---
 tests/cpp/test_resize.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp
index 6013d385018..c7a70d928b4 100644
--- a/tests/cpp/test_resize.cpp
+++ b/tests/cpp/test_resize.cpp
@@ -4495,15 +4495,15 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs2) {
   auto out_tensors = executor_cache.runFusionWithInputs(inputs);
   testValidate(
       executor_cache.fusion(), out_tensors, inputs, __LINE__, __FILE__);
+
+  // While the slices can be transformed to be all exclusive, it is
+  // currently segmented as the outputs have different shapes. Both
+  // segments should be scheduled as resize segments.
   FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
-  EXPECT_FALSE(runtime->isSegmented());
-  const auto& heuristic_param =
-      runtime->schedulerHeuristics()->heuristicsList().front();
-  EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
-  Fusion* scheduled_fusion =
-      dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())->fusion();
-  checkLoopDomainEquivalence(
-      scheduled_fusion->outputs().at(0)->as<TensorView>());
+  const auto& heuristic_list = runtime->schedulerHeuristics()->heuristicsList();
+  EXPECT_EQ(heuristic_list.size(), 2);
+  EXPECT_EQ(heuristic_list[0]->scheduler_type, SchedulerType::Resize);
+  EXPECT_EQ(heuristic_list[1]->scheduler_type, SchedulerType::Resize);
 }
 
 // Non-exclusive slice due to a dependency to a fusion output

From 76dbab97b4fdcc739f2ed1269e11e2669cee53ff Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Thu, 19 Dec 2024 22:43:00 -0800
Subject: [PATCH 7/9] fix

---
 csrc/scheduler/resize.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp
index dece9f79238..408c662d8aa 100644
--- a/csrc/scheduler/resize.cpp
+++ b/csrc/scheduler/resize.cpp
@@ -185,6 +185,12 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) {
            resize_based_tensor_ops, exact_graph)) {
     auto resize_based_op = out_tv->definition();
     auto inp_tv = resize_based_op->input(0)->as<TensorView>();
+    // Since cacheInput may skip caching if an input is used by
+    // slice/pad, inp_tv may be a fusion input, in which case it is
+    // not necessary to recompute the tensor.
+    if (inp_tv->isFusionInput()) {
+      continue;
+    }
     auto inp_tv_copy = RecomputeTv::recompute(inp_tv);
     ir_utils::replaceValInExprInputs(resize_based_op, inp_tv, inp_tv_copy);
   }

From 75338a4f18a77b79d1420a8e4ca3a765a3a713ea Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Fri, 20 Dec 2024 10:15:47 -0800
Subject: [PATCH 8/9] cleanup

---
 csrc/scheduler/resize.cpp             | 14 ++++++++++++++
 csrc/scheduler/tools/resize_utils.cpp |  4 ++--
 csrc/scheduler/tools/resize_utils.h   | 12 +++++++-----
 3 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp
index 408c662d8aa..b9772c507eb 100644
--- a/csrc/scheduler/resize.cpp
+++ b/csrc/scheduler/resize.cpp
@@ -120,6 +120,20 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
     return false;
   }
 
+  // Having different resizes between outputs is not allowed at this
+  // moment. For example, consider a fusion like:
+  //
+  // t0 = [i0]
+  // fusion.addInput(t0)
+  // t1 = t0[:i0/2]
+  // t2 = t0[i0/2:]
+  // fusion.addOutput(t1)
+  // fusion.addOutput(t2)
+  //
+  // For now, this is not going to be fused since t1 and t2 have
+  // different resize ops, although in this case, since the extents of t1 and
+  // t2 are the same, it should be relatively straightforward to fuse them
+  // together.
   for (auto out_tv : ir_utils::filterByType<TensorView>(fusion->outputs())) {
     if (out_tv == ref_tv) {
       continue;
     }

diff --git a/csrc/scheduler/tools/resize_utils.cpp b/csrc/scheduler/tools/resize_utils.cpp
index cb9ecd94c6d..fc8c0cc5f09 100644
--- a/csrc/scheduler/tools/resize_utils.cpp
+++ b/csrc/scheduler/tools/resize_utils.cpp
@@ -161,12 +161,12 @@ std::unordered_map<TensorView*, ResizeExclusivityInfo> getNonExclusiveResizeInfo(
       }
 
       // This resize input ID is not exclusively used
-      info.shared_tvs.push_back(dep_tv);
+      info.non_exclusive_dep_tvs.push_back(dep_tv);
       info.resized_ids.pushBack(resize_inp_id);
     }
   }
 
-    if (!info.shared_tvs.empty()) {
+    if (!info.non_exclusive_dep_tvs.empty()) {
       NVF_ERROR(non_exclusive_resizes.emplace(out_tv, info).second);
     }

diff --git a/csrc/scheduler/tools/resize_utils.h b/csrc/scheduler/tools/resize_utils.h
index b3704bb7326..b9afed5effa 100644
--- a/csrc/scheduler/tools/resize_utils.h
+++ b/csrc/scheduler/tools/resize_utils.h
@@ -91,16 +91,18 @@ void propagateResizeToInputs(Expr* resize_op);
 // long as the first slice is considered non-exclusive. This will be
 // important when resolving the non-exclusiveness by replication.
 //
-// The function returns a map from tensors that are input to
-// non-exclusive ops to their resize input ID groups. This map will be
+// The function returns a map from tensors that are outputs of
+// non-exclusive ops to ResizeExclusivityInfo. This map will be
 // used to resolve the non-exclusiveness by replication.
 struct ResizeExclusivityInfo {
-  std::vector<TensorView*> shared_tvs;
+  // Dependent tensors that should not be resized
+  std::vector<TensorView*> non_exclusive_dep_tvs;
+  // ID groups of resize input IDs
   ValGroups resized_ids;
 
   bool operator==(const ResizeExclusivityInfo& other) const {
-    return shared_tvs == other.shared_tvs && resized_ids == other.resized_ids;
+    return non_exclusive_dep_tvs == other.non_exclusive_dep_tvs &&
+        resized_ids == other.resized_ids;
   }
 
   bool operator!=(const ResizeExclusivityInfo& other) const {

From e48a2f6274348cee5cdeca8d445bb7535b3bc0c4 Mon Sep 17 00:00:00 2001
From: Naoya Maruyama
Date: Mon, 23 Dec 2024 17:57:26 -0800
Subject: [PATCH 9/9] Recomputation needs to be done in a topological order

---
 csrc/scheduler/resize.cpp | 26 ++++++-----
 tests/cpp/test_resize.cpp | 98 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 113 insertions(+), 11 deletions(-)

diff --git a/csrc/scheduler/resize.cpp b/csrc/scheduler/resize.cpp
index b9772c507eb..43a7195fe20 100644
--- a/csrc/scheduler/resize.cpp
+++ b/csrc/scheduler/resize.cpp
@@ -71,11 +71,11 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
   IdModel id_model(fusion, /*build_graphs=*/false);
   const auto& broadcast_graph = id_model.buildBroadcastGraph();
 
-  auto resize_based_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
+  auto resize_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
 
   // Slicing of or to a broadcast ID is not allowed yet.
-  for (auto tensor_op : resize_based_tensor_ops) {
-    TensorView* out_tv = tensor_op->output(0)->as<TensorView>();
+  for (auto resize_tensor_op : resize_tensor_ops) {
+    TensorView* out_tv = resize_tensor_op->output(0)->as<TensorView>();
     for (auto logical_id : out_tv->getLogicalDomain()) {
       Resize* resize = dynamic_cast<Resize*>(logical_id->definition());
       if (resize == nullptr) {
@@ -188,17 +188,21 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) {
   scheduler_utils::cacheInputs(fusion, true);
   scheduler_utils::cacheAndForkOutputs(fusion, true);
 
-  auto resize_based_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
+  auto resize_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
 
   IdModel id_model(fusion, /*build_graphs=*/false);
   const auto& exact_graph = id_model.buildExactGraph();
 
-  // Replicate resize inputs if necessary to avoid conflicting propagations
-  for (const auto& [out_tv, exlusivity_info] :
-       scheduler_tools::getNonExclusiveResizeInfo(
-           resize_based_tensor_ops, exact_graph)) {
-    auto resize_based_op = out_tv->definition();
-    auto inp_tv = resize_based_op->input(0)->as<TensorView>();
+  // Replicate resize inputs if necessary to avoid conflicting
+  // propagations
+  const auto exclusivity_info_map = scheduler_tools::getNonExclusiveResizeInfo(
+      resize_tensor_ops, exact_graph);
+  for (auto resize_tensor_op : resize_tensor_ops) {
+    auto out_tv = resize_tensor_op->output(0)->as<TensorView>();
+    if (exclusivity_info_map.count(out_tv) == 0) {
+      continue;
+    }
+    auto inp_tv = resize_tensor_op->input(0)->as<TensorView>();
     // Since cacheInput may skip caching if an input is used by
     // slice/pad, inp_tv may be a fusion input, in which case it is
     // not necessary to recompute the tensor.
@@ -206,7 +210,7 @@ void ResizeScheduler::schedule(Fusion* fusion, const HeuristicParams* params) {
       continue;
     }
     auto inp_tv_copy = RecomputeTv::recompute(inp_tv);
-    ir_utils::replaceValInExprInputs(resize_based_op, inp_tv, inp_tv_copy);
+    ir_utils::replaceValInExprInputs(resize_tensor_op, inp_tv, inp_tv_copy);
   }

diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp
index a4710d293c8..587f72143a4 100644
--- a/tests/cpp/test_resize.cpp
+++ b/tests/cpp/test_resize.cpp
@@ -5123,6 +5123,104 @@ TEST_P(ResizeSchedulerTest, SliceRotateCatResidual) {
   }
 }
 
+// Rotate twice. Resolving the non-exclusivity must be done in a
+// topological order.
+TEST_F(ResizeSchedulerTest, SliceRotateCatTwice) {
+  auto fusion_ptr = std::make_unique<Fusion>();
+  Fusion& fusion = *fusion_ptr;
+  FusionGuard fg(fusion_ptr.get());
+
+  std::vector<int64_t> shape({-1, 100});
+
+  EnableOptionsGuard enable_options_guard;
+  EnableOptionsGuard::getCurOptions().set(EnableOption::IdModel, {"all"});
+
+  auto tv0 = makeConcreteTensor(shape);
+  fusion.addInput(tv0);
+
+  auto tv1 = sin(tv0);
+
+  auto tv2 = slice(
+      tv1,
+      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
+       {fusion.zeroVal(), IrBuilder::create<Val>(shape[1] / 2)}});
+
+  auto tv3 = slice(
+      tv1,
+      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(shape[1] / 2),
+        IrBuilder::create<Val>(shape[1])}});
+
+  auto tv4 = cat({tv3, tv2}, -1);
+
+  auto tv5 = slice(
+      tv4,
+      {{fusion.zeroVal(), tv4->getLogicalDomain().at(0)->extent()},
+       {fusion.zeroVal(), IrBuilder::create<Val>(shape[1] / 2)}});
+
+  auto tv6 = slice(
+      tv4,
+      {{fusion.zeroVal(), tv4->getLogicalDomain().at(0)->extent()},
+       {IrBuilder::create<Val>(shape[1] / 2),
+        IrBuilder::create<Val>(shape[1])}});
+
+  auto tv7 = cat({tv6, tv5}, -1);
+
+  fusion.addOutput(tv7);
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  auto t0 = at::randn({16, 100}, options);
+  std::vector<c10::IValue> inputs({t0});
+
+  // tv1 is not considered exclusive as tv0 is also a producer of
+  // tv3. Same for tv3. While the common input, tv0, is a fusion
+  // input and is not itself scheduled, a cache tensor is inserted
+  // and is indeed scheduled, so the two slices do conflict.
+  IdModel id_model(&fusion, /*build_graphs=*/false);
+  const auto& exact_graph = id_model.buildExactGraph();
+  auto non_exclusive_resize_info = scheduler_tools::getNonExclusiveResizeInfo(
+      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
+
+  // tv2
+  EXPECT_EQ(non_exclusive_resize_info.count(tv2), 1);
+  scheduler_tools::ResizeExclusivityInfo tv2_info{
+      {tv1}, exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv2), tv2_info);
+
+  // tv3
+  EXPECT_EQ(non_exclusive_resize_info.count(tv3), 1);
+  scheduler_tools::ResizeExclusivityInfo tv3_info{
+      {tv1}, exact_graph.toGroups(std::vector<IterDomain*>{tv1->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv3), tv3_info);
+
+  // tv5
+  EXPECT_EQ(non_exclusive_resize_info.count(tv5), 1);
+  scheduler_tools::ResizeExclusivityInfo tv5_info{
+      {tv4}, exact_graph.toGroups(std::vector<IterDomain*>{tv4->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv5), tv5_info);
+
+  // tv6
+  EXPECT_EQ(non_exclusive_resize_info.count(tv6), 1);
+  scheduler_tools::ResizeExclusivityInfo tv6_info{
+      {tv4}, exact_graph.toGroups(std::vector<IterDomain*>{tv4->axis(1)})};
+  EXPECT_EQ(non_exclusive_resize_info.at(tv6), tv6_info);
+
+  // These should be all the info the map has.
+  EXPECT_EQ(non_exclusive_resize_info.size(), 4);
+
+  FusionExecutorCache executor_cache(std::move(fusion_ptr));
+  auto out_tensors = executor_cache.runFusionWithInputs(inputs);
+  testValidate(
+      executor_cache.fusion(), out_tensors, inputs, __LINE__, __FILE__);
+
+  FusionKernelRuntime* runtime = executor_cache.getMostRecentKernelRuntime();
+  EXPECT_FALSE(runtime->isSegmented());
+  const auto& heuristic_param =
+      runtime->schedulerHeuristics()->heuristicsList().front();
+  EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
+}
+
 // Consumer-based scheduling of pad
 TEST_P(ResizeSchedulerTest, PropagatePadToInputs) {
   auto fusion_ptr = std::make_unique<Fusion>();
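
For reference, below is a minimal usage sketch of the exclusiveness analysis
introduced by this series, distilled from the tests above. It is illustrative
only and not part of the patches; it assumes the nvFuser test utilities used
throughout (makeConcreteTensor, slice, IdModel), and the <SliceOp, PadOp> op
filter as reconstructed above. The tensor names are hypothetical.

  // Illustrative sketch (not part of the series): two consumers share tv1,
  // so the slice cannot be propagated to inputs exclusively.
  auto fusion_ptr = std::make_unique<Fusion>();
  Fusion& fusion = *fusion_ptr;
  FusionGuard fg(fusion_ptr.get());

  auto tv0 = makeConcreteTensor({-1, 100});
  fusion.addInput(tv0);
  auto tv1 = sin(tv0);
  // tv1 feeds both a slice and a non-resize consumer.
  auto tv2 = slice(
      tv1,
      {{fusion.zeroVal(), tv1->getLogicalDomain().at(0)->extent()},
       {IrBuilder::create<Val>(1L), tv1->getLogicalDomain().at(1)->extent()}});
  auto tv3 = sin(tv1);
  fusion.addOutput(tv2);
  fusion.addOutput(tv3);

  // The analysis runs on the exact graph of an IdModel and reports the
  // resize ops whose propagation would be externally visible.
  IdModel id_model(&fusion, /*build_graphs=*/false);
  const auto& exact_graph = id_model.buildExactGraph();
  auto info = scheduler_tools::getNonExclusiveResizeInfo(
      ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion), exact_graph);
  // The slice is reported as non-exclusive here; the scheduler then
  // recomputes its input (RecomputeTv::recompute) to make it exclusive.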