
Commit 48e620f

Merge branch 'main' into fp8_enable_on_sm89
2 parents 44e949b + 49b0862

6 files changed: +546, -49 lines changed

csrc/scheduler/resize.cpp

+10, -31

@@ -71,40 +71,19 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
   IdModel id_model(fusion, /*build_graphs=*/false);
   const auto& broadcast_graph = id_model.buildBroadcastGraph();
 
-  // For now, only a single resize op is allowed to exist.
   auto resize_based_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
-  if (resize_based_tensor_ops.size() != 1) {
-    scheduler_debug_utils::canScheduleRejectReason(
-        schedulerType(), "Only a single resize op is allowed.");
-    return false;
-  }
 
-  auto resize_out_tv =
-      resize_based_tensor_ops.at(0)->output(0)->as<TensorView>();
-
-  auto all_dep_vals = DependencyCheck::getAllValsBetween(
-      {fusion->inputs().begin(), fusion->inputs().end()}, {resize_out_tv});
-  for (auto tv : ir_utils::filterByType<TensorView>(all_dep_vals)) {
-    if (tv == resize_out_tv) {
-      continue;
-    }
-    if (tv->isFusionOutput()) {
-      scheduler_debug_utils::canScheduleRejectReason(
-          schedulerType(),
-          "Dependency to fusion output not allowed: ",
-          tv->toString());
-      return false;
-    }
-    for (auto consumer_of_tv : ir_utils::consumerTvsOf(tv)) {
-      if (std::find(all_dep_vals.begin(), all_dep_vals.end(), consumer_of_tv) ==
-          all_dep_vals.end()) {
-        scheduler_debug_utils::canScheduleRejectReason(
-            schedulerType(),
-            "Resize inputs must be exclusively consumed by resize: ",
-            consumer_of_tv->toString());
-        return false;
-      }
+  if (auto non_exclusive_resizes = scheduler_tools::getNonExclusiveResizeInfo(
+          resize_based_tensor_ops, id_model.idGraph(IdMappingMode::EXACT));
+      !non_exclusive_resizes.empty()) {
+    std::stringstream msg;
+    msg << "Propagation of resizes would affect fusion outputs.";
+    for (const auto& [tv, resize_ids] : non_exclusive_resizes) {
+      msg << " Resize input tv: " << tv->toString()
+          << ", resize input ID groups: " << nvfuser::toString(resize_ids);
     }
+    scheduler_debug_utils::canScheduleRejectReason(schedulerType(), msg.str());
+    return false;
   }
 
   // Slicing of or to a broadcast ID is not allowed yet.
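Note on the change above: the scheduler previously bailed out whenever the fusion contained more than one SliceOp/PadOp, whereas it now only rejects fusions in which some resize is non-exclusive. Below is a hypothetical sketch (not part of this commit) of a fusion with a chain of two slices that the old single-resize restriction rejected but that passes this particular check; makeSymbolicTensor, add, and the slice(tv, starts, stops) overload are assumed from nvFuser's test utilities and csrc/ops.

// Hypothetical example, not part of this commit. Other compile-time checks
// may still apply; this only illustrates the exclusivity condition.
void buildChainedSliceFusion(Fusion& fusion) {
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(1); // assumed test helper
  fusion.addInput(tv0);

  TensorView* tv1 = add(tv0, IrBuilder::create<Val>(1.0));
  // Two resize-based ops in a chain; nothing else consumes tv1 or tv2, so
  // neither slice is non-exclusive and the new check does not reject.
  TensorView* tv2 = slice(tv1, /*starts=*/{1}, /*stops=*/{10}); // assumed overload
  TensorView* tv3 = slice(tv2, /*starts=*/{2}, /*stops=*/{5});
  fusion.addOutput(tv3);

  // The old check rejected this with "Only a single resize op is allowed."
}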

csrc/scheduler/tools/resize_utils.cpp

+106

@@ -66,5 +66,111 @@ void propagateResizeToInputs(Expr* resize_tensor_op) {
   }
 }
 
+std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
+    const std::vector<Expr*>& ordered_resize_tensor_ops,
+    const ValGraph& exact_graph) {
+  NVF_ERROR(!ordered_resize_tensor_ops.empty());
+  Fusion* fusion = ordered_resize_tensor_ops[0]->fusion();
+
+  std::unordered_map<TensorView*, ValGroups> non_exclusive_resizes;
+
+  std::unordered_set<Val*> inputs{
+      fusion->inputs().begin(), fusion->inputs().end()};
+
+  auto get_root_to_logical_resizes =
+      [&exact_graph](TensorView* tv) -> ValGroups {
+    // This should be only used for outputs of resize-based ops,
+    // so it should always have a root domain.
+    NVF_ERROR(tv->hasRoot());
+    auto out_tv_root_to_logical_exprs = DependencyCheck::getAllExprsBetween(
+        {tv->getRootDomain().begin(), tv->getRootDomain().end()},
+        {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()});
+    ValGroups resize_inp_ids;
+    for (auto resize :
+         ir_utils::filterByType<Resize>(out_tv_root_to_logical_exprs)) {
+      resize_inp_ids.pushBack(exact_graph.toGroup(resize->in()));
+    }
+    return resize_inp_ids;
+  };
+
+  // Traverse the ops in a topological order
+  for (Expr* resize_tensor_op : ordered_resize_tensor_ops) {
+    auto inp_tv = dynamic_cast<TensorView*>(resize_tensor_op->inputs().at(0));
+    auto out_tv = dynamic_cast<TensorView*>(resize_tensor_op->outputs().at(0));
+
+    ValGroups resize_inp_ids = get_root_to_logical_resizes(out_tv);
+    NVF_ERROR(!resize_inp_ids.empty());
+
+    auto dep_vals =
+        DependencyCheck::getAllValsBetween(inputs, std::vector<Val*>{inp_tv});
+
+    // For each tensor that inp_tv depends on, check if the resize op
+    // is considered non-exclusive with respect to the tensor. That
+    // is, if propagation of the resize may result in externally
+    // visible changes through the tensor, the resize is considered
+    // non-exclusive.
+    for (auto dep_tv : ir_utils::filterByType<TensorView>(dep_vals)) {
+      bool maybe_non_exclusive = dep_tv->isFusionOutput();
+
+      if (!maybe_non_exclusive) {
+        // If a dependent tv has a consumer that inp_tv does not
+        // depend on, propagation of resize would escape to outputs,
+        // which needs to be avoided.
+        for (auto consumer_tv : ir_utils::consumerTvsOf(dep_tv)) {
+          // We are interested in if resized IDs are used by other tensors
+          // than out_tv
+          if (consumer_tv != out_tv &&
+              std::find(dep_vals.begin(), dep_vals.end(), consumer_tv) ==
+                  dep_vals.end()) {
+            maybe_non_exclusive = true;
+            break;
+          }
+        }
+      }
+
+      if (!maybe_non_exclusive) {
+        continue;
+      }
+
+      // dep_tv potentially is either a fusion output or it has a
+      // consumer outside of the dependency set to the resized
+      // tensor. Propagating the resize to dep_tv should be
+      // avoided. However, if the dep_tv iter domain that corresponds
+      // to the resized ID is a broadcast or there's no such ID, it
+      // should still be safe to consider the resize op exclusive as
+      // there's no iter domain to resize. For a concrete example, see
+      // ResizeSchedulerTest.PropagateMultipleSlicesToInputs4.
+      const auto inp_tv_logical_groups =
+          exact_graph.toGroups(inp_tv->getLogicalDomain());
+      const auto dep_tv_logical_groups =
+          exact_graph.toGroups(dep_tv->getLogicalDomain());
+      auto vals_between = getValsBetween<ValGraphBFS>(
+          {inp_tv_logical_groups.begin(), inp_tv_logical_groups.end()},
+          {dep_tv_logical_groups.begin(), dep_tv_logical_groups.end()},
+          exact_graph);
+
+      for (const ValGroup& resize_inp_id : resize_inp_ids) {
+        if (std::find(
+                vals_between.begin(), vals_between.end(), resize_inp_id) ==
+            vals_between.end()) {
+          // This resize can be ignored as there's no corresponding ID
+          // in the dep tv
+          continue;
+        }
+
+        // This resize input ID is not exclusively used
+        non_exclusive_resizes[inp_tv].pushBack(resize_inp_id);
+      }
+    }
+
+    // Analysis of exclusiveness until in_tv is done. Following
+    // resize-based tensor ops do not need to check the same section
+    // of the fusion and can start from out_tv.
+    inputs.insert(out_tv);
+  }
+
+  return non_exclusive_resizes;
+}
+
 } // namespace scheduler_tools
 } // namespace nvfuser

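A usage sketch for the new helper (hypothetical, not part of this commit): it mirrors the caller in resize.cpp above and builds the first non-exclusive example documented in resize_utils.h below. The makeSymbolicTensor test helper, the slice(tv, starts, stops) overload, and IdModel::buildExactGraph() are assumptions about nvFuser's API.

// Hypothetical sketch of calling getNonExclusiveResizeInfo directly.
void exampleNonExclusiveSlice() {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1); // assumed test helper
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Val>(1.0));
  auto tv2 = slice(tv1, /*starts=*/{1}, /*stops=*/{10}); // resize-based op
  auto tv3 = add(tv1, IrBuilder::create<Val>(1.0)); // second consumer of tv1
  fusion.addOutput(tv2);
  fusion.addOutput(tv3);

  IdModel id_model(&fusion, /*build_graphs=*/false);
  id_model.buildExactGraph(); // assumed builder for the EXACT graph

  auto resize_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(&fusion);
  auto info = scheduler_tools::getNonExclusiveResizeInfo(
      resize_ops, id_model.idGraph(IdMappingMode::EXACT));

  // Expected: one entry keyed by tv1 (the slice input) whose ValGroups hold
  // the exact group of the sliced ID, because propagating the resize to tv1
  // would also change tv3, a fusion output.
  NVF_ERROR(info.count(tv1) == 1);
}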
csrc/scheduler/tools/resize_utils.h

+79

@@ -7,9 +7,12 @@
 // clang-format on
 #pragma once
 
+#include <val_graph.h>
+
 namespace nvfuser {
 
 class Expr;
+class TensorView;
 
 namespace scheduler_tools {
 
@@ -19,5 +22,81 @@
 // fusion inputs are skipped as their loop domains don't matter.
 void propagateResizeToInputs(Expr* resize_op);
 
+// Given a topologically ordered list of resize-based tensor ops such
+// as slice and pad, check if they can be propagated to fusion inputs
+// exclusively without causing any visible side effect. For example,
+// if a tensor is sliced and also is used to produce an output without
+// the slicing, the slice is considered non-exclusive as the slice
+// input has the other visible consumer. Propagating the resize of the
+// slice to the slice input is invalid since the output computed from
+// the slice input depends on the full iteration space.
+//
+// For example, consider the following case:
+//
+// t0 = makeSymbolicTensor(1)
+// fusion.addInput(t0)
+// t1 = t0 + 1
+// t2 = t1[1:10]
+// t3 = t1 + 1
+// fusion.addOutput(t2)
+// fusion.addOutput(t3)
+//
+// In this case, propagating the resize op of the slice would alter t1,
+// which would in turn affect t3, which is a fusion output. Since the
+// change would be visible due to the change of t3, this resize op is
+// considered non-exclusive.
+//
+// Consider a slightly different case as shown below:
+//
+// t0 = makeSymbolicTensor(1)
+// fusion.addInput(t0)
+// t1 = t0[1:10]
+// t2 = t0 + 1
+// fusion.addOutput(t1)
+// fusion.addOutput(t2)
+//
+// Note that the slice is directly done with the fusion input. Since
+// we do not propagate resize ops to fusion inputs, this can be
+// considered exclusive. However, this is also considered
+// non-exclusive since the actual scheduling inserts a cache after t0,
+// which can cause a visible side effect if the resize is propagated.
+//
+// Another non-exclusiveness comes from dependent fusion outputs. For
+// example, if a slice input depends on a fusion output, propagation
+// would alter the fusion output. Consider a case like:
+//
+// t0 = makeSymbolicTensor(1)
+// fusion.addInput(t0)
+// t1 = t0 + 1
+// t2 = t1[1:10] // slice
+// fusion.addOutput(t1)
+// fusion.addOutput(t2)
+//
+// If the resize op for the slice is propagated to t1, only the
+// section of [1:10] would be computed. Since that would change a
+// fusion output, the resize op is considered non-exclusive.
+//
+// When there's a chain of resize-based ops, for example:
+//
+// t0 = makeSymbolicTensor(1)
+// fusion.addInput(t0)
+// t1 = t0 + 1
+// t2 = t1[1:10]
+// t3 = t2[2:5]
+// t4 = t1 + 1
+// fusion.addOutput(t3)
+// fusion.addOutput(t4)
+//
+// We do not consider the second slice as non-exclusive as
+// long as the first slice is considered non-exclusive. This will be
+// important when resolving the non-exclusiveness by replication.
+//
+// The function returns a map from tensors that are input to
+// non-exclusive ops to their resize input ID groups. This map will be
+// used to resolve the non-exclusiveness by replication.
+std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
+    const std::vector<Expr*>& ordered_resize_tensor_ops,
+    const ValGraph& exact_graph);
+
 } // namespace scheduler_tools
 } // namespace nvfuser
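To make the second scenario above (slicing a fusion input directly) concrete: during scheduling an input is read through a cache tensor, so the slice effectively consumes the cache, which also feeds the other branch. The sketch below is hypothetical and not part of this commit; cacheAfter() is the standard nvFuser TensorView API, and the other helpers are assumed as in the earlier examples.

// Hypothetical sketch, not part of this commit.
void sliceOfFusionInputIsStillNonExclusive(Fusion& fusion) {
  FusionGuard fg(&fusion);

  auto t0 = makeSymbolicTensor(1); // assumed test helper
  fusion.addInput(t0);
  auto t1 = slice(t0, /*starts=*/{1}, /*stops=*/{10}); // slice of the input
  auto t2 = add(t0, IrBuilder::create<Val>(1.0)); // second consumer of t0
  fusion.addOutput(t1);
  fusion.addOutput(t2);

  // Scheduling inserts a cache between t0 and its consumers; both the slice
  // and the add then read the cache tensor instead of t0.
  TensorView* t0_cache = t0->cacheAfter();

  // Propagating the slice's resize to t0_cache would shrink the extent seen
  // by t2, a fusion output, so the slice is reported as non-exclusive even
  // though it consumes the fusion input directly.
  (void)t0_cache;
}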

tests/cpp/test_gpu3.cpp

-2

@@ -9249,8 +9249,6 @@ TEST_F(NVFuserTest, AllIdsMultipleDependencies) {
   tv1->split(0, 4);
   tv1->split(0, 8);
 
-  fusion.print();
-
   auto all_ids = tv1->domain()->allIDs();
 
   auto split2 = tv1->axis(0)->definition()->as<Split>();

tests/cpp/test_matmul_scheduler.cpp

+3, -3

@@ -1060,7 +1060,7 @@ TEST_F(MatmulSchedulerTest, FusedMultiplySumOnly) {
 // for Ampere with strict ref check, hence single layout check
 TEST_F(MatmulSchedulerTest, BasicMatmulStrictCheckTT) {
   // TODO: Make these tests work with Hopper as well as Ampere
-  NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 8, 9);
+  NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 9, 0);
 
   const int M = 128, N = 256, K = 512;
   const auto layout = MmaLayout::TT;
@@ -2481,7 +2481,7 @@ class MatmulSchedulerPluginTest : public NVFuserTest {
 
 // Test that our fake plugin works to override the default heuristic
 TEST_F(MatmulSchedulerPluginTest, BasicMatmul) {
-  NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 8, 9);
+  NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 9, 0);
   const int M = 128, N = 256, K = 512;
   const auto layout = MmaLayout::TT;
   auto fusion = std::make_unique<Fusion>();
@@ -3156,7 +3156,7 @@ INSTANTIATE_TEST_SUITE_P(
 #undef NVFUSER_TEST_CUDA_ARCH_GUARD
 
 TEST_F(MatmulSchedulerTest, OperandOrderIssue2434) {
-  NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 8, 9);
+  NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 9, 0);
   int M = 32, N = 64, K = 128;
 
   std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
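The three guard changes above widen the tested architecture range so these matmul tests also run on SM 8.9 (Ada), in line with the fp8_enable_on_sm89 branch this merge feeds. The sketch below only illustrates how such a range guard typically behaves, assuming an exclusive upper bound (consistent with (8, 0, 8, 9) previously skipping 8.9 while (8, 0, 9, 0) includes it and still skips Hopper); the real NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD macro lives in nvFuser's test utilities and may differ.

// Illustrative stand-in only; not the actual nvFuser macro.
#include <cuda_runtime.h>

// True when device 0's compute capability lies in
// [lo_major.lo_minor, hi_major.hi_minor), i.e. the upper bound is exclusive.
bool deviceInArchRange(int lo_major, int lo_minor, int hi_major, int hi_minor) {
  cudaDeviceProp prop{};
  if (cudaGetDeviceProperties(&prop, /*device=*/0) != cudaSuccess) {
    return false;
  }
  const int cc = prop.major * 10 + prop.minor;
  return cc >= lo_major * 10 + lo_minor && cc < hi_major * 10 + hi_minor;
}

// Under this reading, (8, 0, 8, 9) covers 8.0 up to but not including 8.9,
// while (8, 0, 9, 0) covers 8.0 through 8.9 and still excludes 9.0 (Hopper).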
