expand RemoveBcastSqueeze to handle unary operations between broadcast/squeeze ops #3643

Closed
wants to merge 22 commits into from
106 changes: 92 additions & 14 deletions csrc/preseg_passes/remove_bcast_squeeze.cpp
@@ -11,6 +11,7 @@
#include <multidevice/utils.h>
#include <ops/alias.h>
#include <ops/arith.h>
#include <ops/utils.h>
#include <options.h>
#include <preseg_passes/remove_bcast_squeeze.h>

@@ -155,6 +156,26 @@ std::vector<bool> nonPreservedDims(const AxisOps& ops) {
return flags;
}

TensorView* replayAxisOp(
AxisOp simple_op_type,
const AxisOps& axis_ops,
TensorView* tv) {
switch (simple_op_type) {
case AxisOp::PRESERVE:
// This is equivalent to a set Op
return tv;
break;
case AxisOp::SQUEEZE:
return squeeze(tv, nonPreservedDims(axis_ops), true);
break;
case AxisOp::BROADCAST:
return broadcast(tv, nonPreservedDims(axis_ops));
break;
}
NVF_ERROR(false, "unrecognized AxisOp type in replayAxisOp");
return nullptr;
}

//! Given descriptors of two sequences of broadcast+squeeze ops, return a
//! descriptor of their composition
AxisOps composeOps(const AxisOps& prev, const AxisOps& next) {
@@ -318,13 +339,79 @@ TensorView* maybeDoReplacement(TensorView* orig) {
if (!isReplaceableExpr(second)) {
return orig;
}
AxisOps second_ops = exprToAxisOps(second);
Collaborator:

I'm having a hard time understanding what this function (maybeDoReplacement) is doing. What is the parameter assumed to be? What is supposed to be returned?

Collaborator (author):

I think maybeDoReplacement is trying to merge tv->first->second->orig into a tv->merged->new_out when both first and second are replaceable exprs.

i.e. when we have tv->broadcast->squeeze, we might be able to just cancel the two and end up returning tv directly.

The function returns new_out after the replay. The logic here is that:
if the returned pointer is different from orig, the caller treats that as a replacement having happened and runs the same loop again with new_out;
if the returned pointer is the same as orig, the merge failed, so it skips second, moves on, and pushes the inputs of second onto the stack as the new candidates for orig. A sketch of that loop follows below.
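For illustration, here is a minimal sketch of such a driver loop, written from the description above. The function name removeBcastSqueezeSketch and the exact control flow are assumptions of this sketch, not the PR's code; it only shows how a stack-based DFS could consume maybeDoReplacement's return value.

// Hedged sketch (name and control flow assumed): a stack-based DFS over the
// fusion outputs that revisits a tensor whenever maybeDoReplacement reports a
// replacement, and otherwise walks further toward the producers.
void removeBcastSqueezeSketch(Fusion* fusion) {
  FusionGuard fg(fusion);
  std::vector<TensorView*> stack;
  for (Val* out : fusion->outputs()) {
    if (auto* tv = dynamic_cast<TensorView*>(out)) {
      stack.push_back(tv);
    }
  }
  while (!stack.empty()) {
    TensorView* orig = stack.back();
    stack.pop_back();
    TensorView* new_out = maybeDoReplacement(orig);
    if (new_out != orig) {
      // A replacement (merge or swap) happened; revisit the new candidate.
      stack.push_back(new_out);
    } else if (Expr* def = orig->definition()) {
      // No replacement; push the inputs of the defining expr as new candidates.
      for (Val* in : def->inputs()) {
        if (auto* in_tv = dynamic_cast<TensorView*>(in)) {
          stack.push_back(in_tv);
        }
      }
    }
  }
}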

Collaborator (author):

So the added logic here is: when we try to swap tv->first->second->orig into tv->replayed_second->replayed_first, we return replayed_second->output(0).

Even though we are not merging two consecutive replaceable operations, returning replayed_second->output(0) instead of orig keeps replayed_second as the candidate for the next iteration, which effectively stops the unary-op first from preventing us from merging neighboring replaceable operations.


Expr* first = second->input(0)->definition();
if (!isReplaceableExpr(first)) {
// when second is an axis op but first is not, we try to swap first and
// second. This allows us to opportunistically place two axis ops next to
// each other.
// e.g.
// T1 = broadcast(T0)
// T2 = relu(T1)
// T3 = squeeze(T2)
// In the iteration where squeeze is `second` and relu is `first`, if we
// swap the two operations, we'll end up with
// T1 = broadcast(T0)
// replayed_T2 = replayed_squeeze(T1)
// replayed_T3 = replayed_relu(replayed_T2)
// The following iteration will have an opportunity to merge the broadcast
// and the replayed_squeeze together.
if (auto uop = dynamic_cast<UnaryOp*>(first)) {
// replace [unary-op -> second] with:
// [second -> unary-op]
// skip if we need to preserve the output from unary-op.
if (uop->out()->isFusionOutput() || uop->out()->uses().size() > 1) {
return orig;
}

// make sure we preserve the allocation domain on second->output(0)
Collaborator:

Why does the allocation domain matter?

Collaborator (author):

Answered in the example below. I think I can add another comment here as well.

// compute the alloc_domain permutation of second's output.
auto second_out_tv = second->output(0)->as<TensorView>();
Collaborator:

Is this second_out_tv always the same as orig?

Collaborator (author):

Yes. I now realize that I could have just used orig instead.

std::optional<std::vector<int64_t>> second_out_allocation_permutation =
ir_utils::computePermutation(
second_out_tv->getLogicalDomain(),
second_out_tv->getMaybeAllocationDomain());
// We only support a simple permutation; any complex transformation is not
// allowed
if (!second_out_allocation_permutation.has_value()) {
return orig;
}

TensorView* uop_in_tv = uop->in()->as<TensorView>();

// replay second on unary-op input
std::optional<AxisOp> second_op_type_opt =
getSimplifiedOpType(second_ops);
TensorView* replayed_second_out =
replayAxisOp(second_op_type_opt.value(), second_ops, uop_in_tv);

// replay uop on the replayed second's output
Val* replayed_uop_out = ops::newValLike(
replayed_second_out, uop->out()->getDataType().value());

// restore allocation domain on replayed_uop_out
auto replayed_uop_out_tv = replayed_uop_out->as<TensorView>();
replayed_uop_out_tv->setAllocationDomain(
ir_utils::applyPermutation(
replayed_uop_out_tv->getLogicalDomain(),
second_out_allocation_permutation.value()),
true);

IrBuilder::create<UnaryOp>(
uop->getUnaryOpType(), replayed_uop_out, replayed_second_out);

// replace uses of second output with replayed unary-op out
ir_utils::replaceValInAllExprInputsAndFusionOutputs(
second->output(0), replayed_uop_out);

// return replayed_second_out to indicate replacement.
return replayed_second_out;
}
// return orig to indicate no replacement.
return orig;
}

AxisOps first_ops = exprToAxisOps(first);
AxisOps second_ops = exprToAxisOps(second);

AxisOps simplified_ops = composeOps(first_ops, second_ops);
std::optional<AxisOp> simple_op_type_opt =
getSimplifiedOpType(simplified_ops);
@@ -337,18 +424,8 @@ TensorView* maybeDoReplacement(TensorView* orig) {
replacement = first->output(0)->as<TensorView>();
} else {
TensorView* input_tv = first->input(0)->as<TensorView>();
switch (simple_op_type_opt.value()) {
case AxisOp::PRESERVE:
// This is equivalent to a set Op
replacement = input_tv;
break;
case AxisOp::SQUEEZE:
replacement = squeeze(input_tv, nonPreservedDims(simplified_ops));
break;
case AxisOp::BROADCAST:
replacement = broadcast(input_tv, nonPreservedDims(simplified_ops));
break;
}
replacement =
replayAxisOp(simple_op_type_opt.value(), simplified_ops, input_tv);
}
NVF_ERROR(replacement != orig, "Expected non-trivial replacement");

@@ -406,6 +483,7 @@ TensorView* maybeDoReplacement(TensorView* orig) {

// Remove broadcast-squeeze and squeeze-broadcast patterns
void removeBcastSqueeze(Fusion* fusion) {
FusionGuard fg(fusion);
// Iterate from outputs toward producers using a depth-first search for
// replaceable patterns
std::vector<TensorView*> stack;
151 changes: 151 additions & 0 deletions tests/cpp/test_preseg_passes.cpp
@@ -14,6 +14,7 @@
#include <ops/all_ops.h>
#include <preseg_passes/optimization_pass.h>
#include <preseg_passes/pre_segmenter.h>
#include <preseg_passes/remove_bcast_squeeze.h>
#include <preseg_passes/translate_repeat_to_expand.h>
#include <tests/cpp/utils.h>
#include <tests/cpp/validator.h>
@@ -982,4 +983,154 @@ TEST_F(PresegTest, TranslateRepeatToExpand5) {
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::PointWise);
}

TEST_F(PresegTest, FusionRemoveBroadcastSqueeze0) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr;
FusionGuard fg(&fusion);

auto tv0 = makeContigConcreteTensor({2, 3, 4, 5});
fusion.addInput(tv0);
auto tv1 = broadcast(tv0, {true, false, false, false, false});
auto tv2 = relu(tv1);
auto tv3 = squeeze(tv2, {0});
// specify output permutation;
std::vector<IterDomain*> tv3_nhwc = {
tv3->axis(0), tv3->axis(2), tv3->axis(3), tv3->axis(1)};
tv3->setAllocationDomain(tv3_nhwc, true);
fusion.addOutput(tv3);
Collaborator (author):

This is the reason why we care about the allocation domain.

i.e. we have tv1->relu->tv2->squeeze->tv3, where tv3 has an allocation domain that is a permutation.
When we replace it with tv1->replayed_squeeze->tv4->replayed_relu->tv5, we need to ensure that tv5 has the same allocation domain as tv3; otherwise we would change the semantics and return an output with the wrong memory format. (A minimal sketch of this handling follows at the end of this thread.)

Collaborator:

I'm not saying we should ignore the allocation domain. I just don't see why having an allocation domain can interfere with this translation. Why not just keep using tv3? Or, it should also be possible to reproduce the same allocation domain with tv5.

Collaborator (author):

I mistook what you meant in your earlier question!

Why not just keep using tv3?

By "keep using tv3", do you mean that I can have it replayed as tv1->replayed_squeeze->tv4->replayed_relu->tv3? I didn't realize that I could just reuse tv3 here without needing to create a clone of it. Let me try that...

Or, it should also be possible to reproduce the same allocation domain with tv5.

Yes. I was just trying to keep it simple. If we want to support general transformations, I think I can just do the same replay I did in #3644 https://github.com/NVIDIA/Fuser/pull/3644/files#diff-abe2e10add90523ff6b18e1dc50da46762420e1011078ba47ab52140dc213b6fR80-R85.
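To make the allocation-domain point above concrete, here is a minimal sketch under stated assumptions: the helper name copyAllocationPermutation and its parameter names are hypothetical, and it simply mirrors the computePermutation/applyPermutation calls already shown in the diff rather than defining new behavior.

// Hedged sketch (helper name and parameters are hypothetical): carry the
// logical->allocation permutation of the original output (e.g. tv3 with an
// NHWC allocation domain) over to the replayed output (e.g. tv5).
void copyAllocationPermutation(
    TensorView* original_out,
    TensorView* replayed_out) {
  std::optional<std::vector<int64_t>> perm = ir_utils::computePermutation(
      original_out->getLogicalDomain(),
      original_out->getMaybeAllocationDomain());
  if (!perm.has_value()) {
    // Only a plain permutation is supported; bail out on anything more complex.
    return;
  }
  // For an NCHW logical domain with a channels-last allocation domain, perm is
  // {0, 2, 3, 1}; applying it to the replayed logical domain restores NHWC.
  replayed_out->setAllocationDomain(
      ir_utils::applyPermutation(
          replayed_out->getLogicalDomain(), perm.value()),
      /*new_contiguity=*/true);
}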


{
// Make sure squeeze/broadcast no longer exists
Fusion fusion_copy = fusion;
OptimizationPass<RemoveBcastSqueeze>::runPass(&fusion_copy);
auto new_exprs = fusion_copy.exprs();
EXPECT_EQ(
std::find_if(
new_exprs.begin(),
new_exprs.end(),
[](Expr* new_expr) {
return new_expr->isOneOf<BroadcastOp, SqueezeOp>();
}),
new_exprs.end());
}

auto options = at::TensorOptions().device(at::kCUDA, 0);
auto t0 = at::randn({2, 3, 4, 5}, options);
std::vector<c10::IValue> inputs = {t0};
FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto outputs = executor_cache.runFusionWithInputs(inputs);
// validate output permutation is preserved
ASSERT_TRUE(outputs[0].is_contiguous(at::MemoryFormat::ChannelsLast));
Collaborator (author):

Without the allocation domain update, this check would fail, and the optimization pass would be changing the user's intended behavior.

testValidate(executor_cache.fusion(), outputs, inputs, __LINE__, __FILE__);
}

TEST_F(PresegTest, FusionRemoveBroadcastSqueeze1) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr;
FusionGuard fg(&fusion);

auto tv0 = makeContigConcreteTensor({1, 3, 4, 5});
fusion.addInput(tv0);
auto tv1 = reshape(tv0, {1, 3, 4, 5}, {1, 3, 4 * 5});
// the reshape gives tv1 rfactor products in its IDs.
auto tv2 = relu(tv1);
auto tv3 = broadcast(tv2, {true, false, false, false});
fusion.addOutput(tv3);

{
// broadcast shouldn't be removed
Fusion fusion_copy = fusion;
OptimizationPass<RemoveBcastSqueeze>::runPass(&fusion_copy);
auto new_exprs = fusion_copy.exprs();
EXPECT_NE(
std::find_if(
new_exprs.begin(),
new_exprs.end(),
[](Expr* new_expr) { return new_expr->isA<BroadcastOp>(); }),
new_exprs.end());
}

auto options = at::TensorOptions().device(at::kCUDA, 0);
auto t0 = at::randn({1, 3, 4, 5}, options);
std::vector<c10::IValue> inputs = {t0};
FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto outputs = executor_cache.runFusionWithInputs(inputs);
testValidate(executor_cache.fusion(), outputs, inputs, __LINE__, __FILE__);
}

TEST_F(PresegTest, FusionRemoveBroadcastSqueeze2) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr;
FusionGuard fg(&fusion);

auto tv0 = makeContigConcreteTensor({2, 3, 4, 5});
fusion.addInput(tv0);
auto tv1 = broadcast(tv0, {true, false, false, false, false});
auto tv2 = relu(tv1);
// tv2 is also an output, so the remove broadcast squeeze pass will not replay
// the broadcast
fusion.addOutput(tv2);
auto tv3 = squeeze(tv2, {0});
fusion.addOutput(tv3);

{
// Make sure squeeze/broadcast is not removed from fusion.
Fusion fusion_copy = fusion;
OptimizationPass<RemoveBcastSqueeze>::runPass(&fusion_copy);
auto new_exprs = fusion_copy.exprs();
EXPECT_NE(
std::find_if(
new_exprs.begin(),
new_exprs.end(),
[](Expr* new_expr) {
return new_expr->isOneOf<BroadcastOp, SqueezeOp>();
}),
new_exprs.end());
}

auto options = at::TensorOptions().device(at::kCUDA, 0);
auto t0 = at::randn({2, 3, 4, 5}, options);
std::vector<c10::IValue> inputs = {t0};
FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto outputs = executor_cache.runFusionWithInputs(inputs);
testValidate(executor_cache.fusion(), outputs, inputs, __LINE__, __FILE__);
}

TEST_F(PresegTest, FusionRemoveBroadcastSqueeze3) {
auto fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr;
FusionGuard fg(&fusion);

auto tv0 = makeContigConcreteTensor({2, 3, 4, 5});
fusion.addInput(tv0);
auto tv1 = broadcast(tv0, {true, false, false, false, false});
// tv2 is permuted; we currently do not support swapping permute with axis
// ops.
auto tv2 = permute(tv1, {{0, 4}});
auto tv3 = squeeze(tv2, {4});
fusion.addOutput(tv3);

{
// Make sure squeeze/broadcast is not removed from fusion.
Fusion fusion_copy = fusion;
OptimizationPass<RemoveBcastSqueeze>::runPass(&fusion_copy);
auto new_exprs = fusion_copy.exprs();
EXPECT_NE(
std::find_if(
new_exprs.begin(),
new_exprs.end(),
[](Expr* new_expr) {
return new_expr->isOneOf<BroadcastOp, SqueezeOp>();
}),
new_exprs.end());
}

auto options = at::TensorOptions().device(at::kCUDA, 0);
auto t0 = at::randn({2, 3, 4, 5}, options);
std::vector<c10::IValue> inputs = {t0};
FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto outputs = executor_cache.runFusionWithInputs(inputs);
testValidate(executor_cache.fusion(), outputs, inputs, __LINE__, __FILE__);
}

} // namespace nvfuser::preseg_passes