Introduce repeat and RepeatOp #3687

Merged
4 commits merged on Jan 10, 2025
Changes from 2 commits
10 changes: 10 additions & 0 deletions csrc/device_lower/pass/fusion_simplifier.cpp
@@ -56,6 +56,16 @@ class LoadStoreOpInserter : private kir::ExprMutator {
container, LoadStoreOpType::Set, out, in));
}

void handle(RepeatOp* op) final {
auto out = op->out();
auto in = op->in();
auto container = out->container();
registerReplaceAndPropagate(
op,
IrBuilder::createInContainer<LoadStoreOp>(
container, LoadStoreOpType::Set, out, in));
}

void handle(ViewOp* vop) final {
auto out = vop->out();
auto in = vop->in();
1 change: 1 addition & 0 deletions csrc/device_lower/utils.cpp
@@ -162,6 +162,7 @@ bool isTvOp(const Expr* expr) {
BroadcastOp,
SqueezeOp,
ExpandOp,
RepeatOp,
ViewAsScalar,
ViewOp,
PadOp,
1 change: 1 addition & 0 deletions csrc/dispatch.h
@@ -96,6 +96,7 @@ class Val;
f(BroadcastOp); \
f(SqueezeOp); \
f(ExpandOp); \
f(RepeatOp); \
f(ViewAsScalar); \
f(ViewOp); \
f(CatOp); \
2 changes: 1 addition & 1 deletion csrc/id_model/predicate_indexing.cpp
@@ -26,7 +26,7 @@ std::vector<IterDomain*> getPredicateDomains(
: consumer_tv->getLogicalDomain();

// Broadcast domains should not need to be predicated. Note that
// unlike indexing for TensorIndex, reduction doamins do need to be
// unlike indexing for TensorIndex, reduction domains do need to be
// indexed to guard the access to the producer tensor
predicate_domains.erase(
std::remove_if(
36 changes: 36 additions & 0 deletions csrc/ir/internal_nodes.h
@@ -1527,6 +1527,42 @@ class ExpandOp : public Expr {
const std::vector<PolymorphicValue>& inputs) const override;
};

// Represents a repetition of broadcast IDs. Repetitions of
// non-broadcast IDs are represented using the broadcast, expand and
// reshape pattern. See the repeat op implementation in ops/alias.cpp
// as well as the TranslateRepeatToExpand preseg pass.
class RepeatOp : public Expr {
public:
using Expr::Expr;

// in: Input tensor that has broadcast logical IDs.
// out: Output tensor where some of the input broadcast logical IDs
// are converted to concrete IDs. Their extents represent the
// repetition factor of each ID.
RepeatOp(IrBuilderPasskey, TensorView* out, TensorView* in);

NVFUSER_DECLARE_CLONE_AND_CREATE

const char* getOpString() const override {
return "RepeatOp";
}

std::string toString(int indent_size = 0) const override;
std::string toInlineString(int indent_size = 0) const override;

TensorView* out() const {
return output(0)->as<TensorView>();
}

TensorView* in() const {
return input(0)->as<TensorView>();
}

std::vector<PolymorphicValue> evaluate(
const ExpressionEvaluator& ee,
const std::vector<PolymorphicValue>& inputs) const override;
};

class ViewAsScalar : public Expr {
public:
using Expr::Expr;
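As a reader's note on the class comment above: the broadcast-expand-reshape pattern used for repeating non-broadcast IDs can be sanity-checked with plain ATen. This is a minimal sketch for illustration only; the shapes and the repetition factor are assumptions, not taken from this PR.

#include <ATen/ATen.h>

int main() {
  // Repeat dim 0 of a [4, 2] tensor 3 times.
  at::Tensor t = at::arange(8).view({4, 2});

  // Step 1: insert a size-1 dim immediately outside the repeated dim.
  at::Tensor b = t.unsqueeze(0);          // [1, 4, 2]
  // Step 2: expand the new dim by the repetition factor.
  at::Tensor e = b.expand({3, 4, 2});     // [3, 4, 2]
  // Step 3: flatten the expanded dim into the repeated dim.
  at::Tensor r = e.reshape({12, 2});      // [12, 2]

  // Same result as repeating dim 0 directly.
  return r.equal(t.repeat({3, 1})) ? 0 : 1;
}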
68 changes: 68 additions & 0 deletions csrc/ir/nodes.cpp
@@ -2135,6 +2135,74 @@ std::vector<PolymorphicValue> ExpandOp::evaluate(

NVFUSER_DEFINE_CLONE_AND_CREATE(ExpandOp)

RepeatOp::RepeatOp(IrBuilderPasskey passkey, TensorView* out, TensorView* in)
: Expr(passkey) {
auto in_domain = TensorDomain::noReductions(in->getLogicalDomain());
const auto& out_domain = out->getLogicalDomain();

NVF_ERROR(in_domain.size() == out_domain.size());

bool repetition_found = false;
for (const auto i : c10::irange(in_domain.size())) {
if (in_domain.at(i)->isBroadcast() && !out_domain.at(i)->isBroadcast()) {
NVF_ERROR(!in_domain.at(i)->hasExpandedExtent());
NVF_ERROR(in_domain.at(i)->extent()->isOneInt());
repetition_found = true;
}
}

NVF_ERROR(
repetition_found,
"No repetition dim found: ",
out->toString(),
", ",
in->toString());

addOutput(out);
addInput(in);
}

std::string RepeatOp::toString(int indent_size) const {
std::stringstream ss;
indent(ss, indent_size) << out()->toString() << " = repeat( " << in()
<< " )\n";
return ss.str();
}

std::string RepeatOp::toInlineString(int indent_size) const {
NVF_CHECK(false, "Tensor op can not be printed inline");
[Review comment]
Collaborator: Question for my understanding: is it correct that only IterDomain ops can be printed inline according to our convention?
Collaborator (Author): Looks like that's the case, but I just copied this from ExpandOp::toInlineString.
}

std::vector<PolymorphicValue> RepeatOp::evaluate(
const ExpressionEvaluator& ee,
const std::vector<PolymorphicValue>& inputs) const {
NVF_ERROR(
inputs.size() == 1,
"ConcretizeOp expects exactly 1 input, but received ",
naoyam marked this conversation as resolved.
Show resolved Hide resolved
inputs.size());
auto tensor = inputs.at(0).as<at::Tensor>();
std::vector<int64_t> sizes;
sizes.reserve(out()->getLogicalDomain().size());
const auto c2p =
PairwiseLogicalDomainMap(in(), out()).mapConsumerToProducer();
for (const auto i : c10::irange(out()->getLogicalDomain().size())) {
[Review comment]
Collaborator: Skip reduction?
Collaborator (Author): Since this is the output, it should not have a reduction ID. I added an assertion in the constructor.

auto out_id = out()->getLogicalDomain().at(i);
auto inp_id = c2p.at(out_id);
auto out_extent = ee.evaluate(out_id->extent()).as<int64_t>();
auto inp_extent = ee.evaluate(inp_id->extent()).as<int64_t>();
NVF_ERROR(
out_extent == inp_extent || out_extent % inp_extent == 0,
"Invalid input and output extents: ",
inp_extent,
", ",
out_extent);
sizes.push_back(out_extent / inp_extent);
}
return {tensor.repeat(sizes)};
}

NVFUSER_DEFINE_CLONE_AND_CREATE(RepeatOp)

ViewAsScalar::ViewAsScalar(
IrBuilderPasskey passkey,
Val* out,
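For intuition on RepeatOp::evaluate above: the factor passed to at::Tensor::repeat for each dimension is out_extent / inp_extent, which is 1 everywhere except where a broadcast input ID was concretized. A minimal standalone sketch, with shapes that are assumptions for illustration:

#include <ATen/ATen.h>
#include <cstdint>
#include <vector>

int main() {
  // Input logical domain [b{1}, i{8}]; output logical domain [i{4}, i{8}].
  at::Tensor in = at::arange(8).view({1, 8});

  // sizes[d] = out_extent / inp_extent -> {4 / 1, 8 / 8} = {4, 1}
  std::vector<int64_t> sizes = {4, 1};
  at::Tensor out = in.repeat(sizes);      // shape [4, 8]

  return (out.size(0) == 4 && out.size(1) == 8) ? 0 : 1;
}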
4 changes: 4 additions & 0 deletions csrc/logical_domain_map.h
@@ -504,6 +504,10 @@ class ComputeAtLogicalDomainMapBuilder : private BackwardVisitor {
mapPointwiseLikeOp(op);
}

void handle(RepeatOp* op) override {
mapPointwiseLikeOp(op);
[Review comment]
wujingyue (Jan 9, 2025): Is it supposed to be mapped as pointwise? The input and the output don't even have the same extent.

}

void handle(PadOp* op) override {
// For compute-at, padded id should be mapped
mapPointwiseLikeOp(op);
84 changes: 84 additions & 0 deletions csrc/ops/alias.cpp
@@ -1124,4 +1124,88 @@ TensorView* expand_as(TensorView* inp, TensorView* other) {
return out_tensor;
}

TensorView* repeat(TensorView* inp_tv, std::vector<int64_t> repeat_times) {
const auto ndims =
TensorDomain::noReductions(inp_tv->getLogicalDomain()).size();

// Handle repetitions of non-broadcast IDs first. Each ID is
// individually repeated by:
//
// Step 1. Insert a broadcast ID immediately outside of the
// repeated ID
// Step 2. Expand the broadcast ID by the repetition factor
// Step 3. Flatten the expanded ID and the repeated ID

bool has_repetition_of_broadcast = false;
auto intermediate_tv = inp_tv;
for (const auto i : c10::irange(ndims)) {
if (repeat_times.at(i) == 1) {
continue;
}

auto inp_id = intermediate_tv->getLogicalDomain().at(i);

// Broadcast is handled after this
if (inp_id->isBroadcast()) {
has_repetition_of_broadcast = true;
continue;
}

// Step 1: Insert a broadcast ID
std::vector<bool> bcast_flags(ndims + 1, false);
bcast_flags.at(i) = true;
auto broadcast_tv = broadcast(intermediate_tv, bcast_flags);

// Step 2: Expand the broadcast ID for the repetition factor
std::vector<Val*> expanded_sizes(
bcast_flags.size(), IrBuilder::create<Val>(-1L));
expanded_sizes.at(i) = IrBuilder::create<Val>(repeat_times.at(i));
auto expanded_tv = expand(broadcast_tv, expanded_sizes);

// Step 3: Reshape to merge the broadcast ID and the repeated ID
intermediate_tv = flatten(expanded_tv, (int64_t)i, (int64_t)i + 1);
}

if (!has_repetition_of_broadcast) {
return intermediate_tv;
}

// Repeat broadcast IDs. The expand approach doesn't work here, as
// reshape would just squeeze the repeated IDs and thus there would be
// no merge; the expanded IDs would remain expanded broadcast IDs. To
// concretize them, use RepeatOp.
std::vector<IterDomain*> new_domain;
new_domain.reserve(ndims);
std::vector<std::optional<bool>> new_contiguity;
new_contiguity.reserve(ndims);

for (const auto i : c10::irange(ndims)) {
auto inp_id = intermediate_tv->getLogicalDomain().at(i);
IterDomain* new_id = nullptr;

if (repeat_times.at(i) > 1 && inp_id->isBroadcast()) {
new_id = IterDomainBuilder(inp_id)
.extent(IrBuilder::create<Val>(
repeat_times.at(i), DataType::Index))
.iter_type(IterType::Iteration)
.build();
} else {
new_id = inp_id->cloneWithoutRFactor();
}

new_domain.push_back(new_id);
new_contiguity.push_back(
new_id->isBroadcast() ? std::optional<bool>(std::nullopt)
: std::optional<bool>(true));
}

auto out_tv = IrBuilder::create<TensorView>(
IrBuilder::create<TensorDomain>(new_domain, new_contiguity),
inp_tv->dtype());

IrBuilder::create<RepeatOp>(out_tv, intermediate_tv);

return out_tv;
}

} // namespace nvfuser
5 changes: 5 additions & 0 deletions csrc/ops/alias.h
@@ -182,4 +182,9 @@ NVF_API TensorView* expand(
// non broadcasted iter domain, inp will be expanded to other's size.
NVF_API TensorView* expand_as(TensorView* inp, TensorView* other);

// Repeat each dimension a given number of times. The repeat_times parameter
// must have the same number of elements as the dimensionality of the
// input tensor (excluding reduction IDs).
NVF_API TensorView* repeat(TensorView* inp, std::vector<int64_t> repeat_times);

} // namespace nvfuser
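A rough usage sketch of the new API in a test-style fusion body. This is not part of the diff; makeConcreteTensor is assumed to be the usual helper from the C++ test utilities, and the shapes are arbitrary:

Fusion fusion;
FusionGuard fg(&fusion);

// 2D input; repeat the first dimension 3 times.
TensorView* tv0 = makeConcreteTensor({4, 8});
fusion.addInput(tv0);

// Equivalent to at::Tensor::repeat({3, 1}); the output logical shape becomes {12, 8}.
TensorView* tv1 = repeat(tv0, {3, 1});
fusion.addOutput(tv1);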
48 changes: 18 additions & 30 deletions csrc/preseg_passes/translate_repeat_to_expand.cpp
@@ -124,13 +124,11 @@ class RepeatToExpandTranslator {
}
}

// For each detected repetition:
//
// Step 1. Insert a broadcast ID immediately outside of the
// repeated ID
// Step 2. Expand the broadcast ID by the repetition factor
// Step 3. Flatten the expanded ID and the repeated ID
// For each detected repetition, replace the output with a repeat
// output.
void translate() {
FusionGuard fg(fusion_);

const auto exprs = fusion_->exprs();
// Apply the translation in a reverse topological order. Since the
// output of the repetition is replaced, the use exprs of the
Expand All @@ -145,36 +143,26 @@ class RepeatToExpandTranslator {

const auto& info = repeat_info_map_it->second;

if (info.cat_inp_tvs.size() < 2) {
const auto num_repetitions = (int64_t)info.cat_inp_tvs.size();

if (num_repetitions < 2) {
continue;
}

auto original_out_tv = expr->output(0)->as<TensorView>();

// Step 1
auto inp_domain =
const auto inp_domain =
TensorDomain::noReductions(info.input_tv->getLogicalDomain());
std::vector<bool> bcast_flags(inp_domain.size() + 1, false);
auto repeated_id_offset = std::distance(
inp_domain.begin(),
std::find(inp_domain.begin(), inp_domain.end(), info.repeated_id));
bcast_flags.at(repeated_id_offset) = true;
auto broadcast_tv = broadcast(info.input_tv, bcast_flags);
NVF_ERROR((size_t)broadcast_tv->nDims() == inp_domain.size() + 1);

// Step 2
std::vector<Val*> expanded_sizes(
bcast_flags.size(), IrBuilder::create<Val>(-1L));
expanded_sizes.at(repeated_id_offset) =
IrBuilder::create<Val>((int64_t)info.cat_inp_tvs.size());
auto expanded_tv = expand(broadcast_tv, expanded_sizes);

// Step 3
auto flattened_tv =
flatten(expanded_tv, repeated_id_offset, repeated_id_offset + 1);

std::vector<int64_t> repeated_times(inp_domain.size(), 1);
auto repeated_id_it =
std::find(inp_domain.begin(), inp_domain.end(), info.repeated_id);
NVF_ERROR(repeated_id_it != inp_domain.end());
auto repeated_dim = std::distance(inp_domain.begin(), repeated_id_it);
repeated_times.at(repeated_dim) = num_repetitions;

TensorView* replacement_tv = repeat(info.input_tv, repeated_times);

ir_utils::replaceValInAllExprInputsAndFusionOutputs(
original_out_tv, flattened_tv);
expr->output(0), replacement_tv);
}
}
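For context, a rough before/after sketch of the rewrite this pass performs. The assumption that the repeat pattern arrives as a concatenation of the same input is inferred from cat_inp_tvs in the surrounding code; the shapes and the cat call below are illustrative only, not taken from this diff:

// Before the pass: the same tensor concatenated N times along one dimension.
TensorView* tv0 = makeConcreteTensor({4, 8});
TensorView* tv1 = cat({tv0, tv0, tv0}, /*dim=*/0);   // logical shape {12, 8}

// After the pass: tv1's definition is effectively replaced by
//   TensorView* replacement = repeat(tv0, {3, 1});
// and all uses of tv1 (including fusion outputs) are redirected to the replacement.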
