Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] C++ layer to implement Triu #3631

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions csrc/ops/arith.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,51 @@ TensorView* eye(Val* size, DataType dtype) {
return eye(size, size, dtype);
}

TensorView* triu(TensorView* tv, Val* offset) {
  NVF_CHECK(
      offset->getDataType() == DataType::Int, "offset must have type Int");

  NVF_CHECK(
      tv->nDims() >= 2,
      "triu is only supported for 2+D tensors, but got ",
      tv->nDims(),
      "D tensor");

  // Strategy: build a boolean mask over the trailing two dimensions and use
  // it to select between the input and zero.
  //
  // For a [2, 4] input with offset 0:
  //   row ramp [0, 1]        broadcast to  [[0, 0, 0, 0],
  //                                         [1, 1, 1, 1]]
  //   col ramp [0, 1, 2, 3]  broadcast to  [[0, 1, 2, 3],
  //                                         [0, 1, 2, 3]]
  //   mask = rows LE cols    gives         [[1, 1, 1, 1],
  //                                         [0, 1, 1, 1]]
  // A nonzero offset k is folded in by starting the column ramp at -k, so
  // the comparison becomes mask(i, j) == (i <= j - k).
  const auto ndims = tv->domain()->logical().size();

  // Column-index ramp over the innermost dimension, shifted by -offset.
  auto col_index = iota(
      tv->domain()->logical()[ndims - 1]->extent(),
      SimplifyingIrBuilder::mulExpr(
          offset, IrBuilder::create<Val>(-1, DataType::Index)),
      IrBuilder::create<Val>(1, DataType::Index),
      DataType::Int);

  // Row-index ramp over the second-to-last dimension.
  auto row_index = iota(
      tv->domain()->logical()[ndims - 2]->extent(),
      IrBuilder::create<Val>(0, DataType::Index),
      IrBuilder::create<Val>(1, DataType::Index),
      DataType::Int);

  // Lift each ramp to 2D, compare, and zero out everything below the
  // (offset-shifted) diagonal. `where` implicitly broadcasts the 2D mask
  // against inputs with extra leading batch dimensions.
  auto row_index_2d = broadcast(row_index, {false, true});
  auto col_index_2d = broadcast(col_index, {true, false});
  auto upper_mask = le(row_index_2d, col_index_2d);
  return where(upper_mask, tv, IrBuilder::create<Val>(0, tv->dtype()));
}

// UNARY OPERATIONS

#define NVFUSER_DEFINE_UNARY_OP(operator_name, operator_type) \
Expand Down
1 change: 1 addition & 0 deletions csrc/ops/arith.h
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ NVF_API TensorView* arange(
DataType dtype = DataType::Int);
NVF_API TensorView* eye(Val* size, DataType dtype);
NVF_API TensorView* eye(Val* rows, Val* cols, DataType dtype);
NVF_API TensorView* triu(TensorView* tv, Val* offset);

// UNARY OPERATIONS
// abs
Expand Down
29 changes: 29 additions & 0 deletions tests/cpp/test_tensor_factories.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,35 @@ TEST_F(TensorFactoryTest, StandaloneIota) {
}
}

TEST_F(TensorFactoryTest, SimpleTriu) {
  // Validate triu against ATen's reference implementation across several
  // shapes (including a batched 3D case) and a spread of diagonal offsets,
  // including offsets whose magnitude exceeds both dimensions.
  const std::vector<std::vector<int64_t>> shapes = {
      {64, 64}, {4, 16}, {16, 4}, {16, 8, 32}};
  const std::vector<int64_t> diagonals = {0, 1, 2, -1, -2, 200, -200};

  for (const auto& shape : shapes) {
    for (const auto diagonal : diagonals) {
      auto fusion_ptr = std::make_unique<Fusion>();
      FusionGuard guard(fusion_ptr.get());

      auto in_tv = makeSymbolicTensor(shape.size(), DataType::Half);
      fusion_ptr->addInput(in_tv);

      auto out_tv =
          triu(in_tv, IrBuilder::create<Val>(diagonal, DataType::Int));
      fusion_ptr->addOutput(out_tv);

      const auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA);
      auto input = at::randn(shape, options);

      FusionExecutorCache executor_cache(std::move(fusion_ptr));
      auto outputs = executor_cache.runFusionWithInputs({input});

      EXPECT_TRUE(at::equal(outputs[0], at::triu(input, diagonal)));
    }
  }
}

TEST_F(TensorFactoryTest, StandaloneARange) {
auto starts_ends = {-1., 0., 10.3, 1024. * 256};
auto steps = {-1.5, 1., 2.};
Expand Down
Loading