[GPU] Add pattern to fuse tensor.extract_slice into forall producer

Signed-off-by: Max Dawkins <[email protected]>
iree-org · Nov 26, 2024 · 2f5056e · 2f5056e
1 parent 4eb7167
commit 2f5056e
Show file tree

Hide file tree

Showing 8 changed files with 409 additions and 0 deletions.
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUFuseAndHoistParallelLoops.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUFuseAndHoistParallelLoops.cpp
@@ -340,6 +340,27 @@ struct FuseCollapseShapeConsumers final
   }
 };
 
+/// Fuses a tensor.extract_slice consumer into the scf.forall op that produces
+/// its source tensor.
+struct FuseExtractSliceConsumers final
+    : OpRewritePattern<tensor::ExtractSliceOp> {
+  using OpRewritePattern::OpRewritePattern;
+  LogicalResult matchAndRewrite(tensor::ExtractSliceOp extractSliceOp,
+                                PatternRewriter &rewriter) const override {
+    // Only slices taken directly from an scf::ForallOp result can be fused.
+    auto producerForall =
+        extractSliceOp.getSource().getDefiningOp<scf::ForallOp>();
+    if (!producerForall) {
+      return rewriter.notifyMatchFailure(extractSliceOp,
+                                         "No forall op producer");
+    }
+
+    // The fusion helper does the rewrite; only its status is reported here.
+    return success(succeeded(fuseExtractSliceIntoProducerForall(
+        rewriter, producerForall, extractSliceOp)));
+  }
+};
+
 void GPUFuseAndHoistParallelLoopsPass::runOnOperation() {
   MLIRContext *context = &getContext();
 
@@ -385,6 +406,7 @@ void GPUFuseAndHoistParallelLoopsPass::runOnOperation() {
     patterns.add<FuseUnitLoopDestination>(context);
     patterns.add<FuseTilableForallConsumers>(context);
     patterns.add<FuseCollapseShapeConsumers>(context);
+    patterns.add<FuseExtractSliceConsumers>(context);
     tensor::populateFoldTensorEmptyPatterns(patterns);
     scf::ForallOp::getCanonicalizationPatterns(patterns, context);
     if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensions.cpp
@@ -266,6 +266,54 @@ void transform_dialect::FuseCollapseShapeWithForallOp::getEffects(
   transform::modifiesPayload(effects);
 }
 
+//===---------------------------------------------------------------------===//
+// FuseExtractSliceWithForallOp
+//===---------------------------------------------------------------------===//
+
+// Fuses the single tensor.extract_slice consumer payload op into the single
+// scf.forall producer payload op; the result handle maps to the fused forall.
+DiagnosedSilenceableFailure
+transform_dialect::FuseExtractSliceWithForallOp::apply(
+    transform::TransformRewriter &rewriter,
+    transform::TransformResults &results, transform::TransformState &state) {
+  auto producers = state.getPayloadOps(getProducer());
+  auto consumers = state.getPayloadOps(getConsumer());
+  // Empty handles are rejected as well, hence "exactly one" in the message.
+  if (llvm::range_size(producers) != 1 || llvm::range_size(consumers) != 1) {
+    return mlir::emitDefiniteFailure(
+        state.getTopLevel(), "Expected exactly one producer and one consumer");
+  }
+
+  auto producer = dyn_cast<scf::ForallOp>(*producers.begin());
+  if (!producer) {
+    return mlir::emitDefiniteFailure(state.getTopLevel(),
+                                     "Non-forall producer");
+  }
+  auto consumer = dyn_cast<tensor::ExtractSliceOp>(*consumers.begin());
+  if (!consumer) {
+    return mlir::emitDefiniteFailure(state.getTopLevel(),
+                                     "Non-extract_slice consumer");
+  }
+
+  FailureOr<scf::ForallOp> fusedForallOp =
+      GPU::fuseExtractSliceIntoProducerForall(rewriter, producer, consumer);
+  if (failed(fusedForallOp)) {
+    return mlir::emitSilenceableFailure(state.getTopLevel(),
+                                        "failed to fuse extract_slice op");
+  }
+
+  results.set(getOperation()->getOpResult(0), {fusedForallOp.value()});
+  return DiagnosedSilenceableFailure::success();
+}
+
+void transform_dialect::FuseExtractSliceWithForallOp::getEffects(
+    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+  transform::consumesHandle(getProducerMutable(), effects); // Forall handle.
+  transform::consumesHandle(getConsumerMutable(), effects); // Slice handle.
+  transform::producesHandle(getOperation()->getOpResults(), effects);
+  transform::modifiesPayload(effects); // apply() rewrites the payload IR.
+}
+
 } // namespace mlir::iree_compiler::IREE
 
 void mlir::iree_compiler::registerTransformDialectIREEGPUExtension(

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensionsOps.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensionsOps.td
@@ -262,4 +262,38 @@ def FuseCollapseShapeWithForallOp : Op<Transform_Dialect, "iree.fuse_collapse_sh
   let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect";
 }
 
+def FuseExtractSliceWithForallOp : Op<Transform_Dialect, "iree.fuse_extract_slice_with_forall",
+    [FunctionalStyleTransformOpTrait,
+     DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+     DeclareOpInterfaceMethods<TransformOpInterface>,
+     ReportTrackingListenerFailuresOpTrait]> {
+  let description = [{
+    Fuses a consumer tensor.extract_slice op into a producer scf.forall op.
+    The users of the block argument for the corresponding forall output operand
+    should be only a tensor.parallel_insert_slice op, and tensor.extract_slice
+    ops that extract an equivalent subset. After the fusion, the output of the
+    forall will be an equal subset slice of the original output, and all users
+    of this block arg will be clamped to the slice size. Additional tensor.pad
+    ops will be inserted after any tensor.extract_slice users inside the forall
+    so that types match. Similarly, a tensor.extract_slice op will be inserted
+    before the tensor.parallel_insert_slice.
+
+    #### Return modes
+    Definite failure unless the handles map to exactly one scf.forall producer
+    and one tensor.extract_slice consumer; silenceable failure if fusion fails.
+  }];
+
+  let arguments = (
+      ins TransformHandleTypeInterface:$producer,
+          TransformHandleTypeInterface:$consumer
+  );
+  let results = (outs TransformHandleTypeInterface:$result);
+
+  let assemblyFormat = [{
+    $consumer `into` $producer attr-dict
+    `:` functional-type(operands, results)
+  }];
+  let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect";
+}
+
 #endif // IREE_COMPILER_CODEGEN_DIALECT_GPU_TRANSFORMEXTENSIONS_IREEGPUEXTENSIONS
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/BUILD.bazel
@@ -25,6 +25,7 @@ iree_lit_test_suite(
             "lower_multi_mma.mlir",
             "lower_vector_barrier.mlir",
             "transform_fuse_collapse_shape_with_forall.mlir",
+            "transform_fuse_extract_slice_with_forall.mlir",
             "transform_fuse_forall.mlir",
             "transform_lower_barrier_region.mlir",
             "vectorize_iree_gpu_ops.mlir",

diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/CMakeLists.txt
@@ -21,6 +21,7 @@ iree_lit_test_suite(
     "lower_multi_mma.mlir"
     "lower_vector_barrier.mlir"
     "transform_fuse_collapse_shape_with_forall.mlir"
+    "transform_fuse_extract_slice_with_forall.mlir"
     "transform_fuse_forall.mlir"
     "transform_lower_barrier_region.mlir"
     "unroll_multi_mma.mlir"

diff --git a/...odegen/Dialect/GPU/TransformExtensions/test/transform_fuse_extract_slice_with_forall.mlir b/...odegen/Dialect/GPU/TransformExtensions/test/transform_fuse_extract_slice_with_forall.mlir
@@ -0,0 +1,60 @@
+// RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule -canonicalize -cse --split-input-file | FileCheck %s
+
+#map = affine_map<(d0) -> (d0 * 2)>
+module {
+  func.func @fuse_extract_slice_with_forall(%arg0: tensor<8xf32>, %arg1: index) -> tensor<?xf32> {
+    %0 = tensor.empty() : tensor<8xf32>
+    %1 = scf.forall (%arg2) in (4) shared_outs(%arg3 = %0) -> (tensor<8xf32>) { // Producer: each of the 4 iterations writes 2 elements.
+      %2 = affine.apply #map(%arg2)
+      %extracted_slice_0 = tensor.extract_slice %arg0[%2] [2] [1] : tensor<8xf32> to tensor<2xf32>
+      %extracted_slice_1 = tensor.extract_slice %arg3[%2] [2] [1] : tensor<8xf32> to tensor<2xf32>
+      %3 = linalg.copy ins(%extracted_slice_0 : tensor<2xf32>) outs(%extracted_slice_1 : tensor<2xf32>) -> tensor<2xf32>
+      scf.forall.in_parallel {
+        tensor.parallel_insert_slice %3 into %arg3[%2] [2] [1] : tensor<2xf32> into tensor<8xf32>
+      }
+    } {mapping = [#gpu.thread<x>]}
+    %extracted_slice = tensor.extract_slice %1[0] [%arg1] [1] : tensor<8xf32> to tensor<?xf32> // Consumer to fuse; its size %arg1 is dynamic.
+    return %extracted_slice : tensor<?xf32>
+  }
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %producer = transform.structured.match ops{["scf.forall"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    %consumer = transform.get_consumers_of_result %producer[0] : (!transform.any_op) -> !transform.any_op // The tensor.extract_slice user of the forall result.
+    %2 = transform.iree.fuse_extract_slice_with_forall %consumer into %producer
+      : (!transform.any_op, !transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 2)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (0, d0)>
+// CHECK-DAG: #[[$MAP3:.+]] = affine_map<(d0) -> (-d0 + 2)>
+
+// CHECK-LABEL: func @fuse_extract_slice_with_forall
+//  CHECK-SAME:   %[[ARG0:[A-Za-z0-9]+]]: tensor<8xf32>
+//  CHECK-SAME:   %[[ARG1:[A-Za-z0-9]+]]: index
+
+//   CHECK-DAG:   %[[ZERO:.+]] = arith.constant 0.000000e+00 : f32
+//   CHECK-DAG:   %[[EMPTY:.+]] = tensor.empty() : tensor<8xf32>
+//   CHECK-DAG:   %[[SLICED_OUT:.+]] = tensor.extract_slice %[[EMPTY]][0] [%[[ARG1]]] [1] : tensor<8xf32> to tensor<?xf32>
+//       CHECK:   %[[FORALL_RESULT:.+]] = scf.forall (%[[IDX:.+]]) in (4) shared_outs(%[[SLICED_BBARG:.+]] = %[[SLICED_OUT]]) -> (tensor<?xf32>) {
+//   CHECK-DAG:   %[[SLICE_IDX:.+]] = affine.apply #[[$MAP]](%[[IDX]])
+//   CHECK-DAG:   %[[SIZE_CLAMPED_HIGH:.+]] = affine.min #[[$MAP1]](%[[IDX]])[%[[ARG1]]]
+//   CHECK-DAG:   %[[SIZE_CLAMPED_LOW:.+]] = affine.max #[[$MAP2]](%[[SIZE_CLAMPED_HIGH]])
+//   CHECK-DAG:   %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[SLICE_IDX]]] [2] [1] : tensor<8xf32> to tensor<2xf32>
+//   CHECK-DAG:   %[[OUT_SLICE:.+]] = tensor.extract_slice %[[SLICED_BBARG]][%[[SLICE_IDX]]] [%[[SIZE_CLAMPED_LOW]]] [1] : tensor<?xf32> to tensor<?xf32>
+//   CHECK-DAG:   %[[PAD_HIGH:.+]] = affine.apply #[[$MAP3]](%[[SIZE_CLAMPED_LOW]])
+//       CHECK:   %[[PADDED_OUT_SLICE:.+]] = tensor.pad %[[OUT_SLICE]] low[0] high[%[[PAD_HIGH]]] {
+//       CHECK:   ^bb0({{.*}}):
+//       CHECK:     tensor.yield %[[ZERO]] : f32
+//       CHECK:   } : tensor<?xf32> to tensor<2xf32>
+//       CHECK:   %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<2xf32>) outs(%[[PADDED_OUT_SLICE]] : tensor<2xf32>) -> tensor<2xf32>
+//       CHECK:   %[[SLICED_COPY:.+]] = tensor.extract_slice %[[COPY]][0] [%[[SIZE_CLAMPED_LOW]]] [1] : tensor<2xf32> to tensor<?xf32>
+//       CHECK:     scf.forall.in_parallel {
+//       CHECK:       tensor.parallel_insert_slice %[[SLICED_COPY]] into %[[SLICED_BBARG]][%[[SLICE_IDX]]] [%[[SIZE_CLAMPED_LOW]]] [1] : tensor<?xf32> into tensor<?xf32>
+//       CHECK:     }
+//       CHECK:   } {mapping = [#gpu.thread<x>]}
+//       CHECK:   return %[[FORALL_RESULT]]