[GPU] Add pattern to fuse tensor.extract_slice into forall producer
Signed-off-by: Max Dawkins <[email protected]>
Max Dawkins committed Nov 26, 2024
1 parent 4eb7167 commit 2f5056e
Showing 8 changed files with 409 additions and 0 deletions.
@@ -340,6 +340,27 @@ struct FuseCollapseShapeConsumers final
  }
};

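// Fuses a consumer tensor.extract_slice op into its scf.forall producer.
// The IR shape this pattern matches looks like the following (illustrative,
// adapted from the lit test added in this commit):
//
//   %0 = scf.forall ... shared_outs(%out = %empty) -> (tensor<8xf32>) { ... }
//   %1 = tensor.extract_slice %0[0] [%n] [1] : tensor<8xf32> to tensor<?xf32>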
struct FuseExtractSliceConsumers final
    : OpRewritePattern<tensor::ExtractSliceOp> {
  using OpRewritePattern::OpRewritePattern;
  LogicalResult matchAndRewrite(tensor::ExtractSliceOp extractSliceOp,
                                PatternRewriter &rewriter) const override {
    // Find the scf::ForallOp producer of the extract_slice source; the
    // corresponding tensor::ParallelInsertSliceOp is handled inside
    // fuseExtractSliceIntoProducerForall.
    auto forallOp = extractSliceOp.getSource().getDefiningOp<scf::ForallOp>();
    if (!forallOp) {
      return rewriter.notifyMatchFailure(extractSliceOp,
                                         "No forall op producer");
    }

    if (failed(fuseExtractSliceIntoProducerForall(rewriter, forallOp,
                                                  extractSliceOp))) {
      return failure();
    }
    return success();
  }
};

void GPUFuseAndHoistParallelLoopsPass::runOnOperation() {
  MLIRContext *context = &getContext();

@@ -385,6 +406,7 @@ void GPUFuseAndHoistParallelLoopsPass::runOnOperation() {
  patterns.add<FuseUnitLoopDestination>(context);
  patterns.add<FuseTilableForallConsumers>(context);
  patterns.add<FuseCollapseShapeConsumers>(context);
  patterns.add<FuseExtractSliceConsumers>(context);
  tensor::populateFoldTensorEmptyPatterns(patterns);
  scf::ForallOp::getCanonicalizationPatterns(patterns, context);
  if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) {
@@ -266,6 +266,54 @@ void transform_dialect::FuseCollapseShapeWithForallOp::getEffects(
  transform::modifiesPayload(effects);
}

//===---------------------------------------------------------------------===//
// FuseExtractSliceWithForallOp
//===---------------------------------------------------------------------===//

DiagnosedSilenceableFailure
transform_dialect::FuseExtractSliceWithForallOp::apply(
    transform::TransformRewriter &rewriter,
    transform::TransformResults &results, transform::TransformState &state) {
  auto producers = state.getPayloadOps(getProducer());
  auto consumers = state.getPayloadOps(getConsumer());

  int64_t numProducers = llvm::range_size(producers);
  int64_t numConsumers = llvm::range_size(consumers);
  if (numProducers != 1 || numConsumers != 1) {
    return mlir::emitDefiniteFailure(state.getTopLevel(),
                                     "More than one producer or consumer");
  }

  auto producer = dyn_cast<scf::ForallOp>(*producers.begin());
  if (!producer) {
    return mlir::emitDefiniteFailure(state.getTopLevel(),
                                     "Non-forall producer");
  }
  auto consumer = dyn_cast<tensor::ExtractSliceOp>(*consumers.begin());
  if (!consumer) {
    return mlir::emitDefiniteFailure(state.getTopLevel(),
                                     "Non-extract_slice consumer");
  }

  FailureOr<scf::ForallOp> fusedForallOp =
      GPU::fuseExtractSliceIntoProducerForall(rewriter, producer, consumer);
  if (failed(fusedForallOp)) {
    return mlir::emitSilenceableFailure(state.getTopLevel(),
                                        "failed to fuse extract_slice op");
  }

  results.set(getOperation()->getOpResult(0), {fusedForallOp.value()});
  return DiagnosedSilenceableFailure::success();
}

void transform_dialect::FuseExtractSliceWithForallOp::getEffects(
    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
  transform::consumesHandle(getProducerMutable(), effects);
  transform::consumesHandle(getConsumerMutable(), effects);
  transform::producesHandle(getOperation()->getOpResults(), effects);
  transform::modifiesPayload(effects);
}

} // namespace mlir::iree_compiler::IREE

void mlir::iree_compiler::registerTransformDialectIREEGPUExtension(
@@ -262,4 +262,38 @@ def FuseCollapseShapeWithForallOp : Op<Transform_Dialect, "iree.fuse_collapse_shape_with_forall",
  let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect";
}

def FuseExtractSliceWithForallOp : Op<Transform_Dialect, "iree.fuse_extract_slice_with_forall",
    [FunctionalStyleTransformOpTrait,
     DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
     DeclareOpInterfaceMethods<TransformOpInterface>,
     ReportTrackingListenerFailuresOpTrait]> {
  let description = [{
    Fuses a consumer tensor.extract_slice op into a producer scf.forall op.
    The block argument for the corresponding forall output operand must be
    used only by a tensor.parallel_insert_slice op and by tensor.extract_slice
    ops that extract an equivalent subset. After the fusion, the output of the
    forall will be the equivalent subset slice of the original output, and all
    users of this block argument will be clamped to the slice size. Additional
    tensor.pad ops will be inserted after any tensor.extract_slice users
    inside the forall so that types match. Similarly, a tensor.extract_slice
    op will be inserted before the tensor.parallel_insert_slice.

    #### Return modes
    Emits a definite failure if the producer handle does not map to exactly
    one scf.forall op, or if the consumer handle does not map to exactly one
    tensor.extract_slice op. Emits a silenceable failure if the fusion itself
    fails.
  }];

  let arguments = (
    ins TransformHandleTypeInterface:$producer,
        TransformHandleTypeInterface:$consumer
  );
  let results = (outs TransformHandleTypeInterface:$result);

  let assemblyFormat = [{
    $consumer `into` $producer attr-dict
    `:` functional-type(operands, results)
  }];
  let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect";
}

#endif // IREE_COMPILER_CODEGEN_DIALECT_GPU_TRANSFORMEXTENSIONS_IREEGPUEXTENSIONS
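For intuition, here is a hand-distilled before/after sketch of the rewrite this
op performs (condensed from the lit test below; SSA names and shapes are
illustrative, not part of the commit):

  // Before: the extract_slice consumes the forall result.
  %1 = scf.forall (%i) in (4) shared_outs(%out = %empty) -> (tensor<8xf32>) {
    ...
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %tile into %out[...] [2] [1] : tensor<2xf32> into tensor<8xf32>
    }
  }
  %slice = tensor.extract_slice %1[0] [%n] [1] : tensor<8xf32> to tensor<?xf32>

  // After: the forall writes directly into the sliced destination; per-thread
  // slice sizes are clamped, and tensor.pad restores the static tile type.
  %sliced_out = tensor.extract_slice %empty[0] [%n] [1] : tensor<8xf32> to tensor<?xf32>
  %1 = scf.forall (%i) in (4) shared_outs(%out = %sliced_out) -> (tensor<?xf32>) {
    ...
  }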
@@ -25,6 +25,7 @@ iree_lit_test_suite(
"lower_multi_mma.mlir",
"lower_vector_barrier.mlir",
"transform_fuse_collapse_shape_with_forall.mlir",
"transform_fuse_extract_slice_with_forall.mlir",
"transform_fuse_forall.mlir",
"transform_lower_barrier_region.mlir",
"vectorize_iree_gpu_ops.mlir",
@@ -21,6 +21,7 @@ iree_lit_test_suite(
"lower_multi_mma.mlir"
"lower_vector_barrier.mlir"
"transform_fuse_collapse_shape_with_forall.mlir"
"transform_fuse_extract_slice_with_forall.mlir"
"transform_fuse_forall.mlir"
"transform_lower_barrier_region.mlir"
"unroll_multi_mma.mlir"
@@ -0,0 +1,60 @@
// RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule -canonicalize -cse --split-input-file | FileCheck %s

#map = affine_map<(d0) -> (d0 * 2)>
module {
  func.func @fuse_extract_slice_with_forall(%arg0: tensor<8xf32>, %arg1: index) -> tensor<?xf32> {
    %0 = tensor.empty() : tensor<8xf32>
    %1 = scf.forall (%arg2) in (4) shared_outs(%arg3 = %0) -> (tensor<8xf32>) {
      %2 = affine.apply #map(%arg2)
      %extracted_slice_0 = tensor.extract_slice %arg0[%2] [2] [1] : tensor<8xf32> to tensor<2xf32>
      %extracted_slice_1 = tensor.extract_slice %arg3[%2] [2] [1] : tensor<8xf32> to tensor<2xf32>
      %3 = linalg.copy ins(%extracted_slice_0 : tensor<2xf32>) outs(%extracted_slice_1 : tensor<2xf32>) -> tensor<2xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %3 into %arg3[%2] [2] [1] : tensor<2xf32> into tensor<8xf32>
      }
    } {mapping = [#gpu.thread<x>]}
    %extracted_slice = tensor.extract_slice %1[0] [%arg1] [1] : tensor<8xf32> to tensor<?xf32>
    return %extracted_slice : tensor<?xf32>
  }
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %producer = transform.structured.match ops{["scf.forall"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %consumer = transform.get_consumers_of_result %producer[0] : (!transform.any_op) -> !transform.any_op
    %2 = transform.iree.fuse_extract_slice_with_forall %consumer into %producer
        : (!transform.any_op, !transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 2)>
// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0)[s0] -> (d0 * -2 + s0, 2)>
// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> (0, d0)>
// CHECK-DAG: #[[$MAP3:.+]] = affine_map<(d0) -> (-d0 + 2)>

// CHECK-LABEL: func @fuse_extract_slice_with_forall
// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor<8xf32>
// CHECK-SAME: %[[ARG1:[A-Za-z0-9]+]]: index

// CHECK-DAG: %[[ZERO:.+]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[EMPTY:.+]] = tensor.empty() : tensor<8xf32>
// CHECK-DAG: %[[SLICED_OUT:.+]] = tensor.extract_slice %[[EMPTY]][0] [%[[ARG1]]] [1] : tensor<8xf32> to tensor<?xf32>
// CHECK: %[[FORALL_RESULT:.+]] = scf.forall (%[[IDX:.+]]) in (4) shared_outs(%[[SLICED_BBARG:.+]] = %[[SLICED_OUT]]) -> (tensor<?xf32>) {
// CHECK-DAG: %[[SLICE_IDX:.+]] = affine.apply #[[$MAP]](%[[IDX]])
// CHECK-DAG: %[[SIZE_CLAMPED_HIGH:.+]] = affine.min #[[$MAP1]](%[[IDX]])[%[[ARG1]]]
// CHECK-DAG: %[[SIZE_CLAMPED_LOW:.+]] = affine.max #[[$MAP2]](%[[SIZE_CLAMPED_HIGH]])
// CHECK-DAG: %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[SLICE_IDX]]] [2] [1] : tensor<8xf32> to tensor<2xf32>
// CHECK-DAG: %[[OUT_SLICE:.+]] = tensor.extract_slice %[[SLICED_BBARG]][%[[SLICE_IDX]]] [%[[SIZE_CLAMPED_LOW]]] [1] : tensor<?xf32> to tensor<?xf32>
// CHECK-DAG: %[[PAD_HIGH:.+]] = affine.apply #[[$MAP3]](%[[SIZE_CLAMPED_LOW]])
// CHECK: %[[PADDED_OUT_SLICE:.+]] = tensor.pad %[[OUT_SLICE]] low[0] high[%[[PAD_HIGH]]] {
// CHECK: ^bb0({{.*}}):
// CHECK: tensor.yield %[[ZERO]] : f32
// CHECK: } : tensor<?xf32> to tensor<2xf32>
// CHECK: %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<2xf32>) outs(%[[PADDED_OUT_SLICE]] : tensor<2xf32>) -> tensor<2xf32>
// CHECK: %[[SLICED_COPY:.+]] = tensor.extract_slice %[[COPY]][0] [%[[SIZE_CLAMPED_LOW]]] [1] : tensor<2xf32> to tensor<?xf32>
// CHECK: scf.forall.in_parallel {
// CHECK: tensor.parallel_insert_slice %[[SLICED_COPY]] into %[[SLICED_BBARG]][%[[SLICE_IDX]]] [%[[SIZE_CLAMPED_LOW]]] [1] : tensor<?xf32> into tensor<?xf32>
// CHECK: }
// CHECK: } {mapping = [#gpu.thread<x>]}
// CHECK: return %[[FORALL_RESULT]]
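
To read the clamping arithmetic in the CHECK lines: with a static tile size of
2, #[[$MAP1]] with affine.min computes the remaining in-bounds size of the
?-sized result slice for a given thread, #[[$MAP2]] with affine.max clamps it
below at zero, and #[[$MAP3]] gives the amount of padding needed to recover the
static tile type. A worked instance (hypothetical values, %arg1 = 5 and thread
%arg2 = 2):

  size_hi  = min(5 - 2 * 2, 2) = 1   // #map1: thread's in-bounds size
  size     = max(0, size_hi)   = 1   // #map2: clamp below at zero
  pad_high = 2 - size          = 1   // #map3: pad back to tensor<2xf32>

so that thread copies a 1-element slice, pads it back to tensor<2xf32> for the
linalg.copy, and re-extracts the 1-element result before the
tensor.parallel_insert_slice.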