// RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule --split-input-file | FileCheck %s

// Payload IR: an scf.forall (thread-mapped) that copies 2x8 tiles of %arg0
// into an 8x8 destination, followed by a tensor.collapse_shape of the forall
// result. The transform script below fuses the collapse_shape into the forall.
#map = affine_map<(d0) -> (d0 * 2)>
module {
  func.func @fuse_collapse_shape_with_forall(%arg0: tensor<8x8xf32>) -> tensor<64xf32> {
    %0 = tensor.empty() : tensor<8x8xf32>
    %1 = scf.forall (%arg1) in (4) shared_outs(%arg2 = %0) -> (tensor<8x8xf32>) {
      %2 = affine.apply #map(%arg1)
      %extracted_slice = tensor.extract_slice %arg0[%2, 0] [2, 8] [1, 1] : tensor<8x8xf32> to tensor<2x8xf32>
      %extracted_slice_0 = tensor.extract_slice %arg2[%2, 0] [2, 8] [1, 1] : tensor<8x8xf32> to tensor<2x8xf32>
      %3 = linalg.copy ins(%extracted_slice : tensor<2x8xf32>) outs(%extracted_slice_0 : tensor<2x8xf32>) -> tensor<2x8xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %3 into %arg2[%2, 0] [2, 8] [1, 1] : tensor<2x8xf32> into tensor<8x8xf32>
      }
    } {mapping = [#gpu.thread<x>]}
    %collapsed = tensor.collapse_shape %1 [[0, 1]] : tensor<8x8xf32> into tensor<64xf32>
    return %collapsed : tensor<64xf32>
  }
}

// Transform script: match the scf.forall producer and the
// tensor.collapse_shape consumer, then fuse the collapse into the forall
// via the IREE-specific fuse_collapse_shape_with_forall op.
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %producer = transform.structured.match ops{["scf.forall"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %consumer = transform.structured.match ops{["tensor.collapse_shape"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %2 = transform.iree.fuse_collapse_shape_with_forall %consumer into %producer
      : (!transform.any_op, !transform.any_op) -> !transform.any_op
    transform.yield
  }
}

// Expected result: the forall now iterates over the collapsed 64-element
// shared_out; the per-thread 2D slice indices are linearized with
// affine.linearize_index, the output slice is expanded back to 2x8 for the
// copy, and the copy result is re-collapsed before parallel_insert_slice.
// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0) -> (d0 * 2)>

// CHECK-LABEL: func @fuse_collapse_shape_with_forall
//  CHECK-SAME:   %[[ARG0:[A-Za-z0-9]+]]: tensor<8x8xf32>

//       CHECK:   %[[EMPTY:.+]] = tensor.empty() : tensor<8x8xf32>
//       CHECK:   %[[COLLAPSED_OUT:.+]] = tensor.collapse_shape %[[EMPTY]] {{\[}}[0, 1]] : tensor<8x8xf32> into tensor<64xf32>
//       CHECK:   %[[FORALL_RESULT:.+]] = scf.forall (%[[IDX:.+]]) in (4) shared_outs(%[[COLLAPSED_BBARG:.+]] = %[[COLLAPSED_OUT]]) -> (tensor<64xf32>) {
//   CHECK-DAG:     %[[SLICE_IDX_0:.+]] = affine.apply #[[$MAP]](%[[IDX]])
//   CHECK-DAG:     %[[SLICE_IDX_1:.+]] = arith.constant 0 : index
//       CHECK:     %[[LINEAR_SLICE_IDX:.+]] = affine.linearize_index [%[[SLICE_IDX_0]], %[[SLICE_IDX_1]]] by (8, 8) : index
//       CHECK:     %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[SLICE_IDX_0]], 0] [2, 8] [1, 1] : tensor<8x8xf32> to tensor<2x8xf32>
//       CHECK:     %[[OUT_SLICE:.+]] = tensor.extract_slice %[[COLLAPSED_BBARG]][%[[LINEAR_SLICE_IDX]]] [16] [1] : tensor<64xf32> to tensor<16xf32>
//       CHECK:     %[[EXPANDED_OUT_SLICE:.+]] = tensor.expand_shape %[[OUT_SLICE]] {{\[}}[0, 1]] output_shape [2, 8] : tensor<16xf32> into tensor<2x8xf32>
//       CHECK:     %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<2x8xf32>) outs(%[[EXPANDED_OUT_SLICE]] : tensor<2x8xf32>) -> tensor<2x8xf32>
//       CHECK:     %[[COLLAPSED_COPY:.+]] = tensor.collapse_shape %[[COPY]] {{\[}}[0, 1]] : tensor<2x8xf32> into tensor<16xf32>
//       CHECK:     scf.forall.in_parallel {
//       CHECK:       tensor.parallel_insert_slice %[[COLLAPSED_COPY]] into %[[COLLAPSED_BBARG]][%[[LINEAR_SLICE_IDX]]] [16] [1] : tensor<16xf32> into tensor<64xf32>
//       CHECK:     }
//       CHECK:   } {mapping = [#gpu.thread<x>]}
//       CHECK:   return %[[FORALL_RESULT]]