Skip to content

Commit e5569a2

Browse files
Soumya AR authored and rsandifo-arm committed
aarch64: Use SVE ASRD instruction with Neon modes.
The ASRD instruction on SVE performs an arithmetic shift right by an immediate
for divide.

This patch enables the use of ASRD with Neon modes.

For example:

    int in[N], out[N];

    void
    foo (void)
    {
      for (int i = 0; i < N; i++)
        out[i] = in[i] / 4;
    }

compiles to:

    ldr     q31, [x1, x0]
    cmlt    v30.16b, v31.16b, #0
    and     z30.b, z30.b, 3
    add     v30.16b, v30.16b, v31.16b
    sshr    v30.16b, v30.16b, 2
    str     q30, [x0, x2]
    add     x0, x0, 16
    cmp     x0, 1024

but can just be:

    ldp     q30, q31, [x0], 32
    asrd    z31.b, p7/m, z31.b, #2
    asrd    z30.b, p7/m, z30.b, #2
    stp     q30, q31, [x1], 32
    cmp     x0, x2

This patch also adds the following overload:

    aarch64_ptrue_reg (machine_mode pred_mode, machine_mode data_mode)

Depending on the data mode, the function returns a predicate with the
appropriate bits set.

The patch was bootstrapped and regtested on aarch64-linux-gnu, with no
regressions.

gcc/ChangeLog:

    * config/aarch64/aarch64.cc (aarch64_ptrue_reg): New overload.
    * config/aarch64/aarch64-protos.h (aarch64_ptrue_reg): Likewise.
    * config/aarch64/aarch64-sve.md: Extended sdiv_pow2<mode>3
    and *sdiv_pow2<mode>3 to support Neon modes.

gcc/testsuite/ChangeLog:

    * gcc.target/aarch64/sve/sve-asrd.c: New test.

Co-authored-by: Richard Sandiford <[email protected]>
Signed-off-by: Soumya AR <[email protected]>
1 parent 65b7c8d commit e5569a2

File tree

4 files changed

+115
-12
lines changed

4 files changed

+115
-12
lines changed

gcc/config/aarch64/aarch64-protos.h

+1
Original file line numberDiff line numberDiff line change
@@ -1018,6 +1018,7 @@ void aarch64_expand_mov_immediate (rtx, rtx);
10181018
rtx aarch64_stack_protect_canary_mem (machine_mode, rtx, aarch64_salt_type);
10191019
rtx aarch64_ptrue_reg (machine_mode);
10201020
rtx aarch64_ptrue_reg (machine_mode, unsigned int);
1021+
rtx aarch64_ptrue_reg (machine_mode, machine_mode);
10211022
rtx aarch64_pfalse_reg (machine_mode);
10221023
bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
10231024
void aarch64_emit_sve_pred_move (rtx, rtx, rtx);

gcc/config/aarch64/aarch64-sve.md

+12-12
Original file line numberDiff line numberDiff line change
@@ -5009,34 +5009,34 @@
50095009

50105010
;; Unpredicated ASRD.
50115011
(define_expand "sdiv_pow2<mode>3"
5012-
[(set (match_operand:SVE_I 0 "register_operand")
5013-
(unspec:SVE_I
5012+
[(set (match_operand:SVE_VDQ_I 0 "register_operand")
5013+
(unspec:SVE_VDQ_I
50145014
[(match_dup 3)
5015-
(unspec:SVE_I
5016-
[(match_operand:SVE_I 1 "register_operand")
5015+
(unspec:SVE_VDQ_I
5016+
[(match_operand:SVE_VDQ_I 1 "register_operand")
50175017
(match_operand 2 "aarch64_simd_rshift_imm")]
50185018
UNSPEC_ASRD)]
50195019
UNSPEC_PRED_X))]
50205020
"TARGET_SVE"
50215021
{
5022-
operands[3] = aarch64_ptrue_reg (<VPRED>mode);
5022+
operands[3] = aarch64_ptrue_reg (<VPRED>mode, <MODE>mode);
50235023
}
50245024
)
50255025

50265026
;; Predicated ASRD.
50275027
(define_insn "*sdiv_pow2<mode>3"
5028-
[(set (match_operand:SVE_I 0 "register_operand")
5029-
(unspec:SVE_I
5028+
[(set (match_operand:SVE_VDQ_I 0 "register_operand")
5029+
(unspec:SVE_VDQ_I
50305030
[(match_operand:<VPRED> 1 "register_operand")
5031-
(unspec:SVE_I
5032-
[(match_operand:SVE_I 2 "register_operand")
5033-
(match_operand:SVE_I 3 "aarch64_simd_rshift_imm")]
5031+
(unspec:SVE_VDQ_I
5032+
[(match_operand:SVE_VDQ_I 2 "register_operand")
5033+
(match_operand:SVE_VDQ_I 3 "aarch64_simd_rshift_imm")]
50345034
UNSPEC_ASRD)]
50355035
UNSPEC_PRED_X))]
50365036
"TARGET_SVE"
50375037
{@ [ cons: =0 , 1 , 2 ; attrs: movprfx ]
5038-
[ w , Upl , 0 ; * ] asrd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3
5039-
[ ?&w , Upl , w ; yes ] movprfx\t%0, %2\;asrd\t%0.<Vetype>, %1/m, %0.<Vetype>, #%3
5038+
[ w , Upl , 0 ; * ] asrd\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, #%3
5039+
[ ?&w , Upl , w ; yes ] movprfx\t%Z0, %Z2\;asrd\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, #%3
50405040
}
50415041
)
50425042

gcc/config/aarch64/aarch64.cc

+16
Original file line numberDiff line numberDiff line change
@@ -3778,6 +3778,22 @@ aarch64_ptrue_reg (machine_mode mode, unsigned int vl)
37783778
return gen_lowpart (mode, reg);
37793779
}
37803780

3781+
/* Return a register of mode PRED_MODE for controlling data of mode DATA_MODE.
3782+
3783+
DATA_MODE can be a scalar, an Advanced SIMD vector, or an SVE vector.
3784+
If it's an N-byte scalar or an Advanced SIMD vector, the first N bits
3785+
of the predicate will be active and the rest will be inactive.
3786+
If DATA_MODE is an SVE mode, every bit of the predicate will be active.

This overload only selects between the two other aarch64_ptrue_reg
overloads: the one-argument form for SVE data modes, and the
(mode, unsigned int) form, passed the size of DATA_MODE, otherwise.  */
3787+
rtx
3788+
aarch64_ptrue_reg (machine_mode pred_mode, machine_mode data_mode)
3789+
{
3790+
/* SVE data: defer to the single-argument overload.  */
if (aarch64_sve_mode_p (data_mode))
3791+
return aarch64_ptrue_reg (pred_mode);
3792+
3793+
/* Non-SVE data: use the size of DATA_MODE as the second argument.
   NOTE(review): to_constant presumably requires the size to be a
   compile-time constant, which should hold for scalar and Advanced SIMD
   modes -- confirm.  */
auto size = GET_MODE_SIZE (data_mode).to_constant ();
3794+
return aarch64_ptrue_reg (pred_mode, size);
3795+
}
3796+
37813797
/* Return an all-false predicate register of mode MODE. */
37823798

37833799
rtx
gcc/testsuite/gcc.target/aarch64/sve/sve-asrd.c

+86
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/* { dg-do compile } */
2+
/* { dg-options "-Ofast --param aarch64-autovec-preference=asimd-only" } */
3+
/* { dg-final { check-function-bodies "**" "" "" } } */
4+
5+
#include <stdint.h>
6+
7+
/* Define a global array M_<TYPE>_<I> holding I elements of TYPE, plus a
   function asrd_<TYPE>_<I> that divides each element of that array by 4
   in place.  */
#define FUNC(TYPE, I) \
8+
TYPE M_##TYPE##_##I[I]; \
9+
void asrd_##TYPE##_##I () \
10+
{ \
11+
for (int i = 0; i < I; i++) \
12+
{ \
13+
M_##TYPE##_##I[i] /= 4; \
14+
} \
15+
}
16+
17+
/*
18+
** asrd_int8_t_8:
19+
** ...
20+
** ptrue (p[0-7]).b, vl8
21+
** ...
22+
** asrd z[0-9]+\.b, \1/m, z[0-9]+\.b, #2
23+
** ...
24+
*/
25+
FUNC(int8_t, 8);
26+
27+
/*
28+
** asrd_int8_t_16:
29+
** ...
30+
** ptrue (p[0-7]).b, vl16
31+
** ...
32+
** asrd z[0-9]+\.b, \1/m, z[0-9]+\.b, #2
33+
** ...
34+
*/
35+
FUNC(int8_t, 16);
36+
37+
/*
38+
** asrd_int16_t_4:
39+
** ...
40+
** ptrue (p[0-7]).b, vl8
41+
** ...
42+
** asrd z[0-9]+\.h, \1/m, z[0-9]+\.h, #2
43+
** ...
44+
*/
45+
FUNC(int16_t, 4);
46+
47+
/*
48+
** asrd_int16_t_8:
49+
** ...
50+
** ptrue (p[0-7]).b, vl16
51+
** ...
52+
** asrd z[0-9]+\.h, \1/m, z[0-9]+\.h, #2
53+
** ...
54+
*/
55+
FUNC(int16_t, 8);
56+
57+
/*
58+
** asrd_int32_t_2:
59+
** ...
60+
** ptrue (p[0-7]).b, vl8
61+
** ...
62+
** asrd z[0-9]+\.s, \1/m, z[0-9]+\.s, #2
63+
** ...
64+
*/
65+
FUNC(int32_t, 2);
66+
67+
/*
68+
** asrd_int32_t_4:
69+
** ...
70+
** ptrue (p[0-7]).b, vl16
71+
** ...
72+
** asrd z[0-9]+\.s, \1/m, z[0-9]+\.s, #2
73+
** ...
74+
*/
75+
FUNC(int32_t, 4);
76+
77+
/*
78+
** asrd_int64_t_2:
79+
** ...
80+
** ptrue (p[0-7]).b, vl16
81+
** ...
82+
** asrd z[0-9]+\.d, \1/m, z[0-9]+\.d, #2
83+
** ...
84+
*/
85+
FUNC(int64_t, 2);
86+

0 commit comments

Comments
 (0)