Skip to content

Commit 3f87141

Browse files
mdcornutkanteck
authored and committed
erasure_code: optimize AVX2 GFNI single vector dot product
Signed-off-by: Marcel Cornu <[email protected]>
1 parent 164d9ff commit 3f87141

File tree

1 file changed

+166
-64
lines changed

1 file changed

+166
-64
lines changed

erasure_code/gf_vect_dot_prod_avx2_gfni.asm

Lines changed: 166 additions & 64 deletions
Original file line numberDiff line numberDiff line change
%define tmp   r11
%define tmp2  r10
%define tmp3  r12                       ; must be saved and restored

%define stack_size 1*8                  ; room for the one callee-saved GPR

%define func(x) x: endbranch

;; SysV: spill r12 (the only callee-saved register this function touches).
;; Leaf function, so no 16-byte call-site realignment is needed.
%macro FUNC_SAVE 0
        sub     rsp, stack_size
        mov     [rsp + 0*8], r12
%endmacro

;; Restore r12 and release the frame allocated by FUNC_SAVE.
%macro FUNC_RESTORE 0
        mov     r12, [rsp + 0*8]
        add     rsp, stack_size
%endmacro
%endif
6363

%define arg2  r8
%define arg3  r9

%define arg4  r12                       ; must be saved, loaded and restored
%define arg5  r15                       ; must be saved and restored
%define tmp   r11
%define tmp2  r10
%define tmp3  r13                       ; must be saved and restored
%define stack_size 4*16 + 3*8           ; must be an odd multiple of 8
%define arg(x) [rsp + stack_size + 8 + 8*x]

%define func(x) proc_frame x

;; Win64: xmm6-xmm9 and r12/r13/r15 are callee-saved, so spill them to
;; the frame, close the SEH prologue, then load the 5th argument (passed
;; on the stack under the Microsoft x64 convention).
%macro FUNC_SAVE 0
        alloc_stack     stack_size
        vmovdqa [rsp + 0*16], xmm6
        vmovdqa [rsp + 1*16], xmm7
        vmovdqa [rsp + 2*16], xmm8
        vmovdqa [rsp + 3*16], xmm9
        mov     [rsp + 4*16 + 0*8], r12
        mov     [rsp + 4*16 + 1*8], r13
        mov     [rsp + 4*16 + 2*8], r15
        end_prolog
        mov     arg4, arg(4)
%endmacro

;; Restore every register saved by FUNC_SAVE and release the frame.
%macro FUNC_RESTORE 0
        vmovdqa xmm6, [rsp + 0*16]
        vmovdqa xmm7, [rsp + 1*16]
        vmovdqa xmm8, [rsp + 2*16]
        vmovdqa xmm9, [rsp + 3*16]
        mov     r12, [rsp + 4*16 + 0*8]
        mov     r13, [rsp + 4*16 + 1*8]
        mov     r15, [rsp + 4*16 + 2*8]
        add     rsp, stack_size
%endmacro
%endif
98103

@@ -106,7 +111,6 @@
106111
%define vec_i tmp2
107112
%define pos rax
108113

109-
110114
%ifndef EC_ALIGNED_ADDR
111115
;;; Use Un-aligned load/store
112116
%define XLDR vmovdqu
@@ -122,95 +126,193 @@
%endif
%endif

;; ymm register roles for the encode macros below.
%define x0l   ymm0                      ; source vector, bytes  0..31
%define x0h   ymm1                      ; source vector, bytes 32..63
%define x0x   ymm2                      ; source vector, bytes 64..95

%define xp1l  ymm3                      ; parity accumulator, bytes  0..31
%define xp1h  ymm4                      ; parity accumulator, bytes 32..63
%define xp1x  ymm5                      ; parity accumulator, bytes 64..95

%define xgft1 ymm6                      ; broadcast GF affine constant
%define xgft2 ymm7
%define xgft3 ymm8

%define xtmp1 ymm9                      ; scratch for GF_MUL_XOR

;; single-lane aliases used by the 32-byte and tail paths
%define x0    x0l
%define xp1   xp1l
%define xp2   xp2l                      ; NOTE(review): xp2l/xp3l are never defined
%define xp3   xp3l                      ; here — look like multi-vector leftovers; confirm

default rel
[bits 64]

section .text
;;
;; Encodes 96 bytes of all "k" sources into 96 bytes (single parity disk)
;;
%macro ENCODE_96B 0
        ;; clear the three 32-byte parity accumulators
        vpxor   xp1l, xp1l, xp1l
        vpxor   xp1h, xp1h, xp1h
        vpxor   xp1x, xp1x, xp1x
        mov     tmp, mul_array
        xor     vec_i, vec_i

%%next_vect:
        ;; load the next source vector (3 x 32 bytes)
        mov     ptr, [src + vec_i]
        XLDR    x0l, [ptr + pos]
        XLDR    x0h, [ptr + pos + 32]
        XLDR    x0x, [ptr + pos + 64]
        add     vec_i, 8

        ;; broadcast this source's 8-byte GF constant to all lanes
        vbroadcastsd xgft1, [tmp]
        add     tmp, 8

        ;; parity ^= gf_mul(const, src) for each 32-byte lane
        GF_MUL_XOR VEX, x0l, xgft1, xtmp1, xp1l
        GF_MUL_XOR VEX, x0h, xgft1, xtmp1, xp1h
        GF_MUL_XOR VEX, x0x, xgft1, xtmp1, xp1x

        cmp     vec_i, vec
        jl      %%next_vect

        XSTR    [dest1 + pos], xp1l
        XSTR    [dest1 + pos + 32], xp1h
        XSTR    [dest1 + pos + 64], xp1x
%endmacro
185+
186+
;;
187+
;; Encodes 64 bytes of all "k" sources into 64 bytes (single parity disk)
188+
;;
189+
%macro ENCODE_64B 0
190+
vpxor xp1l, xp1l, xp1l
191+
vpxor xp1h, xp1h, xp1h
192+
mov tmp, mul_array
193+
xor vec_i, vec_i
194+
195+
%%next_vect:
196+
;; load next source vector
197+
mov ptr, [src + vec_i]
198+
XLDR x0l, [ptr + pos]
199+
XLDR x0h, [ptr + pos + 32]
200+
add vec_i, 8
201+
202+
vbroadcastsd xgft1, [tmp]
203+
add tmp, 8
204+
205+
GF_MUL_XOR VEX, x0l, xgft1, xtmp1, xp1l
206+
GF_MUL_XOR VEX, x0h, xgft1, xgft1, xp1h
207+
208+
cmp vec_i, vec
209+
jl %%next_vect
210+
211+
XSTR [dest1 + pos], xp1l
212+
XSTR [dest1 + pos + 32], xp1h
213+
%endmacro
;;
;; Encodes 32 bytes of all "k" sources into 32 bytes (single parity disk)
;;
%macro ENCODE_32B 0
        vpxor   xp1, xp1, xp1           ; clear parity accumulator
        mov     tmp, mul_array
        xor     vec_i, vec_i

%%next_vect:
        ;; load the next source vector
        mov     ptr, [src + vec_i]
        XLDR    x0, [ptr + pos]
        add     vec_i, 8

        ;; broadcast this source's 8-byte GF constant to all lanes
        vbroadcastsd xgft1, [tmp]
        add     tmp, 8

        ;; xgft1 doubles as scratch: its only use is above, and it is
        ;; reloaded on every iteration
        GF_MUL_XOR VEX, x0, xgft1, xgft1, xp1

        cmp     vec_i, vec
        jl      %%next_vect

        XSTR    [dest1 + pos], xp1
%endmacro
;;
;; Encodes less than 32 bytes of all "k" sources into a single parity disk
;;
%macro ENCODE_LT_32B 1
%define %%LEN   %1

        vpxor   xp1, xp1, xp1           ; clear parity accumulator
        xor     vec_i, vec_i

%%next_vect:
        ;; partial-width load of the next source vector
        mov     ptr, [src + vec_i]
        simd_load_avx2 x0, ptr + pos, %%LEN, tmp, tmp3
        add     vec_i, 8

        ;; mul_array is advanced in place (tail path only runs once);
        ;; tmp/tmp3 are consumed as simd_load/store scratch here
        vbroadcastsd xgft1, [mul_array]
        add     mul_array, 8

        GF_MUL_XOR VEX, x0, xgft1, xgft1, xp1

        cmp     vec_i, vec
        jl      %%next_vect

        ;; store the updated encoded data (partial width)
        lea     ptr, [dest1 + pos]
        simd_store_avx2 ptr, xp1, %%LEN, tmp, vec_i
%endmacro
;;-----------------------------------------------------------------------
;; gf_vect_dot_prod_avx2_gfni
;; GF(2^8) vector dot product into a single parity disk:
;;   dest1[i] ^= gf_mul(gftbl[k], src[k][i]) for each of the "vec" sources.
;; Processes 96 bytes per main-loop iteration, then 64-, 32-byte and
;; partial-width tail steps.  Arg aliases (len/vec/mul_array/src/dest1)
;; are defined earlier in this file per-ABI — presumably the standard
;; isa-l argument order; confirm against the full header.
;;-----------------------------------------------------------------------
align 16
mk_global gf_vect_dot_prod_avx2_gfni, function
func(gf_vect_dot_prod_avx2_gfni)
        FUNC_SAVE

        xor     pos, pos
        shl     vec, 3                  ;; vec *= 8. Make vec_i count by 8

        cmp     len, 96
        jl      .len_lt_96

.loop96:
        ENCODE_96B                      ;; loop on 96 bytes at a time first

        add     pos, 96
        sub     len, 96
        cmp     len, 96
        jge     .loop96

.len_lt_96:
        cmp     len, 64
        jl      .len_lt_64

        ENCODE_64B                      ;; encode next 64 bytes

        add     pos, 64
        sub     len, 64

.len_lt_64:
        cmp     len, 32
        jl      .len_lt_32

        ENCODE_32B                      ;; encode next 32 bytes

        add     pos, 32
        sub     len, 32

.len_lt_32:
        cmp     len, 0
        jle     .exit

        ENCODE_LT_32B len               ;; encode final bytes

.exit:
        vzeroupper                      ;; avoid AVX->SSE transition penalty in caller

        FUNC_RESTORE
        ret

endproc_frame
%endif                                  ; if AS_FEATURE_LEVEL >= 10

0 commit comments

Comments
 (0)