
Commit 164d9ff

mdcornu authored and tkanteck committed
erasure_code: add 2 vector AVX2 dot product with GFNI implementation
Signed-off-by: Marcel Cornu <[email protected]>
1 parent f827464 commit 164d9ff

File tree

4 files changed: +314 -5 lines


Makefile.nmake

Lines changed: 1 addition & 0 deletions

@@ -93,6 +93,7 @@ objs = \
 	bin\gf_6vect_dot_prod_avx512.obj \
 	bin\gf_vect_dot_prod_avx512_gfni.obj \
 	bin\gf_vect_dot_prod_avx2_gfni.obj \
+	bin\gf_2vect_dot_prod_avx2_gfni.obj \
 	bin\gf_3vect_dot_prod_avx2_gfni.obj \
 	bin\gf_2vect_dot_prod_avx512_gfni.obj \
 	bin\gf_3vect_dot_prod_avx512_gfni.obj \

erasure_code/Makefile.am

Lines changed: 1 addition & 0 deletions

@@ -86,6 +86,7 @@ lsrc_x86_64 += \
 	erasure_code/gf_6vect_dot_prod_avx512.asm \
 	erasure_code/gf_vect_dot_prod_avx512_gfni.asm \
 	erasure_code/gf_vect_dot_prod_avx2_gfni.asm \
+	erasure_code/gf_2vect_dot_prod_avx2_gfni.asm \
 	erasure_code/gf_3vect_dot_prod_avx2_gfni.asm \
 	erasure_code/gf_2vect_dot_prod_avx512_gfni.asm \
 	erasure_code/gf_3vect_dot_prod_avx512_gfni.asm \

erasure_code/ec_highlevel_func.c

Lines changed: 11 additions & 5 deletions

@@ -269,6 +269,8 @@ extern void gf_6vect_mad_avx512_gfni(int len, int vec, int vec_i, unsigned char
 
 extern void gf_vect_dot_prod_avx2_gfni(int len, int k, unsigned char *g_tbls,
                                        unsigned char **data, unsigned char *dest);
+extern void gf_2vect_dot_prod_avx2_gfni(int len, int k, unsigned char *g_tbls,
+                                        unsigned char **data, unsigned char **coding);
 extern void gf_3vect_dot_prod_avx2_gfni(int len, int k, unsigned char *g_tbls,
                                         unsigned char **data, unsigned char **coding);
 extern void gf_vect_mad_avx2_gfni(int len, int vec, int vec_i, unsigned char *gftbls,
@@ -327,13 +329,17 @@ void ec_encode_data_avx2_gfni(int len, int k, int rows, unsigned char *g_tbls,
                coding += 3;
                rows -= 3;
        }
-       while (rows) {
+       switch (rows) {
+       case 2:
+               gf_2vect_dot_prod_avx2_gfni(len, k, g_tbls, data, coding);
+               break;
+       case 1:
                gf_vect_dot_prod_avx2_gfni(len, k, g_tbls, data, *coding);
-               g_tbls += k * 8;
-               coding++;
-               rows--;
+               break;
+       case 0:
+       default:
+               break;
        }
-
 }
 
 void ec_encode_data_update_avx512_gfni(int len, int k, int rows, int vec_i,
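
The control-flow change above is worth a note: the loop preceding this hunk emits three parity outputs per pass, so by the time it falls through, rows can only be 0, 1, or 2. The old while loop handled the remainder one output at a time and had to re-advance g_tbls and coding after each call; the switch dispatches the 2-output remainder to the fused kernel added by this commit, which reads the second table set itself at g_tbls + k*8. A paraphrased C sketch of the resulting shape (based on the diff above, not the verbatim isa-l source; the g_tbls stride in the loop is inferred from the 8-byte-per-entry GFNI table layout):

/* Externs as declared in ec_highlevel_func.c (see hunk above). */
extern void gf_vect_dot_prod_avx2_gfni(int len, int k, unsigned char *g_tbls,
                                       unsigned char **data, unsigned char *dest);
extern void gf_2vect_dot_prod_avx2_gfni(int len, int k, unsigned char *g_tbls,
                                        unsigned char **data, unsigned char **coding);
extern void gf_3vect_dot_prod_avx2_gfni(int len, int k, unsigned char *g_tbls,
                                        unsigned char **data, unsigned char **coding);

/* Paraphrased tail of ec_encode_data_avx2_gfni after this patch. */
static void encode_tail_sketch(int len, int k, int rows, unsigned char *g_tbls,
                               unsigned char **data, unsigned char **coding)
{
        while (rows >= 3) {             /* three parity outputs per pass */
                gf_3vect_dot_prod_avx2_gfni(len, k, g_tbls, data, coding);
                g_tbls += 3 * k * 8;    /* 8-byte GFNI table per (output, source) */
                coding += 3;
                rows -= 3;
        }
        switch (rows) {                 /* rows is now 0, 1 or 2 */
        case 2:
                gf_2vect_dot_prod_avx2_gfni(len, k, g_tbls, data, coding);
                break;
        case 1:
                gf_vect_dot_prod_avx2_gfni(len, k, g_tbls, data, *coding);
                break;
        default:
                break;
        }
}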
erasure_code/gf_2vect_dot_prod_avx2_gfni.asm

Lines changed: 301 additions & 0 deletions

@@ -0,0 +1,301 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2023 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;   * Redistributions of source code must retain the above copyright
;     notice, this list of conditions and the following disclaimer.
;   * Redistributions in binary form must reproduce the above copyright
;     notice, this list of conditions and the following disclaimer in
;     the documentation and/or other materials provided with the
;     distribution.
;   * Neither the name of Intel Corporation nor the names of its
;     contributors may be used to endorse or promote products derived
;     from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_2vect_dot_prod_avx2_gfni(len, vec, *g_tbls, **buffs, **dests);
;;;
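For orientation, here is a C-side usage sketch of the routine the comment above describes. The declaration is taken from the ec_highlevel_func.c hunk earlier in this commit; the buffer sizes and the table-initialization step are illustrative assumptions, not part of this change:

extern void gf_2vect_dot_prod_avx2_gfni(int len, int k, unsigned char *g_tbls,
                                        unsigned char **data, unsigned char **coding);

enum { LEN = 4096, K = 10, P = 2 };
static unsigned char src[K][LEN], par[P][LEN];
static unsigned char g_tbls[P * K * 8]; /* one 8-byte GFNI affine matrix per
                                         * (output, source) pair */

static void example(void)
{
        unsigned char *data[K], *coding[P];

        for (int i = 0; i < K; i++)
                data[i] = src[i];
        for (int i = 0; i < P; i++)
                coding[i] = par[i];

        /* g_tbls must be filled from the erasure-code generator matrix
         * first; the exact init helper is assumed, not shown here. */
        gf_2vect_dot_prod_avx2_gfni(LEN, K, g_tbls, data, coding);
}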
%include "reg_sizes.asm"
%include "gf_vect_gfni.inc"
%include "memcpy.asm"

%if AS_FEATURE_LEVEL >= 10

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9

 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13     ; must be saved and restored
 %define tmp4   r12     ; must be saved and restored
 %define tmp5   r14     ; must be saved and restored

 %define stack_size  3*8
 %define func(x) x: endbranch
 %macro FUNC_SAVE 0
        sub     rsp, stack_size
        mov     [rsp + 0*8], r12
        mov     [rsp + 1*8], r13
        mov     [rsp + 2*8], r14
 %endmacro
 %macro FUNC_RESTORE 0
        mov     r12, [rsp + 0*8]
        mov     r13, [rsp + 1*8]
        mov     r14, [rsp + 2*8]
        add     rsp, stack_size
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9

 %define arg4   r12     ; must be saved, loaded and restored
 %define arg5   r15     ; must be saved and restored
 %define tmp    r11
 %define tmp2   r10
 %define tmp3   r13     ; must be saved and restored
 %define tmp4   r14     ; must be saved and restored
 %define tmp5   rdi     ; must be saved and restored
 %define stack_size  4*16 + 5*8         ; must be an odd multiple of 8
 %define arg(x)  [rsp + stack_size + 8 + 8*x]

 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
        alloc_stack     stack_size
        vmovdqa [rsp + 0*16], xmm6
        vmovdqa [rsp + 1*16], xmm7
        vmovdqa [rsp + 2*16], xmm8
        vmovdqa [rsp + 3*16], xmm9
        mov     [rsp + 4*16 + 0*8], r12
        mov     [rsp + 4*16 + 1*8], r13
        mov     [rsp + 4*16 + 2*8], r14
        mov     [rsp + 4*16 + 3*8], r15
        mov     [rsp + 4*16 + 4*8], rdi
        end_prolog
        mov     arg4, arg(4)
 %endmacro

 %macro FUNC_RESTORE 0
        vmovdqa xmm6, [rsp + 0*16]
        vmovdqa xmm7, [rsp + 1*16]
        vmovdqa xmm8, [rsp + 2*16]
        vmovdqa xmm9, [rsp + 3*16]
        mov     r12, [rsp + 4*16 + 0*8]
        mov     r13, [rsp + 4*16 + 1*8]
        mov     r14, [rsp + 4*16 + 2*8]
        mov     r15, [rsp + 4*16 + 3*8]
        mov     rdi, [rsp + 4*16 + 4*8]
        add     rsp, stack_size
 %endmacro
%endif

%define len       arg0
%define vec       arg1
%define mul_array arg2
%define src       arg3
%define dest      arg4
%define ptr       arg5
%define vec_i     tmp2
%define dest2     tmp3
%define dest1     tmp5
%define pos       rax

%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
 %define XLDR vmovdqu
 %define XSTR vmovdqu
%else
;;; Use Non-temporal load/store
 %ifdef NO_NT_LDST
  %define XLDR vmovdqa
  %define XSTR vmovdqa
 %else
  %define XLDR vmovntdqa
  %define XSTR vmovntdq
 %endif
%endif

%define x0l ymm0
%define x0h ymm1

%define xgft1 ymm2
%define xgft2 ymm3

%define xtmp1 ymm4
%define xtmp2 ymm5

%define xp1l ymm6
%define xp2l ymm7
%define xp1h ymm8
%define xp2h ymm9

%define x0  x0l
%define xp1 xp1l
%define xp2 xp2l

default rel
[bits 64]

section .text

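The XLDR/XSTR defines follow the convention used across isa-l kernels: unaligned vmovdqu by default; with EC_ALIGNED_ADDR, either plain aligned moves (when NO_NT_LDST is also set) or non-temporal moves that bypass the cache. A rough C-intrinsics analogue of the three choices (illustrative only; the real selection happens at assembly time):

#include <immintrin.h>

static inline __m256i xldr_unaligned(const void *p)    /* vmovdqu   */
{ return _mm256_loadu_si256((const __m256i *) p); }

static inline __m256i xldr_aligned(const void *p)      /* vmovdqa   */
{ return _mm256_load_si256((const __m256i *) p); }

static inline __m256i xldr_nontemporal(const void *p)  /* vmovntdqa */
{ return _mm256_stream_load_si256((const __m256i *) p); }

static inline void xstr_nontemporal(void *p, __m256i v) /* vmovntdq */
{ _mm256_stream_si256((__m256i *) p, v); }
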
;;
;; Encodes 64 bytes of all "k" sources into 2x 64 bytes (parity disks)
;;
%macro ENCODE_64B_2 0
        vpxor   xp1l, xp1l, xp1l
        vpxor   xp1h, xp1h, xp1h
        vpxor   xp2l, xp2l, xp2l
        vpxor   xp2h, xp2h, xp2h
        mov     tmp, mul_array
        xor     vec_i, vec_i

%%next_vect:
        mov     ptr, [src + vec_i]
        XLDR    x0l, [ptr + pos]        ;; Get next source vector low 32 bytes
        XLDR    x0h, [ptr + pos + 32]   ;; Get next source vector high 32 bytes
        add     vec_i, 8

        vbroadcastsd    xgft1, [tmp]
        vbroadcastsd    xgft2, [tmp + vec]
        add     tmp, 8

        GF_MUL_XOR      VEX, x0l, xgft1, xtmp1, xp1l, xgft2, xtmp2, xp2l
        GF_MUL_XOR      VEX, x0h, xgft1, xgft1, xp1h, xgft2, xgft2, xp2h

        cmp     vec_i, vec
        jl      %%next_vect

        XSTR    [dest1 + pos], xp1l
        XSTR    [dest1 + pos + 32], xp1h
        XSTR    [dest2 + pos], xp2l
        XSTR    [dest2 + pos + 32], xp2h
%endmacro

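GF_MUL_XOR comes from gf_vect_gfni.inc: each vbroadcastsd above replicates one 8-byte entry of g_tbls, which encodes multiplication by one GF(2^8) coefficient as an 8x8 bit matrix, and the macro applies it with VGF2P8AFFINEQB before XORing the product into the parity accumulator. A hedged intrinsics sketch of one such multiply-accumulate step (the helper name is illustrative; requires GFNI and AVX2, e.g. compile with -mgfni -mavx2):

#include <immintrin.h>
#include <stdint.h>

/* One GF(2^8) multiply-accumulate, as performed by GF_MUL_XOR above. */
static inline __m256i gf_mul_xor_32b(__m256i acc, __m256i src, uint64_t tbl)
{
        /* Broadcast the 8-byte affine matrix (one g_tbls entry), multiply
         * all 32 source bytes by the coefficient, XOR into the parity. */
        __m256i m = _mm256_set1_epi64x((long long) tbl);
        return _mm256_xor_si256(acc, _mm256_gf2p8affine_epi64_epi8(src, m, 0));
}
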
;;
;; Encodes 32 bytes of all "k" sources into 2x 32 bytes (parity disks)
;;
%macro ENCODE_32B_2 0
        vpxor   xp1, xp1, xp1
        vpxor   xp2, xp2, xp2
        mov     tmp, mul_array
        xor     vec_i, vec_i

%%next_vect:
        mov     ptr, [src + vec_i]
        XLDR    x0, [ptr + pos]         ;Get next source vector (32 bytes)
        add     vec_i, 8

        vbroadcastsd    xgft1, [tmp]
        vbroadcastsd    xgft2, [tmp + vec]
        add     tmp, 8

        GF_MUL_XOR      VEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2

        cmp     vec_i, vec
        jl      %%next_vect

        XSTR    [dest1 + pos], xp1
        XSTR    [dest2 + pos], xp2
%endmacro

;;
;; Encodes less than 32 bytes of all "k" sources into 2 parity disks
;;
%macro ENCODE_LT_32B_2 1
%define %%LEN   %1

        vpxor   xp1, xp1, xp1
        vpxor   xp2, xp2, xp2
        xor     vec_i, vec_i

%%next_vect:
        mov     ptr, [src + vec_i]
        simd_load_avx2  x0, ptr + pos, %%LEN, tmp, tmp4 ;Get next source vector
        add     vec_i, 8

        vbroadcastsd    xgft1, [mul_array]
        vbroadcastsd    xgft2, [mul_array + vec]
        add     mul_array, 8

        GF_MUL_XOR      VEX, x0, xgft1, xgft1, xp1, xgft2, xgft2, xp2

        cmp     vec_i, vec
        jl      %%next_vect

        ;Store updated encoded data
        lea     ptr, [dest1 + pos]
        simd_store_avx2 ptr, xp1, %%LEN, tmp, vec_i

        lea     ptr, [dest2 + pos]
        simd_store_avx2 ptr, xp2, %%LEN, tmp, vec_i
%endmacro

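simd_load_avx2 and simd_store_avx2 (from memcpy.asm) move the sub-32-byte remainder through a ymm register without touching memory past the end of the buffer. A C sketch of the same tail-handling idea, using a bounce buffer where the asm helpers instead stitch together smaller loads (illustrative, not the helpers' actual method):

#include <immintrin.h>
#include <string.h>

/* Safe partial-vector load/store for a tail of len < 32 bytes. */
static inline __m256i load_tail(const unsigned char *p, size_t len)
{
        unsigned char buf[32] = {0};
        memcpy(buf, p, len);                    /* never reads past p + len */
        return _mm256_loadu_si256((const __m256i *) buf);
}

static inline void store_tail(unsigned char *p, __m256i v, size_t len)
{
        unsigned char buf[32];
        _mm256_storeu_si256((__m256i *) buf, v);
        memcpy(p, buf, len);                    /* never writes past p + len */
}
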
align 16
mk_global gf_2vect_dot_prod_avx2_gfni, function
func(gf_2vect_dot_prod_avx2_gfni)
        FUNC_SAVE

        xor     pos, pos
        shl     vec, 3          ;; vec *= 8. Make vec_i count by 8
        mov     dest1, [dest]
        mov     dest2, [dest + 8]

        cmp     len, 64
        jb      .len_lt_64

.loop64:
        ENCODE_64B_2

        add     pos, 64         ;; Loop on 64 bytes at a time first
        sub     len, 64
        cmp     len, 64
        jge     .loop64

.len_lt_64:
        cmp     len, 32
        jb      .len_lt_32

        ENCODE_32B_2

        add     pos, 32         ;; encode next 32 bytes
        sub     len, 32

.len_lt_32:
        cmp     len, 0
        jle     .exit

        ENCODE_LT_32B_2 len     ;; encode remaining bytes

.exit:
        vzeroupper

        FUNC_RESTORE
        ret

endproc_frame
%endif  ; if AS_FEATURE_LEVEL >= 10
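
Reading the labels above as structured control flow: .loop64 consumes 64 bytes per iteration while at least 64 remain, a single 32-byte pass handles the next chunk, and the masked path takes whatever is left. A C paraphrase of that shape (the encode_* callbacks stand in for the ENCODE_* macros; illustrative only):

static void dot_prod_shape(int len,
                           void (*encode_64b)(int pos),
                           void (*encode_32b)(int pos),
                           void (*encode_tail)(int pos, int len))
{
        int pos = 0;

        while (len >= 64) {             /* .loop64 */
                encode_64b(pos);
                pos += 64;
                len -= 64;
        }
        if (len >= 32) {                /* .len_lt_64 path */
                encode_32b(pos);
                pos += 32;
                len -= 32;
        }
        if (len > 0)                    /* .len_lt_32 path */
                encode_tail(pos, len);
}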
