47 | 47 |
48 | 48 | %define tmp r11 |
49 | 49 | %define tmp2 r10 |
50 | | - %define tmp3 r12 ;must be saved and restored |
51 | | - %define tmp4 r13 ;must be saved and restored |
| 50 | + %define tmp3 r12 ; must be saved and restored |
52 | 51 |
| 52 | + %define stack_size 1*8 |
53 | 53 | %define func(x) x: endbranch |
54 | 54 | %macro FUNC_SAVE 0 |
55 | | - push r12 |
56 | | - push r13 |
| 55 | + sub rsp, stack_size |
| 56 | + mov [rsp + 0*8], r12 |
57 | 57 | %endmacro |
58 | 58 | %macro FUNC_RESTORE 0 |
59 | | - pop r13 |
60 | | - pop r12 |
| 59 | + mov r12, [rsp + 0*8] |
| 60 | + add rsp, stack_size |
61 | 61 | %endmacro |
62 | 62 | %endif |
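Note on the ELF hunk above: r13 is no longer needed on Linux, so only r12 (callee-saved under the System V ABI) has to be preserved, and the push/pop pair becomes a single stack adjustment plus a spill, matching the style of the win64 path below. The net effect at entry/exit is:

    sub  rsp, 8            ; stack_size = 1*8
    mov  [rsp + 0*8], r12  ; spill the one callee-saved register in use
    ...
    mov  r12, [rsp + 0*8]  ; restore
    add  rsp, 8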
63 | 63 |
67 | 67 | %define arg2 r8 |
68 | 68 | %define arg3 r9 |
69 | 69 |
70 | | - %define arg4 r12 ; must be saved, loaded and restored |
71 | | - %define arg5 r13 ; must be saved and restored |
| 70 | + %define arg4 r12 ; must be saved, loaded and restored |
| 71 | + %define arg5 r15 ; must be saved and restored |
72 | 72 | %define tmp r11 |
73 | 73 | %define tmp2 r10 |
74 | | - %define tmp3 r14 |
75 | | - %define tmp4 r15 |
76 | | - %define stack_size 0*16 + 5*8 ; must be an odd multiple of 8 |
| 74 | + %define tmp3 r13 ; must be saved and restored |
| 75 | + %define stack_size 4*16 + 3*8 ; must be an odd multiple of 8 |
77 | 76 | %define arg(x) [rsp + stack_size + 8 + 8*x] |
78 | 77 |
79 | 78 | %define func(x) proc_frame x |
80 | 79 | %macro FUNC_SAVE 0 |
81 | | - alloc_stack stack_size |
82 | | - mov [rsp + 0*8], r12 |
83 | | - mov [rsp + 1*8], r13 |
84 | | - mov [rsp + 2*8], r14 |
85 | | - mov [rsp + 3*8], r15 |
86 | | - end_prolog |
87 | | - mov arg4, arg(4) |
| 80 | + alloc_stack stack_size |
| 81 | + vmovdqa [rsp + 0*16], xmm6 |
| 82 | + vmovdqa [rsp + 1*16], xmm7 |
| 83 | + vmovdqa [rsp + 2*16], xmm8 |
| 84 | + vmovdqa [rsp + 3*16], xmm9 |
| 85 | + mov [rsp + 4*16 + 0*8], r12 |
| 86 | + mov [rsp + 4*16 + 1*8], r13 |
| 87 | + mov [rsp + 4*16 + 2*8], r15 |
| 88 | + end_prolog |
| 89 | + mov arg4, arg(4) |
88 | 90 | %endmacro |
89 | 91 |
90 | 92 | %macro FUNC_RESTORE 0 |
91 | | - mov r12, [rsp + 0*8] |
92 | | - mov r13, [rsp + 1*8] |
93 | | - mov r14, [rsp + 2*8] |
94 | | - mov r15, [rsp + 3*8] |
95 | | - add rsp, stack_size |
| 93 | + vmovdqa xmm6, [rsp + 0*16] |
| 94 | + vmovdqa xmm7, [rsp + 1*16] |
| 95 | + vmovdqa xmm8, [rsp + 2*16] |
| 96 | + vmovdqa xmm9, [rsp + 3*16] |
| 97 | + mov r12, [rsp + 4*16 + 0*8] |
| 98 | + mov r13, [rsp + 4*16 + 1*8] |
| 99 | + mov r15, [rsp + 4*16 + 2*8] |
| 100 | + add rsp, stack_size |
96 | 101 | %endmacro |
97 | 102 | %endif |
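On win64 the rewrite widens the save area: the Win64 ABI treats xmm6-xmm15 as callee-saved, and the new macros below use ymm6-ymm9 (xgft1-xgft3 and xtmp1), which clobber the low 128 bits of xmm6-xmm9. Hence the four 16-byte vmovdqa spills ahead of the three GPR slots. stack_size = 4*16 + 3*8 = 88 = 11*8 is indeed an odd multiple of 8: rsp is 8 mod 16 on entry, so subtracting 88 leaves it 16-byte aligned, which vmovdqa requires. arg(4) = [rsp + stack_size + 8 + 32] then reaches past the return address and the caller's 32-byte shadow space to the fifth argument, which FUNC_SAVE copies into arg4 (r12) after end_prolog.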
98 | 103 |
106 | 111 | %define vec_i tmp2 |
107 | 112 | %define pos rax |
108 | 113 |
109 | | - |
110 | 114 | %ifndef EC_ALIGNED_ADDR |
111 | 115 | ;;; Use Un-aligned load/store |
112 | 116 | %define XLDR vmovdqu |
122 | 126 | %endif |
123 | 127 | %endif |
124 | 128 |
125 | | -%define xgft1 ymm2 |
| 129 | +%define x0l ymm0 |
| 130 | +%define x0h ymm1 |
| 131 | +%define x0x ymm2 |
| 132 | + |
| 133 | +%define xp1l ymm3 |
| 134 | +%define xp1h ymm4 |
| 135 | +%define xp1x ymm5 |
| 136 | + |
| 137 | +%define xgft1 ymm6 |
| 138 | +%define xgft2 ymm7 |
| 139 | +%define xgft3 ymm8 |
126 | 140 |
127 | | -%define x0 ymm0 |
128 | | -%define xp1 ymm1 |
| 141 | +%define xtmp1 ymm9 |
| 142 | + |
| 143 | +%define x0 x0l |
| 144 | +%define xp1 xp1l |
| 145 | +%define xp2 xp2l |
| 146 | +%define xp3 xp3l |
129 | 147 |
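The l/h/x suffixes name the three 32-byte ymm chunks of a 96-byte stride (low, high, extra), and x0/xp1 alias the low chunk so the pre-existing 32-byte and sub-32-byte macros keep working unchanged. Note that xp2l and xp3l are never defined in this single-parity file, so the xp2/xp3 aliases appear to be leftovers mirrored from the 2- and 3-vect variants; they are harmless because NASM expands single-line macros only at the point of use.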
130 | 148 | default rel |
131 | 149 | [bits 64] |
| 150 | + |
132 | 151 | section .text |
133 | 152 |
134 | 153 | ;; |
135 | | -;; Encodes 32 bytes of all "k" sources into 32 bytes (single parity disk) |
| 154 | +;; Encodes 96 bytes of all "k" sources into 96 bytes (single parity disk) |
| 155 | +;; |
| 156 | +%macro ENCODE_96B 0 |
| 157 | + vpxor xp1l, xp1l, xp1l |
| 158 | + vpxor xp1h, xp1h, xp1h |
| 159 | + vpxor xp1x, xp1x, xp1x |
| 160 | + mov tmp, mul_array |
| 161 | + xor vec_i, vec_i |
| 162 | + |
| 163 | +%%next_vect: |
| 164 | + ;; load next source vector |
| 165 | + mov ptr, [src + vec_i] |
| 166 | + XLDR x0l, [ptr + pos] |
| 167 | + XLDR x0h, [ptr + pos + 32] |
| 168 | + XLDR x0x, [ptr + pos + 64] |
| 169 | + add vec_i, 8 |
| 170 | + |
| 171 | + vbroadcastsd xgft1, [tmp] |
| 172 | + add tmp, 8 |
| 173 | + |
| 174 | + GF_MUL_XOR VEX, x0l, xgft1, xtmp1, xp1l |
| 175 | + GF_MUL_XOR VEX, x0h, xgft1, xtmp1, xp1h |
| 176 | + GF_MUL_XOR VEX, x0x, xgft1, xtmp1, xp1x |
| 177 | + |
| 178 | + cmp vec_i, vec |
| 179 | + jl %%next_vect |
| 180 | + |
| 181 | + XSTR [dest1 + pos], xp1l |
| 182 | + XSTR [dest1 + pos + 32], xp1h |
| 183 | + XSTR [dest1 + pos + 64], xp1x |
| 184 | +%endmacro |
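For context on the inner step: each 8-byte mul_array entry is a GF(2^8) coefficient pre-expanded into an 8x8 bit-matrix, vbroadcastsd replicates it into every qword lane, and GF_MUL_XOR (from the shared GFNI include) reduces to an affine multiply plus an XOR into the accumulator. A minimal sketch of one step, assuming the usual isa-l GFNI helper expansion:

    ;; sketch of: GF_MUL_XOR VEX, x0l, xgft1, xtmp1, xp1l
    vgf2p8affineqb xtmp1, x0l, xgft1, 0x00  ; xtmp1[i] = matrix(xgft1) * x0l[i] over GF(2^8)
    vpxor          xp1l, xp1l, xtmp1        ; xp1l ^= product, accumulating the parity

Iterating this over all vec sources computes the dot product p1 = sum_j gf_mul(coeff_j, src_j) for each 32-byte chunk.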
| 185 | + |
| 186 | +;; |
| 187 | +;; Encodes 64 bytes of all "k" sources into 64 bytes (single parity disk) |
| 188 | +;; |
| 189 | +%macro ENCODE_64B 0 |
| 190 | + vpxor xp1l, xp1l, xp1l |
| 191 | + vpxor xp1h, xp1h, xp1h |
| 192 | + mov tmp, mul_array |
| 193 | + xor vec_i, vec_i |
| 194 | + |
| 195 | +%%next_vect: |
| 196 | + ;; load next source vector |
| 197 | + mov ptr, [src + vec_i] |
| 198 | + XLDR x0l, [ptr + pos] |
| 199 | + XLDR x0h, [ptr + pos + 32] |
| 200 | + add vec_i, 8 |
| 201 | + |
| 202 | + vbroadcastsd xgft1, [tmp] |
| 203 | + add tmp, 8 |
| 204 | + |
| 205 | + GF_MUL_XOR VEX, x0l, xgft1, xtmp1, xp1l |
| 206 | + GF_MUL_XOR VEX, x0h, xgft1, xgft1, xp1h |
| 207 | + |
| 208 | + cmp vec_i, vec |
| 209 | + jl %%next_vect |
| 210 | + |
| 211 | + XSTR [dest1 + pos], xp1l |
| 212 | + XSTR [dest1 + pos + 32], xp1h |
| 213 | +%endmacro |
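One subtlety in ENCODE_64B above: the second GF_MUL_XOR passes xgft1 as its own scratch register. That is safe because it is the last use of the coefficient in the iteration; vbroadcastsd reloads xgft1 at the top of the next pass, so the high chunk needs no extra temporary.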
| 214 | + |
| 215 | +;; |
| 216 | +;; Encodes 32 bytes of all "k" sources into 32 bytes (single parity disk)
136 | 217 | ;; |
137 | 218 | %macro ENCODE_32B 0 |
138 | | - vpxor xp1, xp1, xp1 |
139 | | - mov tmp, mul_array |
140 | | - xor vec_i, vec_i |
| 219 | + vpxor xp1, xp1, xp1 |
| 220 | + mov tmp, mul_array |
| 221 | + xor vec_i, vec_i |
141 | 222 |
142 | 223 | %%next_vect: |
143 | | - mov ptr, [src + vec_i] |
144 | | - XLDR x0, [ptr + pos] ;Get next source vector (32 bytes) |
145 | | - add vec_i, 8 |
| 224 | + ;; load next source vector |
| 225 | + mov ptr, [src + vec_i] |
| 226 | + XLDR x0, [ptr + pos] |
| 227 | + add vec_i, 8 |
146 | 228 |
147 | 229 | vbroadcastsd xgft1, [tmp] |
148 | | - add tmp, 8 |
| 230 | + add tmp, 8 |
149 | 231 |
150 | 232 | GF_MUL_XOR VEX, x0, xgft1, xgft1, xp1 |
151 | 233 |
152 | | - cmp vec_i, vec |
153 | | - jl %%next_vect |
| 234 | + cmp vec_i, vec |
| 235 | + jl %%next_vect |
154 | 236 |
155 | | - XSTR [dest1 + pos], xp1 |
| 237 | + XSTR [dest1 + pos], xp1 |
156 | 238 | %endmacro |
157 | 239 |
158 | 240 | ;; |
159 | | -;; Encodes less than 32 bytes of all "k" sources and updates single parity disk |
| 241 | +;; Encodes less than 32 bytes of all "k" sources into single parity disk
160 | 242 | ;; |
161 | 243 | %macro ENCODE_LT_32B 1 |
162 | | -%define %%LEN %1 |
| 244 | +%define %%LEN %1 |
163 | 245 |
164 | | - vpxor xp1, xp1, xp1 |
165 | | - mov tmp, mul_array |
166 | | - xor vec_i, vec_i |
| 246 | + vpxor xp1, xp1, xp1 |
| 247 | + xor vec_i, vec_i |
167 | 248 |
168 | 249 | %%next_vect: |
169 | | - mov ptr, [src + vec_i] |
170 | | - simd_load_avx2 x0, ptr + pos, %%LEN, tmp3, tmp4 ;Get next source vector |
171 | | - add vec_i, 8 |
| 252 | + ;; load next source vector
| 251 | + mov ptr, [src + vec_i] |
| 252 | + simd_load_avx2 x0, ptr + pos, %%LEN, tmp, tmp3 |
| 253 | + add vec_i, 8 |
172 | 254 |
173 | | - vbroadcastsd xgft1, [tmp] |
174 | | - add tmp, 8 |
| 255 | + vbroadcastsd xgft1, [mul_array] |
| 256 | + add mul_array, 8 |
175 | 257 |
176 | 258 | GF_MUL_XOR VEX, x0, xgft1, xgft1, xp1 |
177 | 259 |
178 | | - cmp vec_i, vec |
179 | | - jl %%next_vect |
| 260 | + cmp vec_i, vec |
| 261 | + jl %%next_vect |
180 | 262 |
181 | | - lea tmp, [dest1 + pos] |
182 | | - simd_store_avx2 tmp, xp1, %%LEN, tmp3, tmp4 ;Store updated encoded data |
| 263 | + ;; Store updated encoded data |
| 264 | + lea ptr, [dest1 + pos] |
| 265 | + simd_store_avx2 ptr, xp1, %%LEN, tmp, vec_i |
183 | 266 | %endmacro |
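The sub-32-byte tail runs at most once per call, which is why it can advance mul_array in place rather than copying it to tmp first, and why it recycles registers freely: tmp and tmp3 serve as scratch for the partial load, and vec_i, dead once the loop exits, doubles as scratch for simd_store_avx2. Needing only one surviving callee-saved scratch (tmp3) is also what lets the ELF build above drop r13.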
184 | 267 |
185 | 268 | align 16 |
186 | 269 | mk_global gf_vect_dot_prod_avx2_gfni, function |
187 | 270 | func(gf_vect_dot_prod_avx2_gfni) |
188 | | - FUNC_SAVE |
189 | | - xor pos, pos |
190 | | - shl vec, 3 ;vec *= 8. Make vec_i count by 8 |
| 271 | + FUNC_SAVE |
| 272 | + |
| 273 | + xor pos, pos |
| 274 | + shl vec, 3 ;; vec *= 8. Make vec_i count by 8 |
| 275 | + |
| 276 | + cmp len, 96 |
| 277 | + jl .len_lt_96 |
| 278 | + |
| 279 | +.loop96: |
| 280 | + ENCODE_96B |
191 | 281 |
192 | | - cmp len, 32 |
193 | | - jb .len_lt_32 |
| 282 | + add pos, 96 ;; Loop on 96 bytes at a time first |
| 283 | + sub len, 96 |
| 284 | + cmp len, 96 |
| 285 | + jge .loop96 |
194 | 286 |
195 | | -.loop32: |
| 287 | +.len_lt_96: |
| 288 | + cmp len, 64 |
| 289 | + jl .len_lt_64 |
| 290 | + |
| 291 | + ENCODE_64B |
| 292 | + |
| 293 | + add pos, 64 ;; encode next 64 bytes |
| 294 | + sub len, 64 |
| 295 | + |
| 296 | +.len_lt_64: |
| 297 | + cmp len, 32 |
| 298 | + jl .len_lt_32 |
196 | 299 |
197 | 300 | ENCODE_32B |
198 | 301 |
199 | | - add pos, 32 ;Loop on 32 bytes at a time |
| 302 | + add pos, 32 ;; encode next 32 bytes |
200 | 303 | sub len, 32 |
201 | | - cmp len, 32 |
202 | | - jge .loop32 |
203 | 304 |
204 | 305 | .len_lt_32: |
205 | 306 | cmp len, 0 |
206 | 307 | jle .exit |
207 | 308 |
208 | | - ENCODE_LT_32B len |
| 309 | + ENCODE_LT_32B len ;; encode final bytes |
| 310 | + |
209 | 311 | .exit: |
210 | 312 | vzeroupper |
211 | 313 |
212 | | - FUNC_RESTORE |
213 | | - ret |
| 314 | + FUNC_RESTORE |
| 315 | + ret |
214 | 316 |
215 | 317 | endproc_frame |
216 | 318 | %endif ; if AS_FEATURE_LEVEL >= 10 |
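Dispatch summary for the rewritten body: loop on 96-byte strides while len >= 96, then peel at most one 64-byte block, at most one 32-byte block, and hand any remainder to ENCODE_LT_32B. For example, len = 150 encodes 96 bytes in .loop96 (54 left), skips the 64-byte step, encodes 32 bytes (22 left), and finishes with a 22-byte tail: 96 + 32 + 22 = 150.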