diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index a8c7f2f9..e27a56c5 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -32,14 +32,13 @@ jobs: run: go vet ./... - name: Test - run: go test -timeout=15m ./... + run: go test -timeout=15m - name: Test Noasm - run: go test -tags=noasm -short&&go test -short -no-avx512&&go test -short -no-avx512 -no-avx2&&go test -no-avx512 -no-avx2 -no-ssse3 -short + run: go test -tags=noasm -short&&go test -short -no-avx512&&go test -short -no-avx512 -no-avx-gfni&&go test -short -no-avx512 -no-avx2&&go test -no-avx512 -no-avx2 -no-ssse3 -short - name: Test Nopshufb - run: go test -tags=nopshufb -short&&go test -tags=nopshufb -short -no-avx512 -no-gfni&&go test -tags=nopshufb -short&&go test -tags=nopshufb -no-avx512 -no-avx2 -no-ssse3 -no-sse2 -short - + run: go test -tags=nopshufb -short&&go test -tags=nopshufb -short -no-avx512&&go test -tags=nopshufb -short -no-avx512 -no-avx-gfni&&go test -tags=nopshufb -no-avx512 -no-avx2 -no-ssse3 -no-sse2 -short - name: Test Race env: @@ -88,6 +87,11 @@ jobs: CGO_ENABLED: 1 run: go test -no-avx512 -short -race . + - name: Test Races, no avx512, no avx-gfni + env: + CGO_ENABLED: 1 + run: go test -no-avx512 -no-avx-gfni -short -race . + - name: Test Races, no avx2 env: CGO_ENABLED: 1 diff --git a/_gen/gen.go b/_gen/gen.go index 30b71780..a8da87db 100644 --- a/_gen/gen.go +++ b/_gen/gen.go @@ -36,6 +36,9 @@ var switchDefsX [inputMax][outputMax]string var switchDefs512 [inputMax][outputMax]string var switchDefsX512 [inputMax][outputMax]string +var switchDefsAvxGFNI [inputMax][outputMax]string +var switchDefsXAvxGFNI [inputMax][outputMax]string + // Prefetch offsets, set to 0 to disable. // Disabled since they appear to be consistently slower. 
const prefetchSrc = 0 @@ -64,8 +67,6 @@ func main() { RET() genXor() - const perLoopBits = 6 - const perLoop = 1 << perLoopBits for i := 1; i <= inputMax; i++ { for j := 1; j <= outputMax; j++ { @@ -74,13 +75,24 @@ func main() { genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64", i, j), i, j, false) } genMulAvx512GFNI(fmt.Sprintf("mulGFNI_%dx%d_64", i, j), i, j, false) + genMulAvxGFNI(fmt.Sprintf("mulAvxGFNI_%dx%d", i, j), i, j, false) genMulAvx512GFNI(fmt.Sprintf("mulGFNI_%dx%d_64Xor", i, j), i, j, true) + genMulAvxGFNI(fmt.Sprintf("mulAvxGFNI_%dx%dXor", i, j), i, j, true) + if pshufb { genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%dXor", i, j), i, j, true) genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64Xor", i, j), i, j, true) } } } + + genSwitch() + genGF16() + genGF8() + Generate() +} + +func genSwitch() { name := "../galois_gen_switch_amd64.go" tag := "// +build !nopshufb\n" if !pshufb { @@ -113,9 +125,8 @@ import ( avx2CodeGen = true maxAvx2Inputs = %d maxAvx2Outputs = %d -minAvx2Size = %d -avxSizeMask = maxInt - (minAvx2Size-1) -)`, inputMax, outputMax, perLoop)) +minAvx2Size = 64 +)`, inputMax, outputMax)) if !pshufb { w.WriteString("\n\nfunc galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`)}\n") @@ -126,7 +137,7 @@ avxSizeMask = maxInt - (minAvx2Size-1) w.WriteString(` func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { - n := (stop-start) & avxSizeMask + n := stop-start `) @@ -145,7 +156,7 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { } func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { - n := (stop-start) & avxSizeMask + n := (stop-start) `) @@ -168,7 +179,7 @@ func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { w.WriteString(` func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { - n := (stop-start) & avxSizeMask + n := (stop-start) & (maxInt - (64 - 1)) `) @@ -187,7 +198,7 @@ func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { } func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { - n := (stop-start) & avxSizeMask + n := (stop-start) & (maxInt - (64 - 1)) `) @@ -206,9 +217,46 @@ func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int } `) - genGF16() - genGF8() - Generate() + w.WriteString(` + +func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop-start) & (maxInt - (32 - 1)) + +`) + + w.WriteString(`switch len(in) { +`) + for in, defs := range switchDefsAvxGFNI[:] { + w.WriteString(fmt.Sprintf(" case %d:\n switch len(out) {\n", in+1)) + for out, def := range defs[:] { + w.WriteString(fmt.Sprintf(" case %d:\n", out+1)) + w.WriteString(def) + } + w.WriteString("}\n") + } + w.WriteString(`} + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop-start) & (maxInt - (32 - 1)) + +`) + + w.WriteString(`switch len(in) { +`) + for in, defs := range switchDefsXAvxGFNI[:] { + w.WriteString(fmt.Sprintf(" case %d:\n switch len(out) {\n", in+1)) + for out, def := range defs[:] { + w.WriteString(fmt.Sprintf(" case %d:\n", out+1)) + w.WriteString(def) + } + w.WriteString("}\n") + } + w.WriteString(`} + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} +`) } // VPXOR3way will 3-way xor a and b and dst. 
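Note on the dispatcher changes above: the generated `galMulSlices*` functions no longer share a single `avxSizeMask`. The GFNI/AVX-512 dispatchers mask the length with `(maxInt - (64 - 1))`, the new AVX+GFNI dispatchers with `(maxInt - (32 - 1))`, and the AVX2 switch bodies return `n & (maxInt - (perLoop - 1))`, so each kernel only reports the bytes it actually processed in whole blocks. A minimal sketch of that rounding, assuming `perLoop` is a power of two; the names here are illustrative and not the library's API:

```go
package main

import "fmt"

const maxInt = int(^uint(0) >> 1)

// roundedLen mirrors the `n & (maxInt - (perLoop - 1))` masking emitted into
// the generated switch bodies: it rounds the byte count down to a whole
// number of perLoop-byte blocks, and the caller finishes the tail with the
// generic (non-assembly) routines.
func roundedLen(start, stop, perLoop int) int {
	n := stop - start
	return n & (maxInt - (perLoop - 1))
}

func main() {
	// 1000 bytes with a 32-byte AVX+GFNI kernel: 992 bytes go through the
	// fast path, the remaining 8 bytes fall back to generic code.
	fmt.Println(roundedLen(0, 1000, 32)) // 992
	// The 64-byte GFNI/AVX-512 kernels round down further.
	fmt.Println(roundedLen(0, 1000, 64)) // 960
}
```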
@@ -218,6 +266,10 @@ func VPXOR3way(a, b, dst reg.VecVirtual) { } func genMulAvx2(name string, inputs int, outputs int, xor bool) { + if outputs < 4 { + // Covered by 64-byte version. + return + } const perLoopBits = 5 const perLoop = 1 << perLoopBits @@ -263,7 +315,7 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { // SWITCH DEFINITION: s := fmt.Sprintf(" mulAvxTwo_%dx%d%s(matrix, in, out, start, n)\n", inputs, outputs, x) - s += fmt.Sprintf("\t\t\t\treturn n\n") + s += fmt.Sprintf("\t\t\t\treturn n & (maxInt - %d)\n", perLoop-1) if xor { switchDefsX[inputs-1][outputs-1] = s } else { @@ -521,7 +573,7 @@ func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) { // SWITCH DEFINITION: //s := fmt.Sprintf("n = (n>>%d)<<%d\n", perLoopBits, perLoopBits) s := fmt.Sprintf(" mulAvxTwo_%dx%d_64%s(matrix, in, out, start, n)\n", inputs, outputs, x) - s += fmt.Sprintf("\t\t\t\treturn n\n") + s += fmt.Sprintf("\t\t\t\treturn n & (maxInt - %d)\n", perLoop-1) if xor { switchDefsX[inputs-1][outputs-1] = s } else { @@ -959,6 +1011,235 @@ func genMulAvx512GFNI(name string, inputs int, outputs int, xor bool) { RET() } +func genMulAvxGFNI(name string, inputs int, outputs int, xor bool) { + const perLoopBits = 5 + const perLoop = 1 << perLoopBits + + total := inputs * outputs + + doc := []string{ + fmt.Sprintf("%s takes %d inputs and produces %d outputs.", name, inputs, outputs), + } + if !xor { + doc = append(doc, "The output is initialized to 0.") + } + + // Load shuffle masks on every use. + var loadNone bool + // Use registers for destination registers. + var regDst = true + var reloadLength = false + + est := total + outputs + 2 + // When we can't hold all, keep this many in registers. + inReg := 0 + if est > 16 { + loadNone = true + inReg = 16 - outputs - 2 + // We run out of GP registers first, now. + if inputs+outputs > 13 { + regDst = false + } + // Save one register by reloading length. + if inputs+outputs > 12 && regDst { + reloadLength = true + } + } + + TEXT(name, 0, fmt.Sprintf("func(matrix []uint64, in [][]byte, out [][]byte, start, n int)")) + x := "" + if xor { + x = "Xor" + } + // SWITCH DEFINITION: + //s := fmt.Sprintf("n = (n>>%d)<<%d\n", perLoopBits, perLoopBits) + s := fmt.Sprintf(" mulAvxGFNI_%dx%d%s(matrix, in, out, start, n)\n", inputs, outputs, x) + s += fmt.Sprintf("\t\t\t\treturn n\n") + if xor { + switchDefsXAvxGFNI[inputs-1][outputs-1] = s + } else { + switchDefsAvxGFNI[inputs-1][outputs-1] = s + } + + if loadNone { + Commentf("Loading %d of %d tables to registers", inReg, inputs*outputs) + } else { + // loadNone == false + Comment("Loading all tables to registers") + } + if regDst { + Comment("Destination kept in GP registers") + } else { + Comment("Destination kept on stack") + } + + Doc(doc...) 
+ Pragma("noescape") + Commentf("Full registers estimated %d YMM used", est) + + length := Load(Param("n"), GP64()) + matrixBase := GP64() + addr, err := Param("matrix").Base().Resolve() + if err != nil { + panic(err) + } + MOVQ(addr.Addr, matrixBase) + SHRQ(U8(perLoopBits), length) + TESTQ(length, length) + JZ(LabelRef(name + "_end")) + + matrix := make([]reg.VecVirtual, total) + + for i := range matrix { + if loadNone && i >= inReg { + break + } + table := YMM() + VBROADCASTSD(Mem{Base: matrixBase, Disp: i * 8}, table) + matrix[i] = table + } + + inPtrs := make([]reg.GPVirtual, inputs) + inSlicePtr := GP64() + addr, err = Param("in").Base().Resolve() + if err != nil { + panic(err) + } + MOVQ(addr.Addr, inSlicePtr) + for i := range inPtrs { + ptr := GP64() + MOVQ(Mem{Base: inSlicePtr, Disp: i * 24}, ptr) + inPtrs[i] = ptr + } + // Destination + dst := make([]reg.VecVirtual, outputs) + dstPtr := make([]reg.GPVirtual, outputs) + addr, err = Param("out").Base().Resolve() + if err != nil { + panic(err) + } + outBase := addr.Addr + outSlicePtr := GP64() + MOVQ(addr.Addr, outSlicePtr) + MOVQ(outBase, outSlicePtr) + for i := range dst { + dst[i] = YMM() + if !regDst { + continue + } + ptr := GP64() + MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) + dstPtr[i] = ptr + } + + offset := GP64() + addr, err = Param("start").Resolve() + if err != nil { + panic(err) + } + + MOVQ(addr.Addr, offset) + if regDst { + Comment("Add start offset to output") + for _, ptr := range dstPtr { + ADDQ(offset, ptr) + } + } + + Comment("Add start offset to input") + for _, ptr := range inPtrs { + ADDQ(offset, ptr) + } + // Offset no longer needed unless not regdst + + if reloadLength { + Commentf("Reload length to save a register") + length = Load(Param("n"), GP64()) + SHRQ(U8(perLoopBits), length) + } + Label(name + "_loop") + + if xor { + Commentf("Load %d outputs", outputs) + for i := range dst { + if regDst { + VMOVDQU(Mem{Base: dstPtr[i]}, dst[i]) + if prefetchDst > 0 { + PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst}) + } + continue + } + ptr := GP64() + MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) + VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i]) + + if prefetchDst > 0 { + PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) + } + } + } + + in := YMM() + look := YMM() + for i := range inPtrs { + Commentf("Load and process 32 bytes from input %d to %d outputs", i, outputs) + VMOVDQU(Mem{Base: inPtrs[i]}, in) + if prefetchSrc > 0 { + PREFETCHT0(Mem{Base: inPtrs[i], Disp: prefetchSrc}) + } + ADDQ(U8(perLoop), inPtrs[i]) + + for j := range dst { + idx := i*outputs + j + if loadNone && idx >= inReg { + tmp := YMM() + if i == 0 && !xor { + VBROADCASTSD(Mem{Base: matrixBase, Disp: idx * 8}, tmp) + VGF2P8AFFINEQB(U8(0), tmp, in, dst[j]) + } else { + VBROADCASTSD(Mem{Base: matrixBase, Disp: idx * 8}, tmp) + VGF2P8AFFINEQB(U8(0), tmp, in, look) + VXORPD(dst[j], look, dst[j]) + } + } else { + if i == 0 && !xor { + VGF2P8AFFINEQB(U8(0), matrix[i*outputs+j], in, dst[j]) + } else { + VGF2P8AFFINEQB(U8(0), matrix[i*outputs+j], in, look) + VXORPD(dst[j], look, dst[j]) + } + } + } + } + Commentf("Store %d outputs", outputs) + for i := range dst { + if regDst { + VMOVDQU(dst[i], Mem{Base: dstPtr[i]}) + if prefetchDst > 0 && !xor { + PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst}) + } + ADDQ(U8(perLoop), dstPtr[i]) + continue + } + ptr := GP64() + MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) + VMOVDQU(dst[i], Mem{Base: ptr, Index: offset, Scale: 1}) + if prefetchDst > 0 && !xor { + 
PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) + } + } + Comment("Prepare for next loop") + if !regDst { + ADDQ(U8(perLoop), offset) + } + DECQ(length) + JNZ(LabelRef(name + "_loop")) + VZEROUPPER() + + Label(name + "_end") + RET() +} + func genXor() { // SSE 2 { diff --git a/benchmark/main.go b/benchmark/main.go index 6a70599a..ad21cd6e 100644 --- a/benchmark/main.go +++ b/benchmark/main.go @@ -38,11 +38,12 @@ var ( cpu = flag.Int("cpu", 16, "Set maximum number of cores to use") csv = flag.Bool("csv", false, "Output as CSV") - sSE2 = flag.Bool("sse2", cpuid.CPU.Has(cpuid.SSE2), "Use SSE2") - sSSE3 = flag.Bool("ssse3", cpuid.CPU.Has(cpuid.SSSE3), "Use SSSE3") - aVX2 = flag.Bool("avx2", cpuid.CPU.Has(cpuid.AVX2), "Use AVX2") - aVX512 = flag.Bool("avx512", cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512BW, cpuid.AVX512VL), "Use AVX512") - gNFI = flag.Bool("gfni", cpuid.CPU.Supports(cpuid.AVX512F, cpuid.GFNI, cpuid.AVX512DQ), "Use AVX512+GFNI") + sSE2 = flag.Bool("sse2", cpuid.CPU.Has(cpuid.SSE2), "Use SSE2") + sSSE3 = flag.Bool("ssse3", cpuid.CPU.Has(cpuid.SSSE3), "Use SSSE3") + aVX2 = flag.Bool("avx2", cpuid.CPU.Has(cpuid.AVX2), "Use AVX2") + aVX512 = flag.Bool("avx512", cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512BW, cpuid.AVX512VL), "Use AVX512") + gNFI = flag.Bool("gfni", cpuid.CPU.Supports(cpuid.AVX512F, cpuid.GFNI, cpuid.AVX512DQ), "Use AVX512+GFNI") + avx2GNFI = flag.Bool("avx-gfni", cpuid.CPU.Supports(cpuid.AVX2, cpuid.GFNI), "Use AVX+GFNI") ) var codecDefinitions = map[string]struct { @@ -390,6 +391,9 @@ func getOptions(shardSize int) []reedsolomon.Option { if !*gNFI { o = append(o, reedsolomon.WithGFNI(false)) } + if !*avx2GNFI { + o = append(o, reedsolomon.WithAVXGFNI(false)) + } if !*invCache { o = append(o, reedsolomon.WithInversionCache(false)) } diff --git a/galois_amd64.go b/galois_amd64.go index c7ab3663..8099f166 100644 --- a/galois_amd64.go +++ b/galois_amd64.go @@ -225,7 +225,7 @@ func ifftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *optio return } - if false && o.useGFNI { + if false && o.useAvx512GFNI { // Note that these currently require that length is multiple of 64. t01 := gf2p811dMulMatrices[log_m01] t23 := gf2p811dMulMatrices[log_m23] @@ -380,7 +380,7 @@ func fftDIT48(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe8, o *option return } - if false && o.useGFNI { + if false && o.useAvx512GFNI { t01 := gf2p811dMulMatrices[log_m01] t23 := gf2p811dMulMatrices[log_m23] t02 := gf2p811dMulMatrices[log_m02] diff --git a/galois_gen_amd64.go b/galois_gen_amd64.go index 43184349..dac9b136 100644 --- a/galois_gen_amd64.go +++ b/galois_gen_amd64.go @@ -15,12 +15,6 @@ func sSE2XorSlice_64(in []byte, out []byte) //go:noescape func avx2XorSlice_64(in []byte, out []byte) -// mulAvxTwo_1x1 takes 1 inputs and produces 1 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_1x1_64 takes 1 inputs and produces 1 outputs. // The output is initialized to 0. // @@ -33,27 +27,27 @@ func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x1 takes 1 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x1_64Xor takes 1 inputs and produces 1 outputs. 
// //go:noescape func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_1x1Xor takes 1 inputs and produces 1 outputs. +// mulAvxGFNI_1x1Xor takes 1 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_1x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x1_64Xor takes 1 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_1x2 takes 1 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_1x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_1x2_64 takes 1 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -66,27 +60,27 @@ func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x2 takes 1 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x2_64Xor takes 1 inputs and produces 2 outputs. // //go:noescape func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_1x2Xor takes 1 inputs and produces 2 outputs. +// mulAvxGFNI_1x2Xor takes 1 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_1x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x2_64Xor takes 1 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_1x3 takes 1 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_1x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_1x3_64 takes 1 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -99,15 +93,21 @@ func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x3 takes 1 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x3_64Xor takes 1 inputs and produces 3 outputs. // //go:noescape func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_1x3Xor takes 1 inputs and produces 3 outputs. +// mulAvxGFNI_1x3Xor takes 1 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_1x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x3_64Xor takes 1 inputs and produces 3 outputs. // @@ -126,11 +126,22 @@ func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x4 takes 1 inputs and produces 4 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x4_64Xor takes 1 inputs and produces 4 outputs. // //go:noescape func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x4Xor takes 1 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x4Xor takes 1 inputs and produces 4 outputs. // //go:noescape @@ -148,11 +159,22 @@ func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x5 takes 1 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x5_64Xor takes 1 inputs and produces 5 outputs. // //go:noescape func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x5Xor takes 1 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x5Xor takes 1 inputs and produces 5 outputs. // //go:noescape @@ -170,11 +192,22 @@ func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x6 takes 1 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x6_64Xor takes 1 inputs and produces 6 outputs. // //go:noescape func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x6Xor takes 1 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x6Xor takes 1 inputs and produces 6 outputs. // //go:noescape @@ -192,11 +225,22 @@ func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x7 takes 1 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x7_64Xor takes 1 inputs and produces 7 outputs. // //go:noescape func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x7Xor takes 1 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x7Xor takes 1 inputs and produces 7 outputs. // //go:noescape @@ -214,11 +258,22 @@ func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x8 takes 1 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x8_64Xor takes 1 inputs and produces 8 outputs. // //go:noescape func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x8Xor takes 1 inputs and produces 8 outputs. 
+// +//go:noescape +func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x8Xor takes 1 inputs and produces 8 outputs. // //go:noescape @@ -236,11 +291,22 @@ func mulAvxTwo_1x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x9 takes 1 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x9_64Xor takes 1 inputs and produces 9 outputs. // //go:noescape func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x9Xor takes 1 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x9Xor takes 1 inputs and produces 9 outputs. // //go:noescape @@ -258,21 +324,26 @@ func mulAvxTwo_1x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x10 takes 1 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x10_64Xor takes 1 inputs and produces 10 outputs. // //go:noescape func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_1x10Xor takes 1 inputs and produces 10 outputs. +// mulAvxGFNI_1x10Xor takes 1 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x1 takes 2 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_1x10Xor takes 1 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x1_64 takes 2 inputs and produces 1 outputs. // The output is initialized to 0. @@ -286,27 +357,27 @@ func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x1 takes 2 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x1_64Xor takes 2 inputs and produces 1 outputs. // //go:noescape func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x1Xor takes 2 inputs and produces 1 outputs. +// mulAvxGFNI_2x1Xor takes 2 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_2x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x1_64Xor takes 2 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x2 takes 2 inputs and produces 2 outputs. -// The output is initialized to 0. 
-// -//go:noescape -func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_2x2_64 takes 2 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -319,27 +390,27 @@ func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x2 takes 2 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x2_64Xor takes 2 inputs and produces 2 outputs. // //go:noescape func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x2Xor takes 2 inputs and produces 2 outputs. +// mulAvxGFNI_2x2Xor takes 2 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_2x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x2_64Xor takes 2 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x3 takes 2 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_2x3_64 takes 2 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -352,15 +423,21 @@ func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x3 takes 2 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x3_64Xor takes 2 inputs and produces 3 outputs. // //go:noescape func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x3Xor takes 2 inputs and produces 3 outputs. +// mulAvxGFNI_2x3Xor takes 2 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_2x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x3_64Xor takes 2 inputs and produces 3 outputs. // @@ -379,11 +456,22 @@ func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x4 takes 2 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x4_64Xor takes 2 inputs and produces 4 outputs. // //go:noescape func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x4Xor takes 2 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x4Xor takes 2 inputs and produces 4 outputs. // //go:noescape @@ -401,11 +489,22 @@ func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x5 takes 2 inputs and produces 5 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x5_64Xor takes 2 inputs and produces 5 outputs. // //go:noescape func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x5Xor takes 2 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x5Xor takes 2 inputs and produces 5 outputs. // //go:noescape @@ -423,11 +522,22 @@ func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x6 takes 2 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x6_64Xor takes 2 inputs and produces 6 outputs. // //go:noescape func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x6Xor takes 2 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x6Xor takes 2 inputs and produces 6 outputs. // //go:noescape @@ -445,11 +555,22 @@ func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x7 takes 2 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x7_64Xor takes 2 inputs and produces 7 outputs. // //go:noescape func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x7Xor takes 2 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x7Xor takes 2 inputs and produces 7 outputs. // //go:noescape @@ -467,11 +588,22 @@ func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x8 takes 2 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x8_64Xor takes 2 inputs and produces 8 outputs. // //go:noescape func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x8Xor takes 2 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x8Xor takes 2 inputs and produces 8 outputs. // //go:noescape @@ -489,11 +621,22 @@ func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x9 takes 2 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x9_64Xor takes 2 inputs and produces 9 outputs. // //go:noescape func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x9Xor takes 2 inputs and produces 9 outputs. 
+// +//go:noescape +func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x9Xor takes 2 inputs and produces 9 outputs. // //go:noescape @@ -511,21 +654,26 @@ func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x10 takes 2 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x10_64Xor takes 2 inputs and produces 10 outputs. // //go:noescape func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_2x10Xor takes 2 inputs and produces 10 outputs. +// mulAvxGFNI_2x10Xor takes 2 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x1 takes 3 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_2x10Xor takes 2 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x1_64 takes 3 inputs and produces 1 outputs. // The output is initialized to 0. @@ -539,27 +687,27 @@ func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x1 takes 3 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x1_64Xor takes 3 inputs and produces 1 outputs. // //go:noescape func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x1Xor takes 3 inputs and produces 1 outputs. +// mulAvxGFNI_3x1Xor takes 3 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_3x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x1_64Xor takes 3 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x2 takes 3 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_3x2_64 takes 3 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -572,27 +720,27 @@ func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x2 takes 3 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x2_64Xor takes 3 inputs and produces 2 outputs. // //go:noescape func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x2Xor takes 3 inputs and produces 2 outputs. +// mulAvxGFNI_3x2Xor takes 3 inputs and produces 2 outputs. 
// //go:noescape -func mulAvxTwo_3x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x2_64Xor takes 3 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x3 takes 3 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_3x3_64 takes 3 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -605,15 +753,21 @@ func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x3 takes 3 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x3_64Xor takes 3 inputs and produces 3 outputs. // //go:noescape func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x3Xor takes 3 inputs and produces 3 outputs. +// mulAvxGFNI_3x3Xor takes 3 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_3x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x3_64Xor takes 3 inputs and produces 3 outputs. // @@ -632,11 +786,22 @@ func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x4 takes 3 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x4_64Xor takes 3 inputs and produces 4 outputs. // //go:noescape func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x4Xor takes 3 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x4Xor takes 3 inputs and produces 4 outputs. // //go:noescape @@ -654,11 +819,22 @@ func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x5 takes 3 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x5_64Xor takes 3 inputs and produces 5 outputs. // //go:noescape func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x5Xor takes 3 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x5Xor takes 3 inputs and produces 5 outputs. // //go:noescape @@ -676,11 +852,22 @@ func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x6 takes 3 inputs and produces 6 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x6_64Xor takes 3 inputs and produces 6 outputs. // //go:noescape func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x6Xor takes 3 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x6Xor takes 3 inputs and produces 6 outputs. // //go:noescape @@ -698,11 +885,22 @@ func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x7 takes 3 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x7_64Xor takes 3 inputs and produces 7 outputs. // //go:noescape func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x7Xor takes 3 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x7Xor takes 3 inputs and produces 7 outputs. // //go:noescape @@ -720,11 +918,22 @@ func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x8 takes 3 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x8_64Xor takes 3 inputs and produces 8 outputs. // //go:noescape func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x8Xor takes 3 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x8Xor takes 3 inputs and produces 8 outputs. // //go:noescape @@ -742,11 +951,22 @@ func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x9 takes 3 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x9_64Xor takes 3 inputs and produces 9 outputs. // //go:noescape func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x9Xor takes 3 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x9Xor takes 3 inputs and produces 9 outputs. // //go:noescape @@ -764,21 +984,26 @@ func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x10 takes 3 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x10_64Xor takes 3 inputs and produces 10 outputs. // //go:noescape func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_3x10Xor takes 3 inputs and produces 10 outputs. 
+// mulAvxGFNI_3x10Xor takes 3 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x1 takes 4 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_3x10Xor takes 3 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x1_64 takes 4 inputs and produces 1 outputs. // The output is initialized to 0. @@ -792,27 +1017,27 @@ func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x1 takes 4 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x1_64Xor takes 4 inputs and produces 1 outputs. // //go:noescape func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x1Xor takes 4 inputs and produces 1 outputs. +// mulAvxGFNI_4x1Xor takes 4 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_4x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x1_64Xor takes 4 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x2 takes 4 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_4x2_64 takes 4 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -825,27 +1050,27 @@ func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x2 takes 4 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x2_64Xor takes 4 inputs and produces 2 outputs. // //go:noescape func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x2Xor takes 4 inputs and produces 2 outputs. +// mulAvxGFNI_4x2Xor takes 4 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_4x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x2_64Xor takes 4 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x3 takes 4 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_4x3_64 takes 4 inputs and produces 3 outputs. // The output is initialized to 0. 
// @@ -858,15 +1083,21 @@ func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x3 takes 4 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x3_64Xor takes 4 inputs and produces 3 outputs. // //go:noescape func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x3Xor takes 4 inputs and produces 3 outputs. +// mulAvxGFNI_4x3Xor takes 4 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_4x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x3_64Xor takes 4 inputs and produces 3 outputs. // @@ -885,11 +1116,22 @@ func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x4 takes 4 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x4_64Xor takes 4 inputs and produces 4 outputs. // //go:noescape func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x4Xor takes 4 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x4Xor takes 4 inputs and produces 4 outputs. // //go:noescape @@ -907,11 +1149,22 @@ func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x5 takes 4 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x5_64Xor takes 4 inputs and produces 5 outputs. // //go:noescape func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x5Xor takes 4 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x5Xor takes 4 inputs and produces 5 outputs. // //go:noescape @@ -929,12 +1182,23 @@ func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x6 takes 4 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x6_64Xor takes 4 inputs and produces 6 outputs. // //go:noescape func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x6Xor takes 4 inputs and produces 6 outputs. +// mulAvxGFNI_4x6Xor takes 4 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x6Xor takes 4 inputs and produces 6 outputs. 
// //go:noescape func mulAvxTwo_4x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -951,11 +1215,22 @@ func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x7 takes 4 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x7_64Xor takes 4 inputs and produces 7 outputs. // //go:noescape func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x7Xor takes 4 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x7Xor takes 4 inputs and produces 7 outputs. // //go:noescape @@ -973,11 +1248,22 @@ func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x8 takes 4 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x8_64Xor takes 4 inputs and produces 8 outputs. // //go:noescape func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x8Xor takes 4 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x8Xor takes 4 inputs and produces 8 outputs. // //go:noescape @@ -995,11 +1281,22 @@ func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x9 takes 4 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x9_64Xor takes 4 inputs and produces 9 outputs. // //go:noescape func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x9Xor takes 4 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x9Xor takes 4 inputs and produces 9 outputs. // //go:noescape @@ -1017,21 +1314,26 @@ func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x10 takes 4 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x10_64Xor takes 4 inputs and produces 10 outputs. // //go:noescape func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_4x10Xor takes 4 inputs and produces 10 outputs. +// mulAvxGFNI_4x10Xor takes 4 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x1 takes 5 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_4x10Xor takes 4 inputs and produces 10 outputs. 
// //go:noescape -func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x1_64 takes 5 inputs and produces 1 outputs. // The output is initialized to 0. @@ -1045,27 +1347,27 @@ func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x1 takes 5 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x1_64Xor takes 5 inputs and produces 1 outputs. // //go:noescape func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x1Xor takes 5 inputs and produces 1 outputs. +// mulAvxGFNI_5x1Xor takes 5 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_5x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x1_64Xor takes 5 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x2 takes 5 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_5x2_64 takes 5 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -1078,27 +1380,27 @@ func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x2 takes 5 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x2_64Xor takes 5 inputs and produces 2 outputs. // //go:noescape func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x2Xor takes 5 inputs and produces 2 outputs. +// mulAvxGFNI_5x2Xor takes 5 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_5x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x2_64Xor takes 5 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x3 takes 5 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_5x3_64 takes 5 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -1111,15 +1413,21 @@ func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x3 takes 5 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x3_64Xor takes 5 inputs and produces 3 outputs. // //go:noescape func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x3Xor takes 5 inputs and produces 3 outputs. 
+// mulAvxGFNI_5x3Xor takes 5 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_5x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x3_64Xor takes 5 inputs and produces 3 outputs. // @@ -1138,11 +1446,22 @@ func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x4 takes 5 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x4_64Xor takes 5 inputs and produces 4 outputs. // //go:noescape func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x4Xor takes 5 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x4Xor takes 5 inputs and produces 4 outputs. // //go:noescape @@ -1160,11 +1479,22 @@ func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x5 takes 5 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x5_64Xor takes 5 inputs and produces 5 outputs. // //go:noescape func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x5Xor takes 5 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x5Xor takes 5 inputs and produces 5 outputs. // //go:noescape @@ -1182,11 +1512,22 @@ func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x6 takes 5 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x6_64Xor takes 5 inputs and produces 6 outputs. // //go:noescape func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x6Xor takes 5 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x6Xor takes 5 inputs and produces 6 outputs. // //go:noescape @@ -1204,11 +1545,22 @@ func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x7 takes 5 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x7_64Xor takes 5 inputs and produces 7 outputs. // //go:noescape func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x7Xor takes 5 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x7Xor takes 5 inputs and produces 7 outputs. 
// //go:noescape @@ -1226,11 +1578,22 @@ func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x8 takes 5 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x8_64Xor takes 5 inputs and produces 8 outputs. // //go:noescape func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x8Xor takes 5 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x8Xor takes 5 inputs and produces 8 outputs. // //go:noescape @@ -1248,11 +1611,22 @@ func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x9 takes 5 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x9_64Xor takes 5 inputs and produces 9 outputs. // //go:noescape func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x9Xor takes 5 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x9Xor takes 5 inputs and produces 9 outputs. // //go:noescape @@ -1270,21 +1644,26 @@ func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x10 takes 5 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x10_64Xor takes 5 inputs and produces 10 outputs. // //go:noescape func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_5x10Xor takes 5 inputs and produces 10 outputs. +// mulAvxGFNI_5x10Xor takes 5 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x1 takes 6 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_5x10Xor takes 5 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x1_64 takes 6 inputs and produces 1 outputs. // The output is initialized to 0. @@ -1298,27 +1677,27 @@ func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x1 takes 6 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x1_64Xor takes 6 inputs and produces 1 outputs. // //go:noescape func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x1Xor takes 6 inputs and produces 1 outputs. 
+// mulAvxGFNI_6x1Xor takes 6 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_6x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x1_64Xor takes 6 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x2 takes 6 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_6x2_64 takes 6 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -1331,27 +1710,27 @@ func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x2 takes 6 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x2_64Xor takes 6 inputs and produces 2 outputs. // //go:noescape func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x2Xor takes 6 inputs and produces 2 outputs. +// mulAvxGFNI_6x2Xor takes 6 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_6x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x2_64Xor takes 6 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x3 takes 6 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_6x3_64 takes 6 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -1364,15 +1743,21 @@ func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x3 takes 6 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x3_64Xor takes 6 inputs and produces 3 outputs. // //go:noescape func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x3Xor takes 6 inputs and produces 3 outputs. +// mulAvxGFNI_6x3Xor takes 6 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_6x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x3_64Xor takes 6 inputs and produces 3 outputs. // @@ -1391,11 +1776,22 @@ func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x4 takes 6 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x4_64Xor takes 6 inputs and produces 4 outputs. 
// //go:noescape func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x4Xor takes 6 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x4Xor takes 6 inputs and produces 4 outputs. // //go:noescape @@ -1413,11 +1809,22 @@ func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x5 takes 6 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x5_64Xor takes 6 inputs and produces 5 outputs. // //go:noescape func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x5Xor takes 6 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x5Xor takes 6 inputs and produces 5 outputs. // //go:noescape @@ -1435,11 +1842,22 @@ func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x6 takes 6 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x6_64Xor takes 6 inputs and produces 6 outputs. // //go:noescape func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x6Xor takes 6 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x6Xor takes 6 inputs and produces 6 outputs. // //go:noescape @@ -1457,11 +1875,22 @@ func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x7 takes 6 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x7_64Xor takes 6 inputs and produces 7 outputs. // //go:noescape func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x7Xor takes 6 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x7Xor takes 6 inputs and produces 7 outputs. // //go:noescape @@ -1479,11 +1908,22 @@ func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x8 takes 6 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x8_64Xor takes 6 inputs and produces 8 outputs. // //go:noescape func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x8Xor takes 6 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x8Xor takes 6 inputs and produces 8 outputs. 
// //go:noescape @@ -1501,11 +1941,22 @@ func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x9 takes 6 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x9_64Xor takes 6 inputs and produces 9 outputs. // //go:noescape func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x9Xor takes 6 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x9Xor takes 6 inputs and produces 9 outputs. // //go:noescape @@ -1523,21 +1974,26 @@ func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x10 takes 6 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x10_64Xor takes 6 inputs and produces 10 outputs. // //go:noescape func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_6x10Xor takes 6 inputs and produces 10 outputs. +// mulAvxGFNI_6x10Xor takes 6 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x1 takes 7 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_6x10Xor takes 6 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x1_64 takes 7 inputs and produces 1 outputs. // The output is initialized to 0. @@ -1551,27 +2007,27 @@ func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x1 takes 7 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x1_64Xor takes 7 inputs and produces 1 outputs. // //go:noescape func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x1Xor takes 7 inputs and produces 1 outputs. +// mulAvxGFNI_7x1Xor takes 7 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_7x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x1_64Xor takes 7 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x2 takes 7 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_7x2_64 takes 7 inputs and produces 2 outputs. // The output is initialized to 0. 
// @@ -1584,27 +2040,27 @@ func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x2 takes 7 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x2_64Xor takes 7 inputs and produces 2 outputs. // //go:noescape func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x2Xor takes 7 inputs and produces 2 outputs. +// mulAvxGFNI_7x2Xor takes 7 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_7x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x2_64Xor takes 7 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x3 takes 7 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_7x3_64 takes 7 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -1617,15 +2073,21 @@ func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x3 takes 7 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x3_64Xor takes 7 inputs and produces 3 outputs. // //go:noescape func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x3Xor takes 7 inputs and produces 3 outputs. +// mulAvxGFNI_7x3Xor takes 7 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_7x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x3_64Xor takes 7 inputs and produces 3 outputs. // @@ -1644,11 +2106,22 @@ func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x4 takes 7 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x4_64Xor takes 7 inputs and produces 4 outputs. // //go:noescape func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x4Xor takes 7 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x4Xor takes 7 inputs and produces 4 outputs. // //go:noescape @@ -1666,11 +2139,22 @@ func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x5 takes 7 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x5_64Xor takes 7 inputs and produces 5 outputs. 
// //go:noescape func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x5Xor takes 7 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x5Xor takes 7 inputs and produces 5 outputs. // //go:noescape @@ -1688,11 +2172,22 @@ func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x6 takes 7 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x6_64Xor takes 7 inputs and produces 6 outputs. // //go:noescape func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x6Xor takes 7 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x6Xor takes 7 inputs and produces 6 outputs. // //go:noescape @@ -1710,11 +2205,22 @@ func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x7 takes 7 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x7_64Xor takes 7 inputs and produces 7 outputs. // //go:noescape func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x7Xor takes 7 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x7Xor takes 7 inputs and produces 7 outputs. // //go:noescape @@ -1732,11 +2238,22 @@ func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x8 takes 7 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x8_64Xor takes 7 inputs and produces 8 outputs. // //go:noescape func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x8Xor takes 7 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x8Xor takes 7 inputs and produces 8 outputs. // //go:noescape @@ -1754,11 +2271,22 @@ func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x9 takes 7 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x9_64Xor takes 7 inputs and produces 9 outputs. // //go:noescape func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x9Xor takes 7 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x9Xor takes 7 inputs and produces 9 outputs. 
// //go:noescape @@ -1776,21 +2304,26 @@ func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x10 takes 7 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x10_64Xor takes 7 inputs and produces 10 outputs. // //go:noescape func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_7x10Xor takes 7 inputs and produces 10 outputs. +// mulAvxGFNI_7x10Xor takes 7 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x1 takes 8 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_7x10Xor takes 7 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x1_64 takes 8 inputs and produces 1 outputs. // The output is initialized to 0. @@ -1804,27 +2337,27 @@ func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x1 takes 8 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x1_64Xor takes 8 inputs and produces 1 outputs. // //go:noescape func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x1Xor takes 8 inputs and produces 1 outputs. +// mulAvxGFNI_8x1Xor takes 8 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_8x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x1_64Xor takes 8 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x2 takes 8 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_8x2_64 takes 8 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -1837,27 +2370,27 @@ func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x2 takes 8 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x2_64Xor takes 8 inputs and produces 2 outputs. // //go:noescape func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x2Xor takes 8 inputs and produces 2 outputs. +// mulAvxGFNI_8x2Xor takes 8 inputs and produces 2 outputs. 
// //go:noescape -func mulAvxTwo_8x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x2_64Xor takes 8 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x3 takes 8 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_8x3_64 takes 8 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -1870,15 +2403,21 @@ func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x3 takes 8 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x3_64Xor takes 8 inputs and produces 3 outputs. // //go:noescape func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x3Xor takes 8 inputs and produces 3 outputs. +// mulAvxGFNI_8x3Xor takes 8 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_8x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x3_64Xor takes 8 inputs and produces 3 outputs. // @@ -1897,11 +2436,22 @@ func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x4 takes 8 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x4_64Xor takes 8 inputs and produces 4 outputs. // //go:noescape func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x4Xor takes 8 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x4Xor takes 8 inputs and produces 4 outputs. // //go:noescape @@ -1919,11 +2469,22 @@ func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x5 takes 8 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x5_64Xor takes 8 inputs and produces 5 outputs. // //go:noescape func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x5Xor takes 8 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x5Xor takes 8 inputs and produces 5 outputs. // //go:noescape @@ -1941,11 +2502,22 @@ func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x6 takes 8 inputs and produces 6 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x6_64Xor takes 8 inputs and produces 6 outputs. // //go:noescape func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x6Xor takes 8 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x6Xor takes 8 inputs and produces 6 outputs. // //go:noescape @@ -1963,11 +2535,22 @@ func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x7 takes 8 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x7_64Xor takes 8 inputs and produces 7 outputs. // //go:noescape func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x7Xor takes 8 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x7Xor takes 8 inputs and produces 7 outputs. // //go:noescape @@ -1985,11 +2568,22 @@ func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x8 takes 8 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x8_64Xor takes 8 inputs and produces 8 outputs. // //go:noescape func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x8Xor takes 8 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x8Xor takes 8 inputs and produces 8 outputs. // //go:noescape @@ -2007,11 +2601,22 @@ func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x9 takes 8 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x9_64Xor takes 8 inputs and produces 9 outputs. // //go:noescape func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x9Xor takes 8 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x9Xor takes 8 inputs and produces 9 outputs. // //go:noescape @@ -2029,21 +2634,26 @@ func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x10 takes 8 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x10_64Xor takes 8 inputs and produces 10 outputs. // //go:noescape func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_8x10Xor takes 8 inputs and produces 10 outputs. 
+// mulAvxGFNI_8x10Xor takes 8 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x1 takes 9 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_8x10Xor takes 8 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x1_64 takes 9 inputs and produces 1 outputs. // The output is initialized to 0. @@ -2057,27 +2667,27 @@ func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x1 takes 9 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x1_64Xor takes 9 inputs and produces 1 outputs. // //go:noescape func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x1Xor takes 9 inputs and produces 1 outputs. +// mulAvxGFNI_9x1Xor takes 9 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_9x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x1_64Xor takes 9 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x2 takes 9 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_9x2_64 takes 9 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -2090,27 +2700,27 @@ func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x2 takes 9 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x2_64Xor takes 9 inputs and produces 2 outputs. // //go:noescape func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x2Xor takes 9 inputs and produces 2 outputs. +// mulAvxGFNI_9x2Xor takes 9 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_9x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x2_64Xor takes 9 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x3 takes 9 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_9x3_64 takes 9 inputs and produces 3 outputs. // The output is initialized to 0. 
// @@ -2123,15 +2733,21 @@ func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int //go:noescape func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x3 takes 9 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x3_64Xor takes 9 inputs and produces 3 outputs. // //go:noescape func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x3Xor takes 9 inputs and produces 3 outputs. +// mulAvxGFNI_9x3Xor takes 9 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_9x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x3_64Xor takes 9 inputs and produces 3 outputs. // @@ -2150,11 +2766,22 @@ func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x4 takes 9 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x4_64Xor takes 9 inputs and produces 4 outputs. // //go:noescape func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x4Xor takes 9 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x4Xor takes 9 inputs and produces 4 outputs. // //go:noescape @@ -2172,11 +2799,22 @@ func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x5 takes 9 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x5_64Xor takes 9 inputs and produces 5 outputs. // //go:noescape func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x5Xor takes 9 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x5Xor takes 9 inputs and produces 5 outputs. // //go:noescape @@ -2194,11 +2832,22 @@ func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x6 takes 9 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x6_64Xor takes 9 inputs and produces 6 outputs. // //go:noescape func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x6Xor takes 9 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x6Xor takes 9 inputs and produces 6 outputs. 
// //go:noescape @@ -2216,11 +2865,22 @@ func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x7 takes 9 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x7_64Xor takes 9 inputs and produces 7 outputs. // //go:noescape func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x7Xor takes 9 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x7Xor takes 9 inputs and produces 7 outputs. // //go:noescape @@ -2238,11 +2898,22 @@ func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x8 takes 9 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x8_64Xor takes 9 inputs and produces 8 outputs. // //go:noescape func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x8Xor takes 9 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x8Xor takes 9 inputs and produces 8 outputs. // //go:noescape @@ -2260,11 +2931,22 @@ func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x9 takes 9 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x9_64Xor takes 9 inputs and produces 9 outputs. // //go:noescape func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x9Xor takes 9 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x9Xor takes 9 inputs and produces 9 outputs. // //go:noescape @@ -2282,21 +2964,26 @@ func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x10 takes 9 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x10_64Xor takes 9 inputs and produces 10 outputs. // //go:noescape func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_9x10Xor takes 9 inputs and produces 10 outputs. +// mulAvxGFNI_9x10Xor takes 9 inputs and produces 10 outputs. // //go:noescape -func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x1 takes 10 inputs and produces 1 outputs. -// The output is initialized to 0. +// mulAvxTwo_9x10Xor takes 9 inputs and produces 10 outputs. 
// //go:noescape -func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x1_64 takes 10 inputs and produces 1 outputs. // The output is initialized to 0. @@ -2310,27 +2997,27 @@ func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n in //go:noescape func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x1 takes 10 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x1_64Xor takes 10 inputs and produces 1 outputs. // //go:noescape func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x1Xor takes 10 inputs and produces 1 outputs. +// mulAvxGFNI_10x1Xor takes 10 inputs and produces 1 outputs. // //go:noescape -func mulAvxTwo_10x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x1_64Xor takes 10 inputs and produces 1 outputs. // //go:noescape func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x2 takes 10 inputs and produces 2 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_10x2_64 takes 10 inputs and produces 2 outputs. // The output is initialized to 0. // @@ -2343,27 +3030,27 @@ func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n in //go:noescape func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x2 takes 10 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x2_64Xor takes 10 inputs and produces 2 outputs. // //go:noescape func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x2Xor takes 10 inputs and produces 2 outputs. +// mulAvxGFNI_10x2Xor takes 10 inputs and produces 2 outputs. // //go:noescape -func mulAvxTwo_10x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x2_64Xor takes 10 inputs and produces 2 outputs. // //go:noescape func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x3 takes 10 inputs and produces 3 outputs. -// The output is initialized to 0. -// -//go:noescape -func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) - // mulAvxTwo_10x3_64 takes 10 inputs and produces 3 outputs. // The output is initialized to 0. // @@ -2376,15 +3063,21 @@ func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n in //go:noescape func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x3 takes 10 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x3_64Xor takes 10 inputs and produces 3 outputs. 
// //go:noescape func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// mulAvxTwo_10x3Xor takes 10 inputs and produces 3 outputs. +// mulAvxGFNI_10x3Xor takes 10 inputs and produces 3 outputs. // //go:noescape -func mulAvxTwo_10x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x3_64Xor takes 10 inputs and produces 3 outputs. // @@ -2403,11 +3096,22 @@ func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x4 takes 10 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x4_64Xor takes 10 inputs and produces 4 outputs. // //go:noescape func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x4Xor takes 10 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x4Xor takes 10 inputs and produces 4 outputs. // //go:noescape @@ -2425,11 +3129,22 @@ func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x5 takes 10 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x5_64Xor takes 10 inputs and produces 5 outputs. // //go:noescape func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x5Xor takes 10 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x5Xor takes 10 inputs and produces 5 outputs. // //go:noescape @@ -2447,11 +3162,22 @@ func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x6 takes 10 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x6_64Xor takes 10 inputs and produces 6 outputs. // //go:noescape func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x6Xor takes 10 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x6Xor takes 10 inputs and produces 6 outputs. // //go:noescape @@ -2469,11 +3195,22 @@ func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x7 takes 10 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x7_64Xor takes 10 inputs and produces 7 outputs. 
// //go:noescape func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x7Xor takes 10 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x7Xor takes 10 inputs and produces 7 outputs. // //go:noescape @@ -2491,11 +3228,22 @@ func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x8 takes 10 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x8_64Xor takes 10 inputs and produces 8 outputs. // //go:noescape func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x8Xor takes 10 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x8Xor takes 10 inputs and produces 8 outputs. // //go:noescape @@ -2513,11 +3261,22 @@ func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x9 takes 10 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x9_64Xor takes 10 inputs and produces 9 outputs. // //go:noescape func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x9Xor takes 10 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x9Xor takes 10 inputs and produces 9 outputs. // //go:noescape @@ -2535,11 +3294,22 @@ func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x10 takes 10 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x10_64Xor takes 10 inputs and produces 10 outputs. // //go:noescape func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x10Xor takes 10 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x10Xor takes 10 inputs and produces 10 outputs. 
// //go:noescape diff --git a/galois_gen_amd64.s b/galois_gen_amd64.s index b3d0d998..ad253a65 100644 --- a/galois_gen_amd64.s +++ b/galois_gen_amd64.s @@ -110,57 +110,6 @@ end: VZEROUPPER RET -// func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_1x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 6 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), DX - MOVQ start+72(FP), BX - - // Add start offset to output - ADDQ BX, DX - - // Add start offset to input - ADDQ BX, CX - MOVQ $0x0000000f, BX - MOVQ BX, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_1x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (CX), Y2 - ADDQ $0x20, CX - VPSRLQ $0x04, Y2, Y4 - VPAND Y3, Y2, Y2 - VPAND Y3, Y4, Y4 - VPSHUFB Y2, Y0, Y2 - VPSHUFB Y4, Y1, Y4 - VPXOR Y2, Y4, Y2 - - // Store 1 outputs - VMOVDQU Y2, (DX) - ADDQ $0x20, DX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_1x1_loop - VZEROUPPER - -mulAvxTwo_1x1_end: - RET - // func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x1_64(SB), $0-88 @@ -264,6 +213,49 @@ mulGFNI_1x1_64_loop: mulGFNI_1x1_64_end: RET +// func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x1_end + VBROADCASTSD (CX), Y0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulAvxGFNI_1x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y1 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y1, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x1_loop + VZEROUPPER + +mulAvxGFNI_1x1_end: + RET + // func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x1_64Xor(SB), $0-88 @@ -311,56 +303,51 @@ mulGFNI_1x1_64Xor_loop: mulGFNI_1x1_64Xor_end: RET -// func mulAvxTwo_1x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_1x1Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 6 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), DX - MOVQ start+72(FP), BX + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x1Xor_end + VBROADCASTSD (CX), Y0 + MOVQ 
in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX // Add start offset to output ADDQ BX, DX // Add start offset to input - ADDQ BX, CX - MOVQ $0x0000000f, BX - MOVQ BX, X3 - VPBROADCASTB X3, Y3 + ADDQ BX, CX + +mulAvxGFNI_1x1Xor_loop: + // Load 1 outputs + VMOVDQU (DX), Y1 -mulAvxTwo_1x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (CX), Y4 - ADDQ $0x20, CX - VPSRLQ $0x04, Y4, Y5 - VPAND Y3, Y4, Y4 - VPAND Y3, Y5, Y5 - VMOVDQU (DX), Y2 - VPSHUFB Y4, Y0, Y4 - VPSHUFB Y5, Y1, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (CX), Y2 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y2, Y2 + VXORPD Y1, Y2, Y1 // Store 1 outputs - VMOVDQU Y2, (DX) + VMOVDQU Y1, (DX) ADDQ $0x20, DX // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x1Xor_loop + JNZ mulAvxGFNI_1x1Xor_loop VZEROUPPER -mulAvxTwo_1x1Xor_end: +mulAvxGFNI_1x1Xor_end: RET // func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -427,66 +414,6 @@ mulAvxTwo_1x1_64Xor_loop: mulAvxTwo_1x1_64Xor_end: RET -// func mulAvxTwo_1x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_1x2(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 11 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x2_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), DX - MOVQ start+72(FP), SI - - // Add start offset to output - ADDQ SI, BX - ADDQ SI, DX - - // Add start offset to input - ADDQ SI, CX - MOVQ $0x0000000f, SI - MOVQ SI, X6 - VPBROADCASTB X6, Y6 - -mulAvxTwo_1x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (CX), Y8 - ADDQ $0x20, CX - VPSRLQ $0x04, Y8, Y9 - VPAND Y6, Y8, Y8 - VPAND Y6, Y9, Y9 - VPSHUFB Y8, Y0, Y5 - VPSHUFB Y9, Y1, Y7 - VPXOR Y5, Y7, Y4 - VPSHUFB Y8, Y2, Y5 - VPSHUFB Y9, Y3, Y7 - VPXOR Y5, Y7, Y5 - - // Store 2 outputs - VMOVDQU Y4, (BX) - ADDQ $0x20, BX - VMOVDQU Y5, (DX) - ADDQ $0x20, DX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_1x2_loop - VZEROUPPER - -mulAvxTwo_1x2_end: - RET - // func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x2_64(SB), $0-88 @@ -609,6 +536,55 @@ mulGFNI_1x2_64_loop: mulGFNI_1x2_64_end: RET +// func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulAvxGFNI_1x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y2 + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + VMOVDQU Y3, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + 
DECQ AX + JNZ mulAvxGFNI_1x2_loop + VZEROUPPER + +mulAvxGFNI_1x2_end: + RET + // func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x2_64Xor(SB), $0-88 @@ -664,66 +640,59 @@ mulGFNI_1x2_64Xor_loop: mulGFNI_1x2_64Xor_end: RET -// func mulAvxTwo_1x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_1x2Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x2Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 11 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x2Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), DX - MOVQ start+72(FP), SI + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI // Add start offset to output ADDQ SI, BX ADDQ SI, DX // Add start offset to input - ADDQ SI, CX - MOVQ $0x0000000f, SI - MOVQ SI, X6 - VPBROADCASTB X6, Y6 + ADDQ SI, CX + +mulAvxGFNI_1x2Xor_loop: + // Load 2 outputs + VMOVDQU (BX), Y2 + VMOVDQU (DX), Y3 -mulAvxTwo_1x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (CX), Y9 - ADDQ $0x20, CX - VPSRLQ $0x04, Y9, Y10 - VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VMOVDQU (BX), Y4 - VPSHUFB Y9, Y0, Y7 - VPSHUFB Y10, Y1, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - VMOVDQU (DX), Y5 - VPSHUFB Y9, Y2, Y7 - VPSHUFB Y10, Y3, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y5 + VXORPD Y2, Y5, Y2 + VGF2P8AFFINEQB $0x00, Y1, Y4, Y5 + VXORPD Y3, Y5, Y3 // Store 2 outputs - VMOVDQU Y4, (BX) + VMOVDQU Y2, (BX) ADDQ $0x20, BX - VMOVDQU Y5, (DX) + VMOVDQU Y3, (DX) ADDQ $0x20, DX // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x2Xor_loop + JNZ mulAvxGFNI_1x2Xor_loop VZEROUPPER -mulAvxTwo_1x2Xor_end: +mulAvxGFNI_1x2Xor_end: RET // func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -805,75 +774,6 @@ mulAvxTwo_1x2_64Xor_loop: mulAvxTwo_1x2_64Xor_end: RET -// func mulAvxTwo_1x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_1x3(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 14 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x3_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ start+72(FP), DI - - // Add start offset to output - ADDQ DI, BX - ADDQ DI, SI - ADDQ DI, DX - - // Add start offset to input - ADDQ DI, CX - MOVQ $0x0000000f, DI - MOVQ DI, X9 - VPBROADCASTB X9, Y9 - -mulAvxTwo_1x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (CX), 
Y11 - ADDQ $0x20, CX - VPSRLQ $0x04, Y11, Y12 - VPAND Y9, Y11, Y11 - VPAND Y9, Y12, Y12 - VPSHUFB Y11, Y0, Y8 - VPSHUFB Y12, Y1, Y10 - VPXOR Y8, Y10, Y6 - VPSHUFB Y11, Y2, Y8 - VPSHUFB Y12, Y3, Y10 - VPXOR Y8, Y10, Y7 - VPSHUFB Y11, Y4, Y8 - VPSHUFB Y12, Y5, Y10 - VPXOR Y8, Y10, Y8 - - // Store 3 outputs - VMOVDQU Y6, (BX) - ADDQ $0x20, BX - VMOVDQU Y7, (SI) - ADDQ $0x20, SI - VMOVDQU Y8, (DX) - ADDQ $0x20, DX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_1x3_loop - VZEROUPPER - -mulAvxTwo_1x3_end: - RET - // func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x3_64(SB), $0-88 @@ -1015,6 +915,61 @@ mulGFNI_1x3_64_loop: mulGFNI_1x3_64_end: RET +// func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulAvxGFNI_1x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y3 + VGF2P8AFFINEQB $0x00, Y1, Y5, Y4 + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x3_loop + VZEROUPPER + +mulAvxGFNI_1x3_end: + RET + // func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x3_64Xor(SB), $0-88 @@ -1078,30 +1033,28 @@ mulGFNI_1x3_64Xor_loop: mulGFNI_1x3_64Xor_end: RET -// func mulAvxTwo_1x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_1x3Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x3Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 14 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_1x3Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ start+72(FP), DI + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI // Add start offset to output ADDQ DI, BX @@ -1109,45 +1062,38 @@ TEXT ·mulAvxTwo_1x3Xor(SB), NOSPLIT, $0-88 ADDQ DI, DX // Add start offset to input - ADDQ DI, CX - 
MOVQ $0x0000000f, DI - MOVQ DI, X9 - VPBROADCASTB X9, Y9 + ADDQ DI, CX + +mulAvxGFNI_1x3Xor_loop: + // Load 3 outputs + VMOVDQU (BX), Y3 + VMOVDQU (SI), Y4 + VMOVDQU (DX), Y5 -mulAvxTwo_1x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (CX), Y12 - ADDQ $0x20, CX - VPSRLQ $0x04, Y12, Y13 - VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VMOVDQU (BX), Y6 - VPSHUFB Y12, Y0, Y10 - VPSHUFB Y13, Y1, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) - VMOVDQU (SI), Y7 - VPSHUFB Y12, Y2, Y10 - VPSHUFB Y13, Y3, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) - VMOVDQU (DX), Y8 - VPSHUFB Y12, Y4, Y10 - VPSHUFB Y13, Y5, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y7 + VXORPD Y3, Y7, Y3 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y5, Y7, Y5 // Store 3 outputs - VMOVDQU Y6, (BX) + VMOVDQU Y3, (BX) ADDQ $0x20, BX - VMOVDQU Y7, (SI) + VMOVDQU Y4, (SI) ADDQ $0x20, SI - VMOVDQU Y8, (DX) + VMOVDQU Y5, (DX) ADDQ $0x20, DX // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x3Xor_loop + JNZ mulAvxGFNI_1x3Xor_loop VZEROUPPER -mulAvxTwo_1x3Xor_end: +mulAvxGFNI_1x3Xor_end: RET // func mulAvxTwo_1x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -1383,6 +1329,67 @@ mulGFNI_1x4_64_loop: mulGFNI_1x4_64_end: RET +// func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x4(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulAvxGFNI_1x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y7, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y7, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + + // Store 4 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x4_loop + VZEROUPPER + +mulAvxGFNI_1x4_end: + RET + // func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x4_64Xor(SB), $0-88 @@ -1454,6 +1461,77 @@ mulGFNI_1x4_64Xor_loop: mulGFNI_1x4_64Xor_end: RET +// func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x4Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ 
start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulAvxGFNI_1x4Xor_loop: + // Load 4 outputs + VMOVDQU (BX), Y4 + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU (DX), Y7 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y9 + VXORPD Y4, Y9, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y9 + VXORPD Y5, Y9, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 4 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x4Xor_loop + VZEROUPPER + +mulAvxGFNI_1x4Xor_end: + RET + // func mulAvxTwo_1x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x4Xor(SB), NOSPLIT, $0-88 @@ -1690,6 +1768,73 @@ mulGFNI_1x5_64_loop: mulGFNI_1x5_64_end: RET +// func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x5(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulAvxGFNI_1x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y9, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + + // Store 5 outputs + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x5_loop + VZEROUPPER + +mulAvxGFNI_1x5_end: + RET + // func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x5_64Xor(SB), $0-88 @@ -1769,6 +1914,85 @@ mulGFNI_1x5_64Xor_loop: mulGFNI_1x5_64Xor_end: RET +// func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x5Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output 
+ ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulAvxGFNI_1x5Xor_loop: + // Load 5 outputs + VMOVDQU (BX), Y5 + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (DX), Y9 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y11 + VXORPD Y5, Y11, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y11 + VXORPD Y6, Y11, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y7, Y11, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 5 outputs + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x5Xor_loop + VZEROUPPER + +mulAvxGFNI_1x5Xor_end: + RET + // func mulAvxTwo_1x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x5Xor(SB), NOSPLIT, $0-88 @@ -2030,6 +2254,79 @@ mulGFNI_1x6_64_loop: mulGFNI_1x6_64_end: RET +// func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x6(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulAvxGFNI_1x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y11, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y11, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + + // Store 6 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x6_loop + VZEROUPPER + +mulAvxGFNI_1x6_end: + RET + // func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x6_64Xor(SB), $0-88 @@ -2117,6 +2414,93 @@ mulGFNI_1x6_64Xor_loop: mulGFNI_1x6_64Xor_end: RET +// func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x6Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + 
VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulAvxGFNI_1x6Xor_loop: + // Load 6 outputs + VMOVDQU (BX), Y6 + VMOVDQU (SI), Y7 + VMOVDQU (DI), Y8 + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (DX), Y11 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y6, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y7, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 6 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x6Xor_loop + VZEROUPPER + +mulAvxGFNI_1x6Xor_end: + RET + // func mulAvxTwo_1x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x6Xor(SB), NOSPLIT, $0-88 @@ -2403,6 +2787,85 @@ mulGFNI_1x7_64_loop: mulGFNI_1x7_64_end: RET +// func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x7(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulAvxGFNI_1x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (CX), Y13 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y13, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y13, Y13 + + // Store 7 outputs + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x7_loop + VZEROUPPER + +mulAvxGFNI_1x7_end: + RET + // func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x7_64Xor(SB), $0-88 @@ -2498,6 
+2961,101 @@ mulGFNI_1x7_64Xor_loop: mulGFNI_1x7_64Xor_end: RET +// func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x7Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulAvxGFNI_1x7Xor_loop: + // Load 7 outputs + VMOVDQU (BX), Y7 + VMOVDQU (SI), Y8 + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (DX), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x7Xor_loop + VZEROUPPER + +mulAvxGFNI_1x7Xor_end: + RET + // func mulAvxTwo_1x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x7Xor(SB), NOSPLIT, $0-88 @@ -2809,6 +3367,91 @@ mulGFNI_1x8_64_loop: mulGFNI_1x8_64_end: RET +// func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x8(SB), $0-88 + // Loading 6 of 8 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + +mulAvxGFNI_1x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y6 + VGF2P8AFFINEQB 
$0x00, Y1, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y13, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 56(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 8 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x8_loop + VZEROUPPER + +mulAvxGFNI_1x8_end: + RET + // func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x8_64Xor(SB), $0-88 @@ -2912,6 +3555,109 @@ mulGFNI_1x8_64Xor_loop: mulGFNI_1x8_64Xor_end: RET +// func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x8Xor(SB), $0-88 + // Loading 6 of 8 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + +mulAvxGFNI_1x8Xor_loop: + // Load 8 outputs + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x8Xor_loop + VZEROUPPER + +mulAvxGFNI_1x8Xor_end: + RET + // func mulAvxTwo_1x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x8Xor(SB), NOSPLIT, $0-88 @@ -3248,6 +3994,97 @@ mulGFNI_1x9_64_loop: mulGFNI_1x9_64_end: RET +// func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start 
int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x9(SB), $0-88 + // Loading 5 of 9 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + +mulAvxGFNI_1x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 64(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 9 outputs + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x9_loop + VZEROUPPER + +mulAvxGFNI_1x9_end: + RET + // func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x9_64Xor(SB), $0-88 @@ -3359,6 +4196,117 @@ mulGFNI_1x9_64Xor_loop: mulGFNI_1x9_64Xor_end: RET +// func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x9Xor(SB), $0-88 + // Loading 5 of 9 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + +mulAvxGFNI_1x9Xor_loop: + // Load 9 outputs + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB 
$0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x9Xor_loop + VZEROUPPER + +mulAvxGFNI_1x9Xor_end: + RET + // func mulAvxTwo_1x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x9Xor(SB), NOSPLIT, $0-88 @@ -3720,6 +4668,103 @@ mulGFNI_1x10_64_loop: mulGFNI_1x10_64_end: RET +// func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x10(SB), $0-88 + // Loading 4 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + +mulAvxGFNI_1x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y13, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y13, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 72(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 10 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x10_loop + VZEROUPPER + +mulAvxGFNI_1x10_end: + RET + // func 
mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x10_64Xor(SB), $0-88 @@ -3839,6 +4884,125 @@ mulGFNI_1x10_64Xor_loop: mulGFNI_1x10_64Xor_end: RET +// func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x10Xor(SB), $0-88 + // Loading 4 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + +mulAvxGFNI_1x10Xor_loop: + // Load 10 outputs + VMOVDQU (SI), Y4 + VMOVDQU (DI), Y5 + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x10Xor_loop + VZEROUPPER + +mulAvxGFNI_1x10Xor_end: + RET + // func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_1x10Xor(SB), NOSPLIT, $0-88 @@ -3981,71 +5145,6 @@ mulAvxTwo_1x10Xor_loop: mulAvxTwo_1x10Xor_end: RET -// func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 8 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x1_end - VMOVDQU (CX), Y0 - VMOVDQU 
32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), CX - MOVQ out_base+48(FP), BX - MOVQ (BX), BX - MOVQ start+72(FP), SI - - // Add start offset to output - ADDQ SI, BX - - // Add start offset to input - ADDQ SI, DX - ADDQ SI, CX - MOVQ $0x0000000f, SI - MOVQ SI, X5 - VPBROADCASTB X5, Y5 - -mulAvxTwo_2x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y5, Y6, Y6 - VPAND Y5, Y7, Y7 - VPSHUFB Y6, Y0, Y6 - VPSHUFB Y7, Y1, Y7 - VPXOR Y6, Y7, Y4 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (CX), Y6 - ADDQ $0x20, CX - VPSRLQ $0x04, Y6, Y7 - VPAND Y5, Y6, Y6 - VPAND Y5, Y7, Y7 - VPSHUFB Y6, Y2, Y6 - VPSHUFB Y7, Y3, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) - - // Store 1 outputs - VMOVDQU Y4, (BX) - ADDQ $0x20, BX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_2x1_loop - VZEROUPPER - -mulAvxTwo_2x1_end: - RET - // func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x1_64(SB), $0-88 @@ -4179,6 +5278,58 @@ mulGFNI_2x1_64_loop: mulGFNI_2x1_64_end: RET +// func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulAvxGFNI_2x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y3 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y2 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Store 1 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x1_loop + VZEROUPPER + +mulAvxGFNI_2x1_end: + RET + // func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x1_64Xor(SB), $0-88 @@ -4235,70 +5386,60 @@ mulGFNI_2x1_64Xor_loop: mulGFNI_2x1_64Xor_end: RET -// func mulAvxTwo_2x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x1Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 8 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), CX - MOVQ out_base+48(FP), BX - MOVQ (BX), BX - MOVQ start+72(FP), SI + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 
+ MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI // Add start offset to output ADDQ SI, BX // Add start offset to input - ADDQ SI, DX - ADDQ SI, CX - MOVQ $0x0000000f, SI - MOVQ SI, X5 - VPBROADCASTB X5, Y5 + ADDQ SI, DX + ADDQ SI, CX + +mulAvxGFNI_2x1Xor_loop: + // Load 1 outputs + VMOVDQU (BX), Y2 -mulAvxTwo_2x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y5, Y6, Y6 - VPAND Y5, Y7, Y7 - VMOVDQU (BX), Y4 - VPSHUFB Y6, Y0, Y6 - VPSHUFB Y7, Y1, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VMOVDQU (DX), Y3 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y3 + VXORPD Y2, Y3, Y2 // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (CX), Y6 - ADDQ $0x20, CX - VPSRLQ $0x04, Y6, Y7 - VPAND Y5, Y6, Y6 - VPAND Y5, Y7, Y7 - VPSHUFB Y6, Y2, Y6 - VPSHUFB Y7, Y3, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + VXORPD Y2, Y3, Y2 // Store 1 outputs - VMOVDQU Y4, (BX) + VMOVDQU Y2, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX - JNZ mulAvxTwo_2x1Xor_loop + JNZ mulAvxGFNI_2x1Xor_loop VZEROUPPER -mulAvxTwo_2x1Xor_end: +mulAvxGFNI_2x1Xor_end: RET // func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -4386,85 +5527,6 @@ mulAvxTwo_2x1_64Xor_loop: mulAvxTwo_2x1_64Xor_end: RET -// func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x2(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 15 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x2_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), CX - MOVQ out_base+48(FP), BX - MOVQ (BX), SI - MOVQ 24(BX), BX - MOVQ start+72(FP), DI - - // Add start offset to output - ADDQ DI, SI - ADDQ DI, BX - - // Add start offset to input - ADDQ DI, DX - ADDQ DI, CX - MOVQ $0x0000000f, DI - MOVQ DI, X10 - VPBROADCASTB X10, Y10 - -mulAvxTwo_2x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y13 - ADDQ $0x20, DX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VPSHUFB Y13, Y0, Y11 - VPSHUFB Y14, Y1, Y12 - VPXOR Y11, Y12, Y8 - VPSHUFB Y13, Y2, Y11 - VPSHUFB Y14, Y3, Y12 - VPXOR Y11, Y12, Y9 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (CX), Y13 - ADDQ $0x20, CX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VPSHUFB Y13, Y4, Y11 - VPSHUFB Y14, Y5, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) - VPSHUFB Y13, Y6, Y11 - VPSHUFB Y14, Y7, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) - - // Store 2 outputs - VMOVDQU Y8, (SI) - ADDQ $0x20, SI - VMOVDQU Y9, (BX) - ADDQ $0x20, BX - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_2x2_loop - VZEROUPPER - -mulAvxTwo_2x2_end: - RET - // func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x2_64(SB), $0-88 @@ -4628,6 +5690,67 @@ mulGFNI_2x2_64_loop: mulGFNI_2x2_64_end: RET +// func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI 
+TEXT ·mulAvxGFNI_2x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulAvxGFNI_2x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y5 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y3, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 2 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x2_loop + VZEROUPPER + +mulAvxGFNI_2x2_end: + RET + // func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x2_64Xor(SB), $0-88 @@ -4695,85 +5818,71 @@ mulGFNI_2x2_64Xor_loop: mulGFNI_2x2_64Xor_end: RET -// func mulAvxTwo_2x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x2Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x2Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 15 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x2Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), CX - MOVQ out_base+48(FP), BX - MOVQ (BX), SI - MOVQ 24(BX), BX - MOVQ start+72(FP), DI + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI // Add start offset to output ADDQ DI, SI ADDQ DI, BX // Add start offset to input - ADDQ DI, DX - ADDQ DI, CX - MOVQ $0x0000000f, DI - MOVQ DI, X10 - VPBROADCASTB X10, Y10 + ADDQ DI, DX + ADDQ DI, CX + +mulAvxGFNI_2x2Xor_loop: + // Load 2 outputs + VMOVDQU (SI), Y4 + VMOVDQU (BX), Y5 -mulAvxTwo_2x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y13 - ADDQ $0x20, DX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VMOVDQU (SI), Y8 - VPSHUFB Y13, Y0, Y11 - VPSHUFB Y14, Y1, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) - VMOVDQU (BX), Y9 - VPSHUFB Y13, Y2, Y11 - VPSHUFB Y14, Y3, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y1, 
Y6, Y7 + VXORPD Y5, Y7, Y5 // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (CX), Y13 - ADDQ $0x20, CX - VPSRLQ $0x04, Y13, Y14 - VPAND Y10, Y13, Y13 - VPAND Y10, Y14, Y14 - VPSHUFB Y13, Y4, Y11 - VPSHUFB Y14, Y5, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) - VPSHUFB Y13, Y6, Y11 - VPSHUFB Y14, Y7, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y3, Y6, Y7 + VXORPD Y5, Y7, Y5 // Store 2 outputs - VMOVDQU Y8, (SI) + VMOVDQU Y4, (SI) ADDQ $0x20, SI - VMOVDQU Y9, (BX) + VMOVDQU Y5, (BX) ADDQ $0x20, BX // Prepare for next loop DECQ AX - JNZ mulAvxTwo_2x2Xor_loop + JNZ mulAvxGFNI_2x2Xor_loop VZEROUPPER -mulAvxTwo_2x2Xor_end: +mulAvxGFNI_2x2Xor_end: RET // func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -4884,99 +5993,6 @@ mulAvxTwo_2x2_64Xor_loop: mulAvxTwo_2x2_64Xor_end: RET -// func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 20 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), DX - MOVQ out_base+48(FP), SI - MOVQ (SI), DI - MOVQ 24(SI), R8 - MOVQ 48(SI), SI - MOVQ start+72(FP), R9 - - // Add start offset to output - ADDQ R9, DI - ADDQ R9, R8 - ADDQ R9, SI - - // Add start offset to input - ADDQ R9, BX - ADDQ R9, DX - MOVQ $0x0000000f, R9 - MOVQ R9, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_2x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (DI) - ADDQ $0x20, DI - VMOVDQU Y1, (R8) - ADDQ $0x20, R8 - VMOVDQU Y2, (SI) - ADDQ $0x20, SI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_2x3_loop - VZEROUPPER - -mulAvxTwo_2x3_end: - RET - // func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x3_64(SB), $0-88 @@ -5170,6 +6186,76 @@ mulGFNI_2x3_64_loop: mulGFNI_2x3_64_end: RET +// func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 
8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulAvxGFNI_2x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y8 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Store 3 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x3_loop + VZEROUPPER + +mulAvxGFNI_2x3_end: + RET + // func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x3_64Xor(SB), $0-88 @@ -5248,100 +6334,82 @@ mulGFNI_2x3_64Xor_loop: mulGFNI_2x3_64Xor_end: RET -// func mulAvxTwo_2x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_2x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x3Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 20 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_2x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), DX - MOVQ out_base+48(FP), SI - MOVQ (SI), DI - MOVQ 24(SI), R8 - MOVQ 48(SI), SI - MOVQ start+72(FP), R9 + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 // Add start offset to output - ADDQ R9, DI - ADDQ R9, R8 - ADDQ R9, SI + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX // Add start offset to input - ADDQ R9, BX - ADDQ R9, DX - MOVQ $0x0000000f, R9 - MOVQ R9, X3 - VPBROADCASTB X3, Y3 + ADDQ R8, DX + ADDQ R8, CX + +mulAvxGFNI_2x3Xor_loop: + // Load 3 outputs + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (BX), Y8 -mulAvxTwo_2x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (DI), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R8), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (SI), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, 
Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y10 + VXORPD Y8, Y10, Y8 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y10 + VXORPD Y8, Y10, Y8 // Store 3 outputs - VMOVDQU Y0, (DI) - ADDQ $0x20, DI - VMOVDQU Y1, (R8) - ADDQ $0x20, R8 - VMOVDQU Y2, (SI) + VMOVDQU Y6, (SI) ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (BX) + ADDQ $0x20, BX // Prepare for next loop DECQ AX - JNZ mulAvxTwo_2x3Xor_loop + JNZ mulAvxGFNI_2x3Xor_loop VZEROUPPER -mulAvxTwo_2x3Xor_end: +mulAvxGFNI_2x3Xor_end: RET // func mulAvxTwo_2x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -5661,6 +6729,85 @@ mulGFNI_2x4_64_loop: mulGFNI_2x4_64_end: RET +// func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x4(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulAvxGFNI_2x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y11 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 4 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x4_loop + VZEROUPPER + +mulAvxGFNI_2x4_end: + RET + // func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x4_64Xor(SB), $0-88 @@ -5750,6 +6897,95 @@ mulGFNI_2x4_64Xor_loop: mulGFNI_2x4_64Xor_end: RET +// func mulAvxGFNI_2x4Xor(matrix 
[]uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x4Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulAvxGFNI_2x4Xor_loop: + // Load 4 outputs + VMOVDQU (SI), Y8 + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (BX), Y11 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 4 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x4Xor_loop + VZEROUPPER + +mulAvxGFNI_2x4Xor_end: + RET + // func mulAvxTwo_2x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x4Xor(SB), NOSPLIT, $0-88 @@ -6070,6 +7306,94 @@ mulGFNI_2x5_64_loop: mulGFNI_2x5_64_end: RET +// func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x5(SB), $0-88 + // Loading 9 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + +mulAvxGFNI_2x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 
bytes from input 1 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x5_loop + VZEROUPPER + +mulAvxGFNI_2x5_end: + RET + // func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x5_64Xor(SB), $0-88 @@ -6170,6 +7494,106 @@ mulGFNI_2x5_64Xor_loop: mulGFNI_2x5_64Xor_end: RET +// func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x5Xor(SB), $0-88 + // Loading 9 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + +mulAvxGFNI_2x5Xor_loop: + // Load 5 outputs + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x5Xor_loop + VZEROUPPER + +mulAvxGFNI_2x5Xor_end: + RET + // func mulAvxTwo_2x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x5Xor(SB), NOSPLIT, $0-88 @@ -6528,6 +7952,103 @@ mulGFNI_2x6_64_loop: mulGFNI_2x6_64_end: RET +// func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out 
[][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x6(SB), $0-88 + // Loading 8 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + +mulAvxGFNI_2x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x6_loop + VZEROUPPER + +mulAvxGFNI_2x6_end: + RET + // func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x6_64Xor(SB), $0-88 @@ -6639,6 +8160,117 @@ mulGFNI_2x6_64Xor_loop: mulGFNI_2x6_64Xor_end: RET +// func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x6Xor(SB), $0-88 + // Loading 8 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + +mulAvxGFNI_2x6Xor_loop: + // Load 6 outputs + VMOVDQU (DI), Y8 + VMOVDQU (R8), 
Y9 + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x6Xor_loop + VZEROUPPER + +mulAvxGFNI_2x6Xor_end: + RET + // func mulAvxTwo_2x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x6Xor(SB), NOSPLIT, $0-88 @@ -7035,6 +8667,112 @@ mulGFNI_2x7_64_loop: mulGFNI_2x7_64_end: RET +// func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x7(SB), $0-88 + // Loading 7 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + +mulAvxGFNI_2x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x7_loop + VZEROUPPER + +mulAvxGFNI_2x7_end: + RET + // func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x7_64Xor(SB), $0-88 @@ -7157,6 +8895,128 @@ mulGFNI_2x7_64Xor_loop: mulGFNI_2x7_64Xor_end: RET +// func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x7Xor(SB), $0-88 + // Loading 7 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + +mulAvxGFNI_2x7Xor_loop: + // Load 7 outputs + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, 
(R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x7Xor_loop + VZEROUPPER + +mulAvxGFNI_2x7Xor_end: + RET + // func mulAvxTwo_2x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x7Xor(SB), NOSPLIT, $0-88 @@ -7591,6 +9451,121 @@ mulGFNI_2x8_64_loop: mulGFNI_2x8_64_end: RET +// func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x8(SB), $0-88 + // Loading 6 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + +mulAvxGFNI_2x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x8_loop + VZEROUPPER + +mulAvxGFNI_2x8_end: + RET + // func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x8_64Xor(SB), $0-88 @@ -7724,6 +9699,139 @@ mulGFNI_2x8_64Xor_loop: mulGFNI_2x8_64Xor_end: RET +// func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x8Xor(SB), $0-88 + // Loading 6 of 16 
tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + +mulAvxGFNI_2x8Xor_loop: + // Load 8 outputs + VMOVDQU (DI), Y6 + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x8Xor_loop + VZEROUPPER + +mulAvxGFNI_2x8Xor_end: + RET + // func mulAvxTwo_2x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x8Xor(SB), NOSPLIT, $0-88 @@ -8196,6 +10304,130 @@ mulGFNI_2x9_64_loop: mulGFNI_2x9_64_end: RET +// func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x9(SB), $0-88 + // Loading 5 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x9_end + 
VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + +mulAvxGFNI_2x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x9_loop + VZEROUPPER + +mulAvxGFNI_2x9_end: + RET + // func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x9_64Xor(SB), $0-88 @@ -8340,6 +10572,150 @@ mulGFNI_2x9_64Xor_loop: mulGFNI_2x9_64Xor_end: RET +// func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x9Xor(SB), $0-88 + // Loading 5 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + 
MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + +mulAvxGFNI_2x9Xor_loop: + // Load 9 outputs + VMOVDQU (DI), Y5 + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x9Xor_loop + VZEROUPPER + +mulAvxGFNI_2x9Xor_end: + RET + // func mulAvxTwo_2x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x9Xor(SB), NOSPLIT, $0-88 @@ -8850,6 +11226,139 @@ mulGFNI_2x10_64_loop: mulGFNI_2x10_64_end: RET +// func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x10(SB), $8-88 + // Loading 4 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + 
MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + +mulAvxGFNI_2x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x10_loop + VZEROUPPER + +mulAvxGFNI_2x10_end: + RET + // func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x10_64Xor(SB), $0-88 @@ -9005,6 +11514,161 @@ mulGFNI_2x10_64Xor_loop: mulGFNI_2x10_64Xor_end: RET +// func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x10Xor(SB), $8-88 + // Loading 4 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ 
start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + +mulAvxGFNI_2x10Xor_loop: + // Load 10 outputs + VMOVDQU (DI), Y4 + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x10Xor_loop + VZEROUPPER + +mulAvxGFNI_2x10Xor_end: + RET + // func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_2x10Xor(SB), NOSPLIT, $8-88 @@ -9206,85 +11870,6 @@ mulAvxTwo_2x10Xor_loop: mulAvxTwo_2x10Xor_end: RET -// func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 10 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), 
Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), CX - MOVQ out_base+48(FP), SI - MOVQ (SI), SI - MOVQ start+72(FP), DI - - // Add start offset to output - ADDQ DI, SI - - // Add start offset to input - ADDQ DI, DX - ADDQ DI, BX - ADDQ DI, CX - MOVQ $0x0000000f, DI - MOVQ DI, X7 - VPBROADCASTB X7, Y7 - -mulAvxTwo_3x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y8 - ADDQ $0x20, DX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VPSHUFB Y8, Y0, Y8 - VPSHUFB Y9, Y1, Y9 - VPXOR Y8, Y9, Y6 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y8 - ADDQ $0x20, BX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VPSHUFB Y8, Y2, Y8 - VPSHUFB Y9, Y3, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (CX), Y8 - ADDQ $0x20, CX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VPSHUFB Y8, Y4, Y8 - VPSHUFB Y9, Y5, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) - - // Store 1 outputs - VMOVDQU Y6, (SI) - ADDQ $0x20, SI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_3x1_loop - VZEROUPPER - -mulAvxTwo_3x1_end: - RET - // func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x1_64(SB), $0-88 @@ -9448,6 +12033,67 @@ mulGFNI_3x1_64_loop: mulGFNI_3x1_64_end: RET +// func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulAvxGFNI_3x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y3 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Store 1 outputs + VMOVDQU Y3, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x1_loop + VZEROUPPER + +mulAvxGFNI_3x1_end: + RET + // func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x1_64Xor(SB), $0-88 @@ -9513,84 +12159,69 @@ mulGFNI_3x1_64Xor_loop: mulGFNI_3x1_64Xor_end: RET -// func mulAvxTwo_3x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x1Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 10 YMM used - MOVQ n+80(FP), AX - MOVQ 
matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), CX - MOVQ out_base+48(FP), SI - MOVQ (SI), SI - MOVQ start+72(FP), DI + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI // Add start offset to output ADDQ DI, SI // Add start offset to input - ADDQ DI, DX - ADDQ DI, BX - ADDQ DI, CX - MOVQ $0x0000000f, DI - MOVQ DI, X7 - VPBROADCASTB X7, Y7 + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulAvxGFNI_3x1Xor_loop: + // Load 1 outputs + VMOVDQU (SI), Y3 -mulAvxTwo_3x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y8 - ADDQ $0x20, DX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VMOVDQU (SI), Y6 - VPSHUFB Y8, Y0, Y8 - VPSHUFB Y9, Y1, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y4 + VXORPD Y3, Y4, Y3 // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y8 - ADDQ $0x20, BX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VPSHUFB Y8, Y2, Y8 - VPSHUFB Y9, Y3, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 + VXORPD Y3, Y4, Y3 // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (CX), Y8 - ADDQ $0x20, CX - VPSRLQ $0x04, Y8, Y9 - VPAND Y7, Y8, Y8 - VPAND Y7, Y9, Y9 - VPSHUFB Y8, Y4, Y8 - VPSHUFB Y9, Y5, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 + VXORPD Y3, Y4, Y3 // Store 1 outputs - VMOVDQU Y6, (SI) + VMOVDQU Y3, (SI) ADDQ $0x20, SI // Prepare for next loop DECQ AX - JNZ mulAvxTwo_3x1Xor_loop + JNZ mulAvxGFNI_3x1Xor_loop VZEROUPPER -mulAvxTwo_3x1Xor_end: +mulAvxGFNI_3x1Xor_end: RET // func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -9699,104 +12330,6 @@ mulAvxTwo_3x1_64Xor_loop: mulAvxTwo_3x1_64Xor_end: RET -// func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 19 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ out_base+48(FP), DI - MOVQ (DI), R8 - MOVQ 24(DI), DI - MOVQ start+72(FP), R9 - - // Add start offset to output - ADDQ R9, R8 - ADDQ R9, DI - - // Add start offset to input - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DX - MOVQ $0x0000000f, R9 - MOVQ R9, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_3x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load 
and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R8) - ADDQ $0x20, R8 - VMOVDQU Y1, (DI) - ADDQ $0x20, DI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_3x2_loop - VZEROUPPER - -mulAvxTwo_3x2_end: - RET - // func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x2_64(SB), $0-88 @@ -10001,6 +12534,79 @@ mulGFNI_3x2_64_loop: mulGFNI_3x2_64_end: RET +// func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulAvxGFNI_3x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y7 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 2 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x2_loop + VZEROUPPER + +mulAvxGFNI_3x2_end: + RET + // func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x2_64Xor(SB), $0-88 @@ -10080,104 +12686,83 @@ mulGFNI_3x2_64Xor_loop: mulGFNI_3x2_64Xor_end: RET -// func mulAvxTwo_3x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x2Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers 
estimated 19 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ out_base+48(FP), DI - MOVQ (DI), R8 - MOVQ 24(DI), DI - MOVQ start+72(FP), R9 + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 // Add start offset to output - ADDQ R9, R8 - ADDQ R9, DI + ADDQ R8, DI + ADDQ R8, SI // Add start offset to input - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DX - MOVQ $0x0000000f, R9 - MOVQ R9, X2 - VPBROADCASTB X2, Y2 + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulAvxGFNI_3x2Xor_loop: + // Load 2 outputs + VMOVDQU (DI), Y6 + VMOVDQU (SI), Y7 -mulAvxTwo_3x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R8), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (DI), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y9 + VXORPD Y7, Y9, Y7 // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 + VXORPD Y7, Y9, Y7 // Store 2 outputs - VMOVDQU Y0, (R8) - ADDQ $0x20, R8 - VMOVDQU Y1, (DI) + VMOVDQU Y6, (DI) ADDQ $0x20, DI + VMOVDQU Y7, (SI) + ADDQ $0x20, SI // Prepare for next loop DECQ AX - JNZ mulAvxTwo_3x2Xor_loop + JNZ mulAvxGFNI_3x2Xor_loop VZEROUPPER -mulAvxTwo_3x2Xor_end: +mulAvxGFNI_3x2Xor_end: RET // func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -10317,123 +12902,6 @@ mulAvxTwo_3x2_64Xor_loop: mulAvxTwo_3x2_64Xor_end: RET -// func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 26 YMM used - MOVQ n+80(FP), AX - MOVQ 
matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ out_base+48(FP), DI - MOVQ (DI), R8 - MOVQ 24(DI), R9 - MOVQ 48(DI), DI - MOVQ start+72(FP), R10 - - // Add start offset to output - ADDQ R10, R8 - ADDQ R10, R9 - ADDQ R10, DI - - // Add start offset to input - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DX - MOVQ $0x0000000f, R10 - MOVQ R10, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_3x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R8) - ADDQ $0x20, R8 - VMOVDQU Y1, (R9) - ADDQ $0x20, R9 - VMOVDQU Y2, (DI) - ADDQ $0x20, DI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_3x3_loop - VZEROUPPER - -mulAvxTwo_3x3_end: - RET - // func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x3_64(SB), $0-88 @@ -10679,6 +13147,91 @@ mulGFNI_3x3_64_loop: mulGFNI_3x3_64_end: RET +// func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulAvxGFNI_3x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y9 
+ VGF2P8AFFINEQB $0x00, Y1, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y11 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 3 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x3_loop + VZEROUPPER + +mulAvxGFNI_3x3_end: + RET + // func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x3_64Xor(SB), $0-88 @@ -10772,124 +13325,97 @@ mulGFNI_3x3_64Xor_loop: mulGFNI_3x3_64Xor_end: RET -// func mulAvxTwo_3x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_3x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x3Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 26 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_3x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DX - MOVQ out_base+48(FP), DI - MOVQ (DI), R8 - MOVQ 24(DI), R9 - MOVQ 48(DI), DI - MOVQ start+72(FP), R10 + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 // Add start offset to output - ADDQ R10, R8 - ADDQ R10, R9 - ADDQ R10, DI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI // Add start offset to input - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DX - MOVQ $0x0000000f, R10 - MOVQ R10, X3 - VPBROADCASTB X3, Y3 + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulAvxGFNI_3x3Xor_loop: + // Load 3 outputs + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (SI), Y11 -mulAvxTwo_3x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R8), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R9), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (DI), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB 
$0x00, Y1, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y11, Y13, Y11 // Store 3 outputs - VMOVDQU Y0, (R8) - ADDQ $0x20, R8 - VMOVDQU Y1, (R9) - ADDQ $0x20, R9 - VMOVDQU Y2, (DI) + VMOVDQU Y9, (DI) ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (SI) + ADDQ $0x20, SI // Prepare for next loop DECQ AX - JNZ mulAvxTwo_3x3Xor_loop + JNZ mulAvxGFNI_3x3Xor_loop VZEROUPPER -mulAvxTwo_3x3Xor_end: +mulAvxGFNI_3x3Xor_end: RET // func mulAvxTwo_3x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -11293,6 +13819,103 @@ mulGFNI_3x4_64_loop: mulGFNI_3x4_64_end: RET +// func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x4(SB), $0-88 + // Loading 10 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + +mulAvxGFNI_3x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x4_loop + VZEROUPPER + +mulAvxGFNI_3x4_end: + RET + // func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x4_64Xor(SB), $0-88 @@ -11400,6 +14023,113 @@ mulGFNI_3x4_64Xor_loop: mulGFNI_3x4_64Xor_end: RET +// func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x4Xor(SB), $0-88 + // Loading 10 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + +mulAvxGFNI_3x4Xor_loop: + // Load 4 outputs + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x4Xor_loop + VZEROUPPER + 
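+	// Like mulAvxGFNI_3x4, but the four output vectors are loaded at the top of
+	// the loop and accumulated into rather than overwritten, so partial results
+	// from earlier shards are preserved.
+	// Only 10 of the 12 coefficient matrices stay resident in registers here:
+	// the input vector, the scratch register and the four accumulators use the
+	// rest, so the last two matrices are re-broadcast from 80(CX)/88(CX) on
+	// every iteration.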
+mulAvxGFNI_3x4Xor_end: + RET + // func mulAvxTwo_3x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x4Xor(SB), NOSPLIT, $0-88 @@ -11804,6 +14534,115 @@ mulGFNI_3x5_64_loop: mulGFNI_3x5_64_end: RET +// func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x5(SB), $0-88 + // Loading 9 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + +mulAvxGFNI_3x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x5_loop + VZEROUPPER + +mulAvxGFNI_3x5_end: + RET + // func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x5_64Xor(SB), $0-88 @@ -11925,6 +14764,127 @@ mulGFNI_3x5_64Xor_loop: mulGFNI_3x5_64Xor_end: RET +// func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x5Xor(SB), $0-88 + // Loading 9 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 
8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + +mulAvxGFNI_3x5Xor_loop: + // Load 5 outputs + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x5Xor_loop + VZEROUPPER + +mulAvxGFNI_3x5Xor_end: + RET + // func mulAvxTwo_3x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x5Xor(SB), NOSPLIT, $0-88 @@ -12380,6 +15340,127 @@ mulGFNI_3x6_64_loop: mulGFNI_3x6_64_end: RET +// func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x6(SB), $0-88 + // Loading 8 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add 
start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + +mulAvxGFNI_3x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x6_loop + VZEROUPPER + +mulAvxGFNI_3x6_end: + RET + // func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x6_64Xor(SB), $0-88 @@ -12515,6 +15596,141 @@ mulGFNI_3x6_64Xor_loop: mulGFNI_3x6_64Xor_end: RET +// func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x6Xor(SB), $0-88 + // Loading 8 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + +mulAvxGFNI_3x6Xor_loop: + // Load 6 outputs + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (DI), Y13 + + // 
Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x6Xor_loop + VZEROUPPER + +mulAvxGFNI_3x6Xor_end: + RET + // func mulAvxTwo_3x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x6Xor(SB), NOSPLIT, $0-88 @@ -13021,6 +16237,139 @@ mulGFNI_3x7_64_loop: mulGFNI_3x7_64_end: RET +// func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x7(SB), $0-88 + // Loading 7 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + +mulAvxGFNI_3x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, 
Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x7_loop + VZEROUPPER + +mulAvxGFNI_3x7_end: + RET + // func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x7_64Xor(SB), $0-88 @@ -13170,6 +16519,155 @@ mulGFNI_3x7_64Xor_loop: mulGFNI_3x7_64Xor_end: RET +// func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x7Xor(SB), $0-88 + // Loading 7 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + +mulAvxGFNI_3x7Xor_loop: + // Load 7 outputs + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, 
Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x7Xor_loop + VZEROUPPER + +mulAvxGFNI_3x7Xor_end: + RET + // func mulAvxTwo_3x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x7Xor(SB), NOSPLIT, $0-88 @@ -13725,6 +17223,151 @@ mulGFNI_3x8_64_loop: mulGFNI_3x8_64_end: RET +// func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x8(SB), $0-88 + // Loading 6 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulAvxGFNI_3x8_loop: + // Load 
and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x8_loop + VZEROUPPER + +mulAvxGFNI_3x8_end: + RET + // func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x8_64Xor(SB), $0-88 @@ -13886,6 +17529,169 @@ mulGFNI_3x8_64Xor_loop: mulGFNI_3x8_64Xor_end: RET +// func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x8Xor(SB), $0-88 + // Loading 6 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add 
start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulAvxGFNI_3x8Xor_loop: + // Load 8 outputs + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x8Xor_loop + VZEROUPPER + +mulAvxGFNI_3x8Xor_end: + RET + // func mulAvxTwo_3x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x8Xor(SB), NOSPLIT, $0-88 @@ -14488,6 +18294,163 @@ mulGFNI_3x9_64_loop: mulGFNI_3x9_64_end: RET +// func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x9(SB), $8-88 + // Loading 5 of 27 tables to registers + // Destination kept in GP 
registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulAvxGFNI_3x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // 
Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x9_loop + VZEROUPPER + +mulAvxGFNI_3x9_end: + RET + // func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x9_64Xor(SB), $8-88 @@ -14659,6 +18622,183 @@ mulGFNI_3x9_64Xor_loop: mulGFNI_3x9_64Xor_end: RET +// func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x9Xor(SB), $8-88 + // Loading 5 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulAvxGFNI_3x9Xor_loop: + // Load 9 outputs + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 
160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x9Xor_loop + VZEROUPPER + +mulAvxGFNI_3x9Xor_end: + RET + // func mulAvxTwo_3x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x9Xor(SB), NOSPLIT, $8-88 @@ -15314,6 +19454,179 @@ mulGFNI_3x10_64_loop: mulGFNI_3x10_64_end: RET +// func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x10(SB), $8-88 + // Loading 4 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_3x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_3x10_loop + VZEROUPPER + +mulAvxGFNI_3x10_end: + RET + // func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x10_64Xor(SB), $8-88 @@ -15499,6 +19812,201 @@ mulGFNI_3x10_64Xor_loop: mulGFNI_3x10_64Xor_end: RET +// func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x10Xor(SB), $8-88 + // Loading 4 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_3x10Xor_loop: + // Load 10 outputs + VMOVDQU (DI), Y4 + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + 
VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_3x10Xor_loop + VZEROUPPER + +mulAvxGFNI_3x10Xor_end: + RET + // func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // 
Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_3x10Xor(SB), NOSPLIT, $8-88 @@ -15761,99 +20269,6 @@ mulAvxTwo_3x10Xor_loop: mulAvxTwo_3x10Xor_end: RET -// func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 12 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), CX - MOVQ out_base+48(FP), DI - MOVQ (DI), DI - MOVQ start+72(FP), R8 - - // Add start offset to output - ADDQ R8, DI - - // Add start offset to input - ADDQ R8, DX - ADDQ R8, BX - ADDQ R8, SI - ADDQ R8, CX - MOVQ $0x0000000f, R8 - MOVQ R8, X9 - VPBROADCASTB X9, Y9 - -mulAvxTwo_4x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y10 - ADDQ $0x20, DX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y0, Y10 - VPSHUFB Y11, Y1, Y11 - VPXOR Y10, Y11, Y8 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y10 - ADDQ $0x20, BX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y2, Y10 - VPSHUFB Y11, Y3, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y10 - ADDQ $0x20, SI - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y4, Y10 - VPSHUFB Y11, Y5, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (CX), Y10 - ADDQ $0x20, CX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y6, Y10 - VPSHUFB Y11, Y7, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) - - // Store 1 outputs - VMOVDQU Y8, (DI) - ADDQ $0x20, DI - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_4x1_loop - VZEROUPPER - -mulAvxTwo_4x1_end: - RET - // func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x1_64(SB), $0-88 @@ -16047,6 +20462,76 @@ mulGFNI_4x1_64_loop: mulGFNI_4x1_64_end: RET +// func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulAvxGFNI_4x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y4 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 + VXORPD Y4, 
Y5, Y4 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Store 1 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x1_loop + VZEROUPPER + +mulAvxGFNI_4x1_end: + RET + // func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x1_64Xor(SB), $0-88 @@ -16121,98 +20606,78 @@ mulGFNI_4x1_64Xor_loop: mulGFNI_4x1_64Xor_end: RET -// func mulAvxTwo_4x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x1Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 12 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), CX - MOVQ out_base+48(FP), DI - MOVQ (DI), DI - MOVQ start+72(FP), R8 + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 // Add start offset to output ADDQ R8, DI // Add start offset to input - ADDQ R8, DX - ADDQ R8, BX - ADDQ R8, SI - ADDQ R8, CX - MOVQ $0x0000000f, R8 - MOVQ R8, X9 - VPBROADCASTB X9, Y9 + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulAvxGFNI_4x1Xor_loop: + // Load 1 outputs + VMOVDQU (DI), Y4 -mulAvxTwo_4x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y10 - ADDQ $0x20, DX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VMOVDQU (DI), Y8 - VPSHUFB Y10, Y0, Y10 - VPSHUFB Y11, Y1, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y5 + VXORPD Y4, Y5, Y4 // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y10 - ADDQ $0x20, BX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y2, Y10 - VPSHUFB Y11, Y3, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 + VXORPD Y4, Y5, Y4 // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y10 - ADDQ $0x20, SI - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y4, Y10 - VPSHUFB Y11, Y5, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + VXORPD Y4, Y5, Y4 // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (CX), Y10 - ADDQ $0x20, CX - VPSRLQ $0x04, Y10, Y11 - VPAND Y9, Y10, Y10 - VPAND Y9, Y11, Y11 - VPSHUFB Y10, Y6, Y10 
- VPSHUFB Y11, Y7, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 + VXORPD Y4, Y5, Y4 // Store 1 outputs - VMOVDQU Y8, (DI) + VMOVDQU Y4, (DI) ADDQ $0x20, DI // Prepare for next loop DECQ AX - JNZ mulAvxTwo_4x1Xor_loop + JNZ mulAvxGFNI_4x1Xor_loop VZEROUPPER -mulAvxTwo_4x1Xor_end: +mulAvxGFNI_4x1Xor_end: RET // func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -16342,123 +20807,6 @@ mulAvxTwo_4x1_64Xor_loop: mulAvxTwo_4x1_64Xor_end: RET -// func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 23 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), DX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R9 - MOVQ 24(R8), R8 - MOVQ start+72(FP), R10 - - // Add start offset to output - ADDQ R10, R9 - ADDQ R10, R8 - - // Add start offset to input - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, DX - MOVQ $0x0000000f, R10 - MOVQ R10, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_4x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R9) - ADDQ $0x20, R9 - VMOVDQU Y1, (R8) - ADDQ $0x20, R8 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_4x2_loop - VZEROUPPER - -mulAvxTwo_4x2_end: - RET - // func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x2_64(SB), $0-88 @@ -16704,6 +21052,91 @@ mulGFNI_4x2_64_loop: mulGFNI_4x2_64_end: RET +// func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers 
estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulAvxGFNI_4x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y9 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 2 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x2_loop + VZEROUPPER + +mulAvxGFNI_4x2_end: + RET + // func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x2_64Xor(SB), $0-88 @@ -16795,123 +21228,95 @@ mulGFNI_4x2_64Xor_loop: mulGFNI_4x2_64Xor_end: RET -// func mulAvxTwo_4x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x2Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 23 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), DX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R9 - MOVQ 24(R8), R8 - MOVQ start+72(FP), R10 + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 // Add start offset to output - ADDQ R10, R9 - ADDQ R10, R8 + ADDQ R9, R8 + ADDQ R9, DI // Add start offset to input - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, DX - MOVQ $0x0000000f, R10 - MOVQ R10, X2 - VPBROADCASTB X2, Y2 + ADDQ R9, DX + ADDQ 
R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulAvxGFNI_4x2Xor_loop: + // Load 2 outputs + VMOVDQU (R8), Y8 + VMOVDQU (DI), Y9 -mulAvxTwo_4x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R9), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R8), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y11 + VXORPD Y9, Y11, Y9 // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y9, Y11, Y9 // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 + VXORPD Y9, Y11, Y9 // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 + VXORPD Y9, Y11, Y9 // Store 2 outputs - VMOVDQU Y0, (R9) - ADDQ $0x20, R9 - VMOVDQU Y1, (R8) + VMOVDQU Y8, (R8) ADDQ $0x20, R8 + VMOVDQU Y9, (DI) + ADDQ $0x20, DI // Prepare for next loop DECQ AX - JNZ mulAvxTwo_4x2Xor_loop + JNZ mulAvxGFNI_4x2Xor_loop VZEROUPPER -mulAvxTwo_4x2Xor_end: +mulAvxGFNI_4x2Xor_end: RET // func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -17080,147 +21485,6 @@ mulAvxTwo_4x2_64Xor_loop: mulAvxTwo_4x2_64Xor_end: RET -// func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 32 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), DX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R9 - MOVQ 24(R8), R10 - MOVQ 48(R8), R8 - MOVQ start+72(FP), R11 - - // Add start offset to output - ADDQ R11, R9 - ADDQ R11, R10 - ADDQ R11, R8 - - // Add start offset to input - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, 
DX - MOVQ $0x0000000f, R11 - MOVQ R11, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_4x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R9) - ADDQ $0x20, R9 - VMOVDQU Y1, (R10) - ADDQ $0x20, R10 - VMOVDQU Y2, (R8) - ADDQ $0x20, R8 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_4x3_loop - VZEROUPPER - -mulAvxTwo_4x3_end: - RET - // func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x3_64(SB), $0-88 @@ -17518,6 +21782,106 @@ mulGFNI_4x3_64_loop: mulGFNI_4x3_64_end: RET +// func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x3(SB), $0-88 + // Loading 11 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + 
ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + +mulAvxGFNI_4x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x3_loop + VZEROUPPER + +mulAvxGFNI_4x3_end: + RET + // func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x3_64Xor(SB), $0-88 @@ -17626,27 +21990,39 @@ mulGFNI_4x3_64Xor_loop: mulGFNI_4x3_64Xor_end: RET -// func mulAvxTwo_4x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_4x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x3Xor(SB), $0-88 + // Loading 11 of 12 tables to registers // Destination kept in GP registers - // Full registers estimated 32 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_4x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), DX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R9 - MOVQ 24(R8), R10 - MOVQ 48(R8), R8 - MOVQ start+72(FP), R11 + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 // Add start offset to output ADDQ R11, R9 @@ -17654,120 +22030,72 @@ TEXT ·mulAvxTwo_4x3Xor(SB), NOSPLIT, $0-88 ADDQ R11, R8 // Add start offset to input - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, DX - MOVQ $0x0000000f, R11 - MOVQ R11, X3 - VPBROADCASTB X3, Y3 + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + +mulAvxGFNI_4x3Xor_loop: + // Load 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (R8), Y13 -mulAvxTwo_4x3Xor_loop: // Load and 
process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R9), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R10), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R8), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU Y0, (R9) + VMOVDQU Y11, (R9) ADDQ $0x20, R9 - VMOVDQU Y1, (R10) + VMOVDQU Y12, (R10) ADDQ $0x20, R10 - VMOVDQU Y2, (R8) + VMOVDQU Y13, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_4x3Xor_loop + JNZ mulAvxGFNI_4x3Xor_loop VZEROUPPER -mulAvxTwo_4x3Xor_end: +mulAvxGFNI_4x3Xor_end: RET // func mulAvxTwo_4x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -18255,6 +22583,121 @@ mulGFNI_4x4_64_loop: mulGFNI_4x4_64_end: RET +// func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT 
·mulAvxGFNI_4x4(SB), $0-88 + // Loading 10 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + +mulAvxGFNI_4x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x4_loop + VZEROUPPER + +mulAvxGFNI_4x4_end: + RET + // func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x4_64Xor(SB), $0-88 @@ -18380,6 +22823,131 @@ mulGFNI_4x4_64Xor_loop: mulGFNI_4x4_64Xor_end: RET +// func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x4Xor(SB), $0-88 + // Loading 10 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 
+ MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + +mulAvxGFNI_4x4Xor_loop: + // Load 4 outputs + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x4Xor_loop + VZEROUPPER + +mulAvxGFNI_4x4Xor_end: + RET + // func mulAvxTwo_4x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x4Xor(SB), NOSPLIT, $0-88 @@ -18868,6 +23436,136 @@ mulGFNI_4x5_64_loop: mulGFNI_4x5_64_end: RET +// func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x5(SB), $0-88 + // Loading 9 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ 
R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + +mulAvxGFNI_4x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x5_loop + VZEROUPPER + +mulAvxGFNI_4x5_end: + RET + // func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x5_64Xor(SB), $0-88 @@ -19010,6 +23708,148 @@ mulGFNI_4x5_64Xor_loop: mulGFNI_4x5_64Xor_end: RET +// func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x5Xor(SB), $0-88 + // Loading 9 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ 
R13, DI + ADDQ R13, DX + +mulAvxGFNI_4x5Xor_loop: + // Load 5 outputs + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x5Xor_loop + VZEROUPPER + +mulAvxGFNI_4x5Xor_end: + RET + // func mulAvxTwo_4x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x5Xor(SB), NOSPLIT, $0-88 @@ -19562,6 +24402,151 @@ mulGFNI_4x6_64_loop: mulGFNI_4x6_64_end: RET +// func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x6(SB), $0-88 + // Loading 8 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ 
R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + +mulAvxGFNI_4x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x6_loop + VZEROUPPER + +mulAvxGFNI_4x6_end: + RET + // func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x6_64Xor(SB), $0-88 @@ -19721,6 +24706,165 @@ mulGFNI_4x6_64Xor_loop: mulGFNI_4x6_64Xor_end: RET +// func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x6Xor(SB), $0-88 + // Loading 8 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 
24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + +mulAvxGFNI_4x6Xor_loop: + // Load 6 outputs + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x6Xor_loop + VZEROUPPER + +mulAvxGFNI_4x6Xor_end: + RET + // func mulAvxTwo_4x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x6Xor(SB), NOSPLIT, $0-88 @@ -20332,6 +25476,166 @@ mulGFNI_4x7_64_loop: mulGFNI_4x7_64_end: RET +// func mulAvxGFNI_4x7(matrix []uint64, in 
[][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x7(SB), $0-88 + // Loading 7 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulAvxGFNI_4x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 
+ + // Store 7 outputs + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x7_loop + VZEROUPPER + +mulAvxGFNI_4x7_end: + RET + // func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x7_64Xor(SB), $0-88 @@ -20503,6 +25807,182 @@ mulGFNI_4x7_64Xor_loop: mulGFNI_4x7_64Xor_end: RET +// func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x7Xor(SB), $0-88 + // Loading 7 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulAvxGFNI_4x7Xor_loop: + // Load 7 outputs + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x7Xor_loop + VZEROUPPER + +mulAvxGFNI_4x7Xor_end: + RET + // func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x7Xor(SB), NOSPLIT, $0-88 @@ -21173,6 +26653,181 @@ mulGFNI_4x8_64_loop: mulGFNI_4x8_64_end: RET +// func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x8(SB), $8-88 + // Loading 6 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulAvxGFNI_4x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x8_loop + VZEROUPPER + +mulAvxGFNI_4x8_end: + RET + // func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x8_64Xor(SB), $8-88 @@ -21356,6 +27011,199 @@ mulGFNI_4x8_64Xor_loop: mulGFNI_4x8_64Xor_end: RET +// func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x8Xor(SB), $8-88 + // Loading 6 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + 
MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulAvxGFNI_4x8Xor_loop: + // Load 8 outputs + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + 
ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x8Xor_loop + VZEROUPPER + +mulAvxGFNI_4x8Xor_end: + RET + // func mulAvxTwo_4x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x8Xor(SB), NOSPLIT, $8-88 @@ -22091,6 +27939,200 @@ mulGFNI_4x9_64_loop: mulGFNI_4x9_64_end: RET +// func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x9(SB), $8-88 + // Loading 5 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_4x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_4x9_loop + VZEROUPPER + +mulAvxGFNI_4x9_end: + RET + // func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x9_64Xor(SB), $8-88 @@ -22290,6 +28332,220 @@ mulGFNI_4x9_64Xor_loop: mulGFNI_4x9_64Xor_end: RET +// func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x9Xor(SB), $8-88 + // Loading 5 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_4x9Xor_loop: + // Load 9 outputs + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU 
(R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU 
Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_4x9Xor_loop + VZEROUPPER + +mulAvxGFNI_4x9Xor_end: + RET + // func mulAvxTwo_4x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x9Xor(SB), NOSPLIT, $8-88 @@ -23038,6 +29294,190 @@ mulGFNI_4x10_64_loop: mulGFNI_4x10_64_end: RET +// func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x10(SB), $0-88 + // Loading 4 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulAvxGFNI_4x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y10, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y11, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y12, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y13, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxGFNI_4x10_loop + VZEROUPPER + +mulAvxGFNI_4x10_end: + RET + // func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x10_64Xor(SB), $0-88 @@ -23234,6 +29674,222 @@ mulGFNI_4x10_64Xor_loop: mulGFNI_4x10_64Xor_end: RET +// func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x10Xor(SB), $0-88 + // Loading 4 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulAvxGFNI_4x10Xor_loop: + // Load 10 outputs + MOVQ (R8), R10 + VMOVDQU (R10)(R9*1), Y4 + MOVQ 24(R8), R10 + VMOVDQU (R10)(R9*1), Y5 + MOVQ 48(R8), R10 + VMOVDQU (R10)(R9*1), Y6 + MOVQ 72(R8), R10 + VMOVDQU (R10)(R9*1), Y7 + MOVQ 96(R8), R10 + VMOVDQU (R10)(R9*1), Y8 + MOVQ 120(R8), R10 + VMOVDQU (R10)(R9*1), Y9 + MOVQ 144(R8), R10 + VMOVDQU (R10)(R9*1), Y10 + MOVQ 168(R8), R10 + VMOVDQU (R10)(R9*1), Y11 + MOVQ 192(R8), R10 + VMOVDQU (R10)(R9*1), Y12 + MOVQ 216(R8), R10 + VMOVDQU (R10)(R9*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ 
$0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y10, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y11, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y12, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y13, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxGFNI_4x10Xor_loop + VZEROUPPER + +mulAvxGFNI_4x10Xor_end: + RET + // func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_4x10Xor(SB), NOSPLIT, $0-88 @@ -23542,113 +30198,6 @@ mulAvxTwo_4x10Xor_loop: mulAvxTwo_4x10Xor_end: RET -// func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 14 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), CX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R8 - MOVQ start+72(FP), R9 - - // Add start offset to output - ADDQ R9, R8 - - // Add start offset to input - ADDQ R9, DX - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DI - ADDQ R9, CX - MOVQ $0x0000000f, R9 - MOVQ R9, X11 - VPBROADCASTB X11, Y11 - -mulAvxTwo_5x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y12 - ADDQ $0x20, DX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y0, Y12 - VPSHUFB Y13, Y1, Y13 - VPXOR Y12, Y13, Y10 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y12 - ADDQ $0x20, BX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y2, Y12 - VPSHUFB Y13, Y3, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y12 - ADDQ $0x20, SI - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y4, Y12 - VPSHUFB Y13, Y5, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y12 - ADDQ $0x20, DI - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y6, Y12 - VPSHUFB Y13, Y7, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (CX), Y12 - ADDQ $0x20, CX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y8, Y12 - VPSHUFB Y13, Y9, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) - - // Store 1 outputs - VMOVDQU Y10, (R8) - ADDQ $0x20, R8 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_5x1_loop - VZEROUPPER - -mulAvxTwo_5x1_end: - RET - // func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT 
·mulAvxTwo_5x1_64(SB), $0-88 @@ -23872,6 +30421,85 @@ mulGFNI_5x1_64_loop: mulGFNI_5x1_64_end: RET +// func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulAvxGFNI_5x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y5 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Store 1 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x1_loop + VZEROUPPER + +mulAvxGFNI_5x1_end: + RET + // func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x1_64Xor(SB), $0-88 @@ -23955,112 +30583,87 @@ mulGFNI_5x1_64Xor_loop: mulGFNI_5x1_64Xor_end: RET -// func mulAvxTwo_5x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x1Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 14 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), CX - MOVQ out_base+48(FP), R8 - MOVQ (R8), R8 - MOVQ start+72(FP), R9 + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 // Add start 
offset to output ADDQ R9, R8 // Add start offset to input - ADDQ R9, DX - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DI - ADDQ R9, CX - MOVQ $0x0000000f, R9 - MOVQ R9, X11 - VPBROADCASTB X11, Y11 + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulAvxGFNI_5x1Xor_loop: + // Load 1 outputs + VMOVDQU (R8), Y5 -mulAvxTwo_5x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y12 - ADDQ $0x20, DX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VMOVDQU (R8), Y10 - VPSHUFB Y12, Y0, Y12 - VPSHUFB Y13, Y1, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y6 + VXORPD Y5, Y6, Y5 // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y12 - ADDQ $0x20, BX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y2, Y12 - VPSHUFB Y13, Y3, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 + VXORPD Y5, Y6, Y5 // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y12 - ADDQ $0x20, SI - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y4, Y12 - VPSHUFB Y13, Y5, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 + VXORPD Y5, Y6, Y5 // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y12 - ADDQ $0x20, DI - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y6, Y12 - VPSHUFB Y13, Y7, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 + VXORPD Y5, Y6, Y5 // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (CX), Y12 - ADDQ $0x20, CX - VPSRLQ $0x04, Y12, Y13 - VPAND Y11, Y12, Y12 - VPAND Y11, Y13, Y13 - VPSHUFB Y12, Y8, Y12 - VPSHUFB Y13, Y9, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 + VXORPD Y5, Y6, Y5 // Store 1 outputs - VMOVDQU Y10, (R8) + VMOVDQU Y5, (R8) ADDQ $0x20, R8 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_5x1Xor_loop + JNZ mulAvxGFNI_5x1Xor_loop VZEROUPPER -mulAvxTwo_5x1Xor_end: +mulAvxGFNI_5x1Xor_end: RET // func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -24211,142 +30814,6 @@ mulAvxTwo_5x1_64Xor_loop: mulAvxTwo_5x1_64Xor_end: RET -// func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 27 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), DX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R10 - MOVQ 24(R9), R9 - MOVQ start+72(FP), R11 - - // Add start offset to output - ADDQ R11, R10 - ADDQ R11, R9 - - // Add start offset to input - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, R8 - ADDQ R11, DX - MOVQ $0x0000000f, R11 - MOVQ R11, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_5x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - 
VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R10) - ADDQ $0x20, R10 - VMOVDQU Y1, (R9) - ADDQ $0x20, R9 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_5x2_loop - VZEROUPPER - -mulAvxTwo_5x2_end: - RET - // func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x2_64(SB), $0-88 @@ -24633,6 +31100,103 @@ mulGFNI_5x2_64_loop: mulGFNI_5x2_64_end: RET +// func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulAvxGFNI_5x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y11 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 2 
outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 2 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x2_loop + VZEROUPPER + +mulAvxGFNI_5x2_end: + RET + // func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x2_64Xor(SB), $0-88 @@ -24736,142 +31300,107 @@ mulGFNI_5x2_64Xor_loop: mulGFNI_5x2_64Xor_end: RET -// func mulAvxTwo_5x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x2Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 27 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), DX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R10 - MOVQ 24(R9), R9 - MOVQ start+72(FP), R11 + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 // Add start offset to output - ADDQ R11, R10 - ADDQ R11, R9 + ADDQ R10, R9 + ADDQ R10, R8 // Add start offset to input - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, R8 - ADDQ R11, DX - MOVQ $0x0000000f, R11 - MOVQ R11, X2 - VPBROADCASTB X2, Y2 + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulAvxGFNI_5x2Xor_loop: + // Load 2 outputs + VMOVDQU (R9), Y10 + VMOVDQU (R8), Y11 -mulAvxTwo_5x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R10), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R9), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), 
Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y12, Y13 + VXORPD Y11, Y13, Y11 // Store 2 outputs - VMOVDQU Y0, (R10) - ADDQ $0x20, R10 - VMOVDQU Y1, (R9) + VMOVDQU Y10, (R9) ADDQ $0x20, R9 + VMOVDQU Y11, (R8) + ADDQ $0x20, R8 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_5x2Xor_loop + JNZ mulAvxGFNI_5x2Xor_loop VZEROUPPER -mulAvxTwo_5x2Xor_end: +mulAvxGFNI_5x2Xor_end: RET // func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -25069,171 +31598,6 @@ mulAvxTwo_5x2_64Xor_loop: mulAvxTwo_5x2_64Xor_end: RET -// func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 38 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), DX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R10 - MOVQ 24(R9), R11 - MOVQ 48(R9), R9 - MOVQ start+72(FP), R12 - - // Add start offset to output - ADDQ R12, R10 - ADDQ R12, R11 - ADDQ R12, R9 - - // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_5x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ 
$0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R10) - ADDQ $0x20, R10 - VMOVDQU Y1, (R11) - ADDQ $0x20, R11 - VMOVDQU Y2, (R9) - ADDQ $0x20, R9 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_5x3_loop - VZEROUPPER - -mulAvxTwo_5x3_end: - RET - // func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x3_64(SB), $0-88 @@ -25583,6 +31947,121 @@ mulGFNI_5x3_64_loop: mulGFNI_5x3_64_end: RET +// func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x3(SB), $0-88 + // Loading 11 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + 
MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + +mulAvxGFNI_5x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x3_loop + VZEROUPPER + +mulAvxGFNI_5x3_end: + RET + // func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x3_64Xor(SB), $0-88 @@ -25706,28 +32185,40 @@ mulGFNI_5x3_64Xor_loop: mulGFNI_5x3_64Xor_end: RET -// func mulAvxTwo_5x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_5x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x3Xor(SB), $0-88 + // Loading 11 of 15 tables to registers // Destination kept in GP registers - // Full registers estimated 38 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_5x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), DX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R10 - MOVQ 24(R9), R11 - MOVQ 48(R9), R9 - MOVQ start+72(FP), R12 + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + 
VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R10 @@ -25735,143 +32226,86 @@ TEXT ·mulAvxTwo_5x3Xor(SB), NOSPLIT, $0-88 ADDQ R12, R9 // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X3 - VPBROADCASTB X3, Y3 + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + +mulAvxGFNI_5x3Xor_loop: + // Load 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (R9), Y13 -mulAvxTwo_5x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R10), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R11), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R9), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU Y0, (R10) + VMOVDQU Y11, (R10) ADDQ $0x20, R10 - VMOVDQU Y1, (R11) + VMOVDQU Y12, (R11) ADDQ $0x20, R11 - VMOVDQU Y2, (R9) + VMOVDQU Y13, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_5x3Xor_loop + JNZ mulAvxGFNI_5x3Xor_loop VZEROUPPER -mulAvxTwo_5x3Xor_end: +mulAvxGFNI_5x3Xor_end: RET // func mulAvxTwo_5x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -26443,6 +32877,139 @@ mulGFNI_5x4_64_loop: mulGFNI_5x4_64_end: RET +// func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x4(SB), $0-88 + // Loading 10 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + +mulAvxGFNI_5x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, 
Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x4_loop + VZEROUPPER + +mulAvxGFNI_5x4_end: + RET + // func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x4_64Xor(SB), $0-88 @@ -26586,6 +33153,149 @@ mulGFNI_5x4_64Xor_loop: mulGFNI_5x4_64Xor_end: RET +// func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x4Xor(SB), $0-88 + // Loading 10 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + +mulAvxGFNI_5x4Xor_loop: + // Load 4 outputs + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD 
Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x4Xor_loop + VZEROUPPER + +mulAvxGFNI_5x4Xor_end: + RET + // func mulAvxTwo_5x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x4Xor(SB), NOSPLIT, $0-88 @@ -27158,6 +33868,157 @@ mulGFNI_5x5_64_loop: mulGFNI_5x5_64_end: RET +// func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x5(SB), $0-88 + // Loading 9 of 25 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + +mulAvxGFNI_5x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, 
DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x5_loop + VZEROUPPER + +mulAvxGFNI_5x5_end: + RET + // func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x5_64Xor(SB), $0-88 @@ -27321,6 +34182,169 @@ mulGFNI_5x5_64Xor_loop: mulGFNI_5x5_64Xor_end: RET +// func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x5Xor(SB), $0-88 + // Loading 9 of 25 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + +mulAvxGFNI_5x5Xor_loop: + // Load 5 outputs + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD 
Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x5Xor_loop + VZEROUPPER + +mulAvxGFNI_5x5Xor_end: + RET + // func mulAvxTwo_5x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x5Xor(SB), NOSPLIT, $0-88 @@ -27964,6 +34988,175 @@ mulGFNI_5x6_64_loop: mulGFNI_5x6_64_end: RET +// func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x6(SB), $0-88 + // Loading 8 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + 
MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulAvxGFNI_5x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x6_loop + VZEROUPPER + +mulAvxGFNI_5x6_end: + RET + // func mulGFNI_5x6_64Xor(matrix 
[]uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x6_64Xor(SB), $0-88 @@ -28141,6 +35334,189 @@ mulGFNI_5x6_64Xor_loop: mulGFNI_5x6_64Xor_end: RET +// func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x6Xor(SB), $0-88 + // Loading 8 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulAvxGFNI_5x6Xor_loop: + // Load 6 outputs + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x6Xor_loop + VZEROUPPER + +mulAvxGFNI_5x6Xor_end: + RET + // func mulAvxTwo_5x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x6Xor(SB), NOSPLIT, $0-88 @@ -28855,6 +36231,193 @@ mulGFNI_5x7_64_loop: mulGFNI_5x7_64_end: RET +// func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x7(SB), $8-88 + // Loading 7 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulAvxGFNI_5x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x7_loop + VZEROUPPER + +mulAvxGFNI_5x7_end: + RET + // func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x7_64Xor(SB), $8-88 @@ -29046,6 +36609,209 @@ mulGFNI_5x7_64Xor_loop: mulGFNI_5x7_64Xor_end: RET +// func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x7Xor(SB), $8-88 + // Loading 7 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX 
+ MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulAvxGFNI_5x7Xor_loop: + // Load 7 outputs + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 
+ VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x7Xor_loop + VZEROUPPER + +mulAvxGFNI_5x7Xor_end: + RET + // func mulAvxTwo_5x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x7Xor(SB), NOSPLIT, $8-88 @@ -29837,6 +37603,215 @@ mulGFNI_5x8_64_loop: mulGFNI_5x8_64_end: RET +// func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x8(SB), $8-88 + // Loading 6 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_5x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 
+ + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_5x8_loop + VZEROUPPER + +mulAvxGFNI_5x8_end: + RET + // func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x8_64Xor(SB), $8-88 @@ -30046,6 +38021,233 @@ mulGFNI_5x8_64Xor_loop: mulGFNI_5x8_64Xor_end: RET +// func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x8Xor(SB), $8-88 + // Loading 6 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), 
Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_5x8Xor_loop: + // Load 8 outputs + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_5x8Xor_loop + VZEROUPPER + +mulAvxGFNI_5x8Xor_end: + RET + // func mulAvxTwo_5x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x8Xor(SB), NOSPLIT, $8-88 @@ -30866,6 +39068,210 @@ mulGFNI_5x9_64_loop: mulGFNI_5x9_64_end: RET +// func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x9(SB), $0-88 + // Loading 5 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 
144(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x9_loop + VZEROUPPER + +mulAvxGFNI_5x9_end: + RET + // func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x9_64Xor(SB), $0-88 @@ -31075,6 +39481,239 @@ mulGFNI_5x9_64Xor_loop: mulGFNI_5x9_64Xor_end: RET +// func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x9Xor(SB), $0-88 + // Loading 5 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x9Xor_loop: + // Load 9 outputs + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y5 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y10 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y11 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y12 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x9Xor_loop + VZEROUPPER + +mulAvxGFNI_5x9Xor_end: + RET + // func mulAvxTwo_5x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x9Xor(SB), NOSPLIT, $0-88 @@ -31950,6 +40589,226 @@ mulGFNI_5x10_64_loop: mulGFNI_5x10_64_end: RET +// func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x10(SB), $0-88 + // Loading 4 of 50 tables to registers + // Destination kept on stack + // Full 
registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x10_loop + VZEROUPPER + +mulAvxGFNI_5x10_end: + RET + // func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x10_64Xor(SB), $0-88 @@ -32172,6 +41031,258 @@ mulGFNI_5x10_64Xor_loop: mulGFNI_5x10_64Xor_end: RET +// func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x10Xor(SB), $0-88 + // Loading 4 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x10Xor_loop: + // Load 10 outputs + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y4 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y5 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y10 + MOVQ 
168(R9), R11 + VMOVDQU (R11)(R10*1), Y11 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y12 + MOVQ 216(R9), R11 + VMOVDQU (R11)(R10*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, 
Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x10Xor_loop + VZEROUPPER + +mulAvxGFNI_5x10Xor_end: + RET + // func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_5x10Xor(SB), NOSPLIT, $0-88 @@ -32539,127 +41650,6 @@ mulAvxTwo_5x10Xor_loop: mulAvxTwo_5x10Xor_end: RET -// func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_6x1(SB), NOSPLIT, $0-88 - // Loading all tables to registers - // Destination kept in GP registers - // Full registers estimated 16 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x1_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - VMOVDQU 320(CX), Y10 - VMOVDQU 352(CX), Y11 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), R8 - MOVQ 120(CX), CX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R9 - MOVQ start+72(FP), R10 - - // Add start offset to output - ADDQ R10, R9 - - // Add start offset to input - ADDQ R10, DX - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, R8 - ADDQ R10, CX - MOVQ $0x0000000f, R10 - MOVQ R10, X13 - VPBROADCASTB X13, Y13 - -mulAvxTwo_6x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y14 - ADDQ $0x20, DX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y0, Y14 - VPSHUFB Y15, Y1, Y15 - VPXOR Y14, 
Y15, Y12 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y14 - ADDQ $0x20, BX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y2, Y14 - VPSHUFB Y15, Y3, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y14 - ADDQ $0x20, SI - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y4, Y14 - VPSHUFB Y15, Y5, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y14 - ADDQ $0x20, DI - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y6, Y14 - VPSHUFB Y15, Y7, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R8), Y14 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y8, Y14 - VPSHUFB Y15, Y9, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (CX), Y14 - ADDQ $0x20, CX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y10, Y14 - VPSHUFB Y15, Y11, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) - - // Store 1 outputs - VMOVDQU Y12, (R9) - ADDQ $0x20, R9 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_6x1_loop - VZEROUPPER - -mulAvxTwo_6x1_end: - RET - // func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x1_64(SB), $0-88 @@ -32913,6 +41903,94 @@ mulGFNI_6x1_64_loop: mulGFNI_6x1_64_end: RET +// func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulAvxGFNI_6x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y6 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Store 1 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + 
JNZ mulAvxGFNI_6x1_loop + VZEROUPPER + +mulAvxGFNI_6x1_end: + RET + // func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x1_64Xor(SB), $0-88 @@ -33005,126 +42083,96 @@ mulGFNI_6x1_64Xor_loop: mulGFNI_6x1_64Xor_end: RET -// func mulAvxTwo_6x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_6x1Xor(SB), NOSPLIT, $0-88 +// func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 16 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x1Xor_end - VMOVDQU (CX), Y0 - VMOVDQU 32(CX), Y1 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VMOVDQU 192(CX), Y6 - VMOVDQU 224(CX), Y7 - VMOVDQU 256(CX), Y8 - VMOVDQU 288(CX), Y9 - VMOVDQU 320(CX), Y10 - VMOVDQU 352(CX), Y11 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), R8 - MOVQ 120(CX), CX - MOVQ out_base+48(FP), R9 - MOVQ (R9), R9 - MOVQ start+72(FP), R10 + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 // Add start offset to output ADDQ R10, R9 // Add start offset to input - ADDQ R10, DX - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, R8 - ADDQ R10, CX - MOVQ $0x0000000f, R10 - MOVQ R10, X13 - VPBROADCASTB X13, Y13 + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulAvxGFNI_6x1Xor_loop: + // Load 1 outputs + VMOVDQU (R9), Y6 -mulAvxTwo_6x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y14 - ADDQ $0x20, DX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VMOVDQU (R9), Y12 - VPSHUFB Y14, Y0, Y14 - VPSHUFB Y15, Y1, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y7 + VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y14 - ADDQ $0x20, BX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y2, Y14 - VPSHUFB Y15, Y3, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 + VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y14 - ADDQ $0x20, SI - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y4, Y14 - VPSHUFB Y15, Y5, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 + VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y14 - ADDQ $0x20, DI - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y6, Y14 - VPSHUFB Y15, Y7, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + 
VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R8), Y14 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y8, Y14 - VPSHUFB Y15, Y9, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 + VXORPD Y6, Y7, Y6 // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (CX), Y14 - ADDQ $0x20, CX - VPSRLQ $0x04, Y14, Y15 - VPAND Y13, Y14, Y14 - VPAND Y13, Y15, Y15 - VPSHUFB Y14, Y10, Y14 - VPSHUFB Y15, Y11, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 + VXORPD Y6, Y7, Y6 // Store 1 outputs - VMOVDQU Y12, (R9) + VMOVDQU Y6, (R9) ADDQ $0x20, R9 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_6x1Xor_loop + JNZ mulAvxGFNI_6x1Xor_loop VZEROUPPER -mulAvxTwo_6x1Xor_end: +mulAvxGFNI_6x1Xor_end: RET // func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -33296,161 +42344,6 @@ mulAvxTwo_6x1_64Xor_loop: mulAvxTwo_6x1_64Xor_end: RET -// func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_6x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 31 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), DX - MOVQ out_base+48(FP), R10 - MOVQ (R10), R11 - MOVQ 24(R10), R10 - MOVQ start+72(FP), R12 - - // Add start offset to output - ADDQ R12, R11 - ADDQ R12, R10 - - // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_6x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 
bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - VMOVDQU Y1, (R10) - ADDQ $0x20, R10 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_6x2_loop - VZEROUPPER - -mulAvxTwo_6x2_end: - RET - // func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x2_64(SB), $0-88 @@ -33778,6 +42671,115 @@ mulGFNI_6x2_64_loop: mulGFNI_6x2_64_end: RET +// func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulAvxGFNI_6x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 
+ VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x2_loop + VZEROUPPER + +mulAvxGFNI_6x2_end: + RET + // func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x2_64Xor(SB), $0-88 @@ -33893,161 +42895,119 @@ mulGFNI_6x2_64Xor_loop: mulGFNI_6x2_64Xor_end: RET -// func mulAvxTwo_6x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_6x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x2Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 31 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), DX - MOVQ out_base+48(FP), R10 - MOVQ (R10), R11 - MOVQ 24(R10), R10 - MOVQ start+72(FP), R12 + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 // Add start offset to output - ADDQ R12, R11 - ADDQ R12, R10 + ADDQ R11, R10 + ADDQ R11, R9 // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X2 - VPBROADCASTB X2, Y2 + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulAvxGFNI_6x2Xor_loop: + // Load 2 outputs + VMOVDQU (R10), Y12 + VMOVDQU (R9), Y13 -mulAvxTwo_6x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R11), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R10), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, 
Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 2 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - VMOVDQU Y1, (R10) + VMOVDQU Y12, (R10) ADDQ $0x20, R10 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_6x2Xor_loop + JNZ mulAvxGFNI_6x2Xor_loop VZEROUPPER -mulAvxTwo_6x2Xor_end: +mulAvxGFNI_6x2Xor_end: RET // func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -34274,195 +43234,6 @@ mulAvxTwo_6x2_64Xor_loop: mulAvxTwo_6x2_64Xor_end: RET -// func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_6x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 44 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), DX - MOVQ out_base+48(FP), R10 - MOVQ (R10), R11 - MOVQ 24(R10), R12 - MOVQ 48(R10), R10 - MOVQ start+72(FP), R13 - - // Add start offset to output - ADDQ R13, R11 - ADDQ R13, R12 - ADDQ R13, R10 - - // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X3 - 
VPBROADCASTB X3, Y3 - -mulAvxTwo_6x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - VMOVDQU Y1, (R12) - ADDQ $0x20, R12 - VMOVDQU Y2, (R10) - ADDQ $0x20, R10 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_6x3_loop - VZEROUPPER - -mulAvxTwo_6x3_end: - RET - // func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x3_64(SB), $0-88 @@ -34864,6 +43635,136 @@ mulGFNI_6x3_64_loop: mulGFNI_6x3_64_end: RET +// func mulAvxGFNI_6x3(matrix []uint64, 
in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x3(SB), $0-88 + // Loading 11 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + +mulAvxGFNI_6x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x3_loop + VZEROUPPER + +mulAvxGFNI_6x3_end: + RET + // func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x3_64Xor(SB), $0-88 @@ -35002,29 +43903,41 @@ mulGFNI_6x3_64Xor_loop: mulGFNI_6x3_64Xor_end: RET -// func mulAvxTwo_6x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT 
·mulAvxTwo_6x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x3Xor(SB), $0-88 + // Loading 11 of 18 tables to registers // Destination kept in GP registers - // Full registers estimated 44 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_6x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), DX - MOVQ out_base+48(FP), R10 - MOVQ (R10), R11 - MOVQ 24(R10), R12 - MOVQ 48(R10), R10 - MOVQ start+72(FP), R13 + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R11 @@ -35032,166 +43945,100 @@ TEXT ·mulAvxTwo_6x3Xor(SB), NOSPLIT, $0-88 ADDQ R13, R10 // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X3 - VPBROADCASTB X3, Y3 + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + +mulAvxGFNI_6x3Xor_loop: + // Load 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R10), Y13 -mulAvxTwo_6x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R11), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R12), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R10), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 
- VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU Y0, (R11) + VMOVDQU Y11, (R11) ADDQ $0x20, R11 - VMOVDQU Y1, (R12) + VMOVDQU Y12, (R12) ADDQ $0x20, R12 - VMOVDQU Y2, (R10) + VMOVDQU Y13, (R10) ADDQ $0x20, R10 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_6x3Xor_loop + JNZ mulAvxGFNI_6x3Xor_loop VZEROUPPER -mulAvxTwo_6x3Xor_end: +mulAvxGFNI_6x3Xor_end: RET // func mulAvxTwo_6x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -35847,6 +44694,157 @@ mulGFNI_6x4_64_loop: mulGFNI_6x4_64_end: RET +// func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI 
+TEXT ·mulAvxGFNI_6x4(SB), $0-88 + // Loading 10 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + +mulAvxGFNI_6x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R10) + ADDQ $0x20, 
R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x4_loop + VZEROUPPER + +mulAvxGFNI_6x4_end: + RET + // func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x4_64Xor(SB), $0-88 @@ -36008,6 +45006,167 @@ mulGFNI_6x4_64Xor_loop: mulGFNI_6x4_64Xor_end: RET +// func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x4Xor(SB), $0-88 + // Loading 10 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + +mulAvxGFNI_6x4Xor_loop: + // Load 4 outputs + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x4Xor_loop + VZEROUPPER + +mulAvxGFNI_6x4Xor_end: + RET + // func mulAvxTwo_6x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x4Xor(SB), NOSPLIT, $0-88 @@ -36659,6 +45818,178 @@ mulGFNI_6x5_64_loop: mulGFNI_6x5_64_end: RET +// func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x5(SB), $0-88 + // Loading 9 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulAvxGFNI_6x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, 
R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x5_loop + VZEROUPPER + +mulAvxGFNI_6x5_end: + RET + // func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x5_64Xor(SB), $0-88 @@ -36838,6 +46169,190 @@ mulGFNI_6x5_64Xor_loop: mulGFNI_6x5_64Xor_end: RET +// func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x5Xor(SB), $0-88 + // Loading 9 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulAvxGFNI_6x5Xor_loop: + // Load 5 outputs + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 
+ VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x5Xor_loop + VZEROUPPER + +mulAvxGFNI_6x5Xor_end: + RET + // func mulAvxTwo_6x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x5Xor(SB), NOSPLIT, $0-88 @@ -37572,6 +47087,199 @@ mulGFNI_6x6_64_loop: mulGFNI_6x6_64_end: RET +// func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x6(SB), $8-88 + // Loading 8 of 
36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulAvxGFNI_6x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x6_loop + VZEROUPPER + +mulAvxGFNI_6x6_end: + RET + // func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x6_64Xor(SB), $8-88 @@ -37767,6 +47475,213 @@ mulGFNI_6x6_64Xor_loop: mulGFNI_6x6_64Xor_end: RET +// func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x6Xor(SB), $8-88 + // Loading 8 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulAvxGFNI_6x6Xor_loop: + // Load 6 outputs + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x6Xor_loop + VZEROUPPER + +mulAvxGFNI_6x6Xor_end: + RET + // func mulAvxTwo_6x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x6Xor(SB), NOSPLIT, $8-88 @@ -38590,6 +48505,224 @@ mulGFNI_6x7_64_loop: mulGFNI_6x7_64_end: RET +// func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x7(SB), $8-88 + // Loading 7 of 42 tables to registers + // 
Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_6x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load 
and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_6x7_loop + VZEROUPPER + +mulAvxGFNI_6x7_end: + RET + // func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x7_64Xor(SB), $8-88 @@ -38805,6 +48938,240 @@ mulGFNI_6x7_64Xor_loop: mulGFNI_6x7_64Xor_end: RET +// func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x7Xor(SB), $8-88 + // Loading 7 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_6x7Xor_loop: + // Load 7 outputs + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), 
Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + 
VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_6x7Xor_loop + VZEROUPPER + +mulAvxGFNI_6x7Xor_end: + RET + // func mulAvxTwo_6x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x7Xor(SB), NOSPLIT, $8-88 @@ -39673,6 +50040,224 @@ mulGFNI_6x8_64_loop: mulGFNI_6x8_64_end: RET +// func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x8(SB), $0-88 + // Loading 6 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x8_loop + VZEROUPPER + +mulAvxGFNI_6x8_end: + RET + // func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x8_64Xor(SB), $0-88 @@ 
-39891,6 +50476,250 @@ mulGFNI_6x8_64Xor_loop: mulGFNI_6x8_64Xor_end: RET +// func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x8Xor(SB), $0-88 + // Loading 6 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x8Xor_loop: + // Load 8 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x8Xor_loop + VZEROUPPER + +mulAvxGFNI_6x8Xor_end: + RET + // func mulAvxTwo_6x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x8Xor(SB), NOSPLIT, $0-88 @@ -40827,6 +51656,243 @@ mulGFNI_6x9_64_loop: mulGFNI_6x9_64_end: RET +// func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x9(SB), $0-88 + // Loading 5 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ 
mulAvxGFNI_6x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x9_loop + VZEROUPPER + +mulAvxGFNI_6x9_end: + RET + // func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x9_64Xor(SB), $0-88 @@ -41060,6 +52126,272 @@ mulGFNI_6x9_64Xor_loop: mulGFNI_6x9_64Xor_end: RET +// func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x9Xor(SB), $0-88 + // Loading 5 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ 
R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x9Xor_loop: + // Load 9 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y5 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 
256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x9Xor_loop + VZEROUPPER + +mulAvxGFNI_6x9Xor_end: + RET + // func mulAvxTwo_6x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x9Xor(SB), NOSPLIT, $0-88 @@ -42074,6 +53406,262 @@ mulGFNI_6x10_64_loop: mulGFNI_6x10_64_end: RET +// func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x10(SB), $0-88 + // Loading 4 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ 
out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x10_loop + VZEROUPPER + +mulAvxGFNI_6x10_end: + RET + // func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x10_64Xor(SB), $0-88 @@ -42322,6 +53910,294 @@ mulGFNI_6x10_64Xor_loop: mulGFNI_6x10_64Xor_end: RET +// func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x10Xor(SB), $0-88 + // Loading 4 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), 
BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x10Xor_loop: + // Load 10 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y4 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y5 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 216(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 
232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ 
AX + JNZ mulAvxGFNI_6x10Xor_loop + VZEROUPPER + +mulAvxGFNI_6x10Xor_end: + RET + // func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_6x10Xor(SB), NOSPLIT, $0-88 @@ -42748,141 +54624,6 @@ mulAvxTwo_6x10Xor_loop: mulAvxTwo_6x10Xor_end: RET -// func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x1(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 18 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_7x1_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R11 - MOVQ start+72(FP), R12 - - // Add start offset to output - ADDQ R12, R11 - - // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, R10 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_7x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y0 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_7x1_loop - VZEROUPPER - -mulAvxTwo_7x1_end: - RET - // func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x1_64(SB), $0-88 @@ -43166,6 +54907,103 @@ mulGFNI_7x1_64_loop: mulGFNI_7x1_64_end: RET +// func mulAvxGFNI_7x1(matrix 
[]uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulAvxGFNI_7x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y7 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Store 1 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x1_loop + VZEROUPPER + +mulAvxGFNI_7x1_end: + RET + // func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x1_64Xor(SB), $0-88 @@ -43267,140 +55105,105 @@ mulGFNI_7x1_64Xor_loop: mulGFNI_7x1_64Xor_end: RET -// func mulAvxTwo_7x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x1Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x1Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 18 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_7x1Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R11 - MOVQ start+72(FP), R12 + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), 
Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 // Add start offset to output - ADDQ R12, R11 + ADDQ R11, R10 // Add start offset to input - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, R10 - ADDQ R12, DX - MOVQ $0x0000000f, R12 - MOVQ R12, X1 - VPBROADCASTB X1, Y1 + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulAvxGFNI_7x1Xor_loop: + // Load 1 outputs + VMOVDQU (R10), Y7 -mulAvxTwo_7x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (R11), Y0 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y8 + VXORPD Y7, Y8, Y7 // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y8, Y8 + VXORPD Y7, Y8, Y7 // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y8, Y8 + VXORPD Y7, Y8, Y7 // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y8, Y8 + VXORPD Y7, Y8, Y7 // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y8, Y8 + VXORPD Y7, Y8, Y7 // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y8 + VXORPD Y7, Y8, Y7 // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y8, Y8 + VXORPD Y7, Y8, Y7 // Store 1 outputs - VMOVDQU Y0, (R11) - ADDQ $0x20, R11 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_7x1Xor_loop + JNZ mulAvxGFNI_7x1Xor_loop VZEROUPPER -mulAvxTwo_7x1Xor_end: +mulAvxGFNI_7x1Xor_end: RET // func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, 
start int, n int) @@ -43593,180 +55396,6 @@ mulAvxTwo_7x1_64Xor_loop: mulAvxTwo_7x1_64Xor_end: RET -// func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 35 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_7x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R12 - MOVQ 24(R11), R11 - MOVQ start+72(FP), R13 - - // Add start offset to output - ADDQ R13, R12 - ADDQ R13, R11 - - // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_7x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - 
VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 - VMOVDQU Y1, (R11) - ADDQ $0x20, R11 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_7x2_loop - VZEROUPPER - -mulAvxTwo_7x2_end: - RET - // func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x2_64(SB), $0-88 @@ -44135,6 +55764,127 @@ mulGFNI_7x2_64_loop: mulGFNI_7x2_64_end: RET +// func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x2(SB), $0-88 + // Loading 12 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + +mulAvxGFNI_7x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x2_loop + VZEROUPPER + +mulAvxGFNI_7x2_end: + RET + // func 
mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x2_64Xor(SB), $0-88 @@ -44262,180 +56012,131 @@ mulGFNI_7x2_64Xor_loop: mulGFNI_7x2_64Xor_end: RET -// func mulAvxTwo_7x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x2Xor(SB), $0-88 + // Loading 12 of 14 tables to registers // Destination kept in GP registers - // Full registers estimated 35 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_7x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R12 - MOVQ 24(R11), R11 - MOVQ start+72(FP), R13 + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 // Add start offset to output ADDQ R13, R12 ADDQ R13, R11 // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X2 - VPBROADCASTB X2, Y2 + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + +mulAvxGFNI_7x2Xor_loop: + // Load 2 outputs + VMOVDQU (R12), Y12 + VMOVDQU (R11), Y13 -mulAvxTwo_7x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R12), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R11), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 
256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 2 outputs - VMOVDQU Y0, (R12) + VMOVDQU Y12, (R12) ADDQ $0x20, R12 - VMOVDQU Y1, (R11) + VMOVDQU Y13, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_7x2Xor_loop + JNZ mulAvxGFNI_7x2Xor_loop VZEROUPPER -mulAvxTwo_7x2Xor_end: +mulAvxGFNI_7x2Xor_end: RET // func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -44691,219 +56392,6 @@ mulAvxTwo_7x2_64Xor_loop: mulAvxTwo_7x2_64Xor_end: RET -// func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 50 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_7x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - 
MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R12 - MOVQ 24(R11), R13 - MOVQ 48(R11), R11 - MOVQ start+72(FP), R14 - - // Add start offset to output - ADDQ R14, R12 - ADDQ R14, R13 - ADDQ R14, R11 - - // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_7x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX 
- VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 - VMOVDQU Y1, (R13) - ADDQ $0x20, R13 - VMOVDQU Y2, (R11) - ADDQ $0x20, R11 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_7x3_loop - VZEROUPPER - -mulAvxTwo_7x3_end: - RET - // func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x3_64(SB), $0-88 @@ -45357,6 +56845,151 @@ mulGFNI_7x3_64_loop: mulGFNI_7x3_64_end: RET +// func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x3(SB), $0-88 + // Loading 11 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + +mulAvxGFNI_7x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from 
input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x3_loop + VZEROUPPER + +mulAvxGFNI_7x3_end: + RET + // func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x3_64Xor(SB), $0-88 @@ -45510,30 +57143,42 @@ mulGFNI_7x3_64Xor_loop: mulGFNI_7x3_64Xor_end: RET -// func mulAvxTwo_7x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_7x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x3Xor(SB), $0-88 + // Loading 11 of 21 tables to registers // Destination kept in GP registers - // Full registers estimated 50 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_7x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), DX - MOVQ out_base+48(FP), R11 - MOVQ (R11), R12 - MOVQ 24(R11), R13 - MOVQ 48(R11), R11 - MOVQ start+72(FP), R14 + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R12 @@ -45541,189 +57186,114 @@ TEXT ·mulAvxTwo_7x3Xor(SB), NOSPLIT, $0-88 ADDQ R14, R11 // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X3 - VPBROADCASTB X3, Y3 + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + +mulAvxGFNI_7x3Xor_loop: + // Load 3 outputs + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R11), Y13 -mulAvxTwo_7x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R12), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, 
Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R13), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R11), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 
bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU Y0, (R12) + VMOVDQU Y11, (R12) ADDQ $0x20, R12 - VMOVDQU Y1, (R13) + VMOVDQU Y12, (R13) ADDQ $0x20, R13 - VMOVDQU Y2, (R11) + VMOVDQU Y13, (R11) ADDQ $0x20, R11 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_7x3Xor_loop + JNZ mulAvxGFNI_7x3Xor_loop VZEROUPPER -mulAvxTwo_7x3Xor_end: +mulAvxGFNI_7x3Xor_end: RET // func mulAvxTwo_7x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -46461,6 +58031,175 @@ mulGFNI_7x4_64_loop: mulGFNI_7x4_64_end: RET +// func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x4(SB), $0-88 + // Loading 10 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulAvxGFNI_7x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load 
and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x4_loop + VZEROUPPER + +mulAvxGFNI_7x4_end: + RET + // func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x4_64Xor(SB), $0-88 @@ -46638,6 +58377,185 @@ mulGFNI_7x4_64Xor_loop: mulGFNI_7x4_64Xor_end: RET +// func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x4Xor(SB), $0-88 + // Loading 10 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 
32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulAvxGFNI_7x4Xor_loop: + // Load 4 outputs + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x4Xor_loop + VZEROUPPER + +mulAvxGFNI_7x4Xor_end: + RET + // func mulAvxTwo_7x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x4Xor(SB), NOSPLIT, $0-88 @@ -47368,6 +59286,199 @@ mulGFNI_7x5_64_loop: mulGFNI_7x5_64_end: RET +// func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x5(SB), $8-88 + // Loading 9 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulAvxGFNI_7x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x5_loop + VZEROUPPER + +mulAvxGFNI_7x5_end: + RET + // func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x5_64Xor(SB), $8-88 @@ -47563,6 +59674,211 @@ mulGFNI_7x5_64Xor_loop: mulGFNI_7x5_64Xor_end: RET +// func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x5Xor(SB), $8-88 + // Loading 9 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulAvxGFNI_7x5Xor_loop: + // Load 5 outputs + VMOVDQU (R12), Y9 + VMOVDQU 
(R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x5Xor_loop + VZEROUPPER + +mulAvxGFNI_7x5Xor_end: + RET + // func mulAvxTwo_7x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x5Xor(SB), NOSPLIT, $8-88 @@ -48394,6 +60710,227 @@ mulGFNI_7x6_64_loop: mulGFNI_7x6_64_end: RET +// func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x6(SB), $8-88 + // Loading 8 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_7x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_7x6_loop + VZEROUPPER + +mulAvxGFNI_7x6_end: + RET + // func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x6_64Xor(SB), $8-88 @@ -48611,6 +61148,241 @@ mulGFNI_7x6_64Xor_loop: mulGFNI_7x6_64Xor_end: RET +// func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x6Xor(SB), $8-88 + // Loading 8 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ 
in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_7x6Xor_loop: + // Load 6 outputs + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, 
Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_7x6Xor_loop + VZEROUPPER + +mulAvxGFNI_7x6Xor_end: + RET + // func mulAvxTwo_7x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x6Xor(SB), NOSPLIT, $8-88 @@ -49503,6 +62275,232 @@ mulGFNI_7x7_64_loop: mulGFNI_7x7_64_end: RET +// func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x7(SB), $0-88 + // Loading 7 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 
344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x7_loop + VZEROUPPER + +mulAvxGFNI_7x7_end: + RET + // func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x7_64Xor(SB), $0-88 @@ -49726,6 +62724,255 @@ mulGFNI_7x7_64Xor_loop: mulGFNI_7x7_64Xor_end: RET +// func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x7Xor(SB), $0-88 + // Loading 7 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x7Xor_loop: + // Load 7 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x7Xor_loop + VZEROUPPER + +mulAvxGFNI_7x7Xor_end: + RET + // func mulAvxTwo_7x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x7Xor(SB), NOSPLIT, $0-88 @@ -50699,6 +63946,254 @@ mulGFNI_7x8_64_loop: mulGFNI_7x8_64_end: RET +// func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x8(SB), $0-88 + // Loading 6 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + 
VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, 
Y13 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x8_loop + VZEROUPPER + +mulAvxGFNI_7x8_end: + RET + // func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x8_64Xor(SB), $0-88 @@ -50939,6 +64434,280 @@ mulGFNI_7x8_64Xor_loop: mulGFNI_7x8_64Xor_end: RET +// func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x8Xor(SB), $0-88 + // Loading 6 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x8Xor_loop: + // Load 8 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes 
from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x8Xor_loop + VZEROUPPER + +mulAvxGFNI_7x8Xor_end: + RET + // func mulAvxTwo_7x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x8Xor(SB), NOSPLIT, $0-88 @@ -52002,6 +65771,276 @@ mulGFNI_7x9_64_loop: mulGFNI_7x9_64_end: RET +// func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x9(SB), $0-88 + // Loading 5 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 
outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 
outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x9_loop + VZEROUPPER + +mulAvxGFNI_7x9_end: + RET + // func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x9_64Xor(SB), $0-88 @@ -52259,6 +66298,305 @@ mulGFNI_7x9_64Xor_loop: mulGFNI_7x9_64Xor_end: RET +// func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x9Xor(SB), $0-88 + // Loading 5 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x9Xor_loop: + // Load 9 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y5 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, 
Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x9Xor_loop + VZEROUPPER + +mulAvxGFNI_7x9Xor_end: + RET + // func mulAvxTwo_7x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x9Xor(SB), NOSPLIT, $0-88 @@ -53412,6 +67750,298 @@ mulGFNI_7x10_64_loop: mulGFNI_7x10_64_end: RET +// func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x10(SB), $0-88 + // Loading 4 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI 
+ ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, 
Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + 
ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x10_loop + VZEROUPPER + +mulAvxGFNI_7x10_end: + RET + // func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x10_64Xor(SB), $0-88 @@ -53686,6 +68316,330 @@ mulGFNI_7x10_64Xor_loop: mulGFNI_7x10_64Xor_end: RET +// func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x10Xor(SB), $0-88 + // Loading 4 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x10Xor_loop: + // Load 10 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y4 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 216(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load 
and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 
448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x10Xor_loop + VZEROUPPER + +mulAvxGFNI_7x10Xor_end: + RET + // func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_7x10Xor(SB), NOSPLIT, $0-88 @@ -54171,155 +69125,6 @@ mulAvxTwo_7x10Xor_loop: mulAvxTwo_7x10Xor_end: RET -// func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x1(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 20 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x1_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R12 - MOVQ start+72(FP), R13 - - // Add start offset to output - ADDQ R13, R12 - - // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_8x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y0 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 
64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_8x1_loop - VZEROUPPER - -mulAvxTwo_8x1_end: - RET - // func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x1_64(SB), $0-88 @@ -54633,6 +69438,112 @@ mulGFNI_8x1_64_loop: mulGFNI_8x1_64_end: RET +// func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulAvxGFNI_8x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y8 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 2 to 1 outputs + 
VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Store 1 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x1_loop + VZEROUPPER + +mulAvxGFNI_8x1_end: + RET + // func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x1_64Xor(SB), $0-88 @@ -54743,154 +69654,114 @@ mulGFNI_8x1_64Xor_loop: mulGFNI_8x1_64Xor_end: RET -// func mulAvxTwo_8x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x1Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x1Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 20 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x1Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R12 - MOVQ start+72(FP), R13 + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 // Add start offset to output - ADDQ R13, R12 + ADDQ R12, R11 // Add start offset to input - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, DX - MOVQ $0x0000000f, R13 - MOVQ R13, X1 - VPBROADCASTB X1, Y1 + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulAvxGFNI_8x1Xor_loop: + // Load 1 outputs + VMOVDQU (R11), Y8 -mulAvxTwo_8x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (R12), Y0 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y9 + 
VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 + VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 + VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 + VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 + VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 + VXORPD Y8, Y9, Y8 // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 + VXORPD Y8, Y9, Y8 // Store 1 outputs - VMOVDQU Y0, (R12) - ADDQ $0x20, R12 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_8x1Xor_loop + JNZ mulAvxGFNI_8x1Xor_loop VZEROUPPER -mulAvxTwo_8x1Xor_end: +mulAvxGFNI_8x1Xor_end: RET // func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -55104,199 +69975,6 @@ mulAvxTwo_8x1_64Xor_loop: mulAvxTwo_8x1_64Xor_end: RET -// func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 39 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ 
(R12), R13 - MOVQ 24(R12), R12 - MOVQ start+72(FP), R14 - - // Add start offset to output - ADDQ R14, R13 - ADDQ R14, R12 - - // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_8x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 - VMOVDQU Y1, (R12) - ADDQ $0x20, R12 - - // Prepare 
for next loop - DECQ AX - JNZ mulAvxTwo_8x2_loop - VZEROUPPER - -mulAvxTwo_8x2_end: - RET - // func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x2_64(SB), $0-88 @@ -55706,6 +70384,139 @@ mulGFNI_8x2_64_loop: mulGFNI_8x2_64_end: RET +// func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x2(SB), $0-88 + // Loading 12 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + +mulAvxGFNI_8x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x2_loop + VZEROUPPER + +mulAvxGFNI_8x2_end: + 
RET + // func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x2_64Xor(SB), $0-88 @@ -55845,199 +70656,143 @@ mulGFNI_8x2_64Xor_loop: mulGFNI_8x2_64Xor_end: RET -// func mulAvxTwo_8x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x2Xor(SB), $0-88 + // Loading 12 of 16 tables to registers // Destination kept in GP registers - // Full registers estimated 39 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R12 - MOVQ start+72(FP), R14 + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 // Add start offset to output ADDQ R14, R13 ADDQ R14, R12 // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X2 - VPBROADCASTB X2, Y2 + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + +mulAvxGFNI_8x2Xor_loop: + // Load 2 outputs + VMOVDQU (R13), Y12 + VMOVDQU (R12), Y13 -mulAvxTwo_8x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R13), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R12), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - 
ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 2 outputs - VMOVDQU Y0, (R13) + VMOVDQU Y12, (R13) ADDQ $0x20, R13 - VMOVDQU Y1, (R12) + VMOVDQU Y13, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_8x2Xor_loop + JNZ mulAvxGFNI_8x2Xor_loop VZEROUPPER -mulAvxTwo_8x2Xor_end: 
+mulAvxGFNI_8x2Xor_end: RET // func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -56322,243 +71077,6 @@ mulAvxTwo_8x2_64Xor_loop: mulAvxTwo_8x2_64Xor_end: RET -// func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x3(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 56 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R14 - MOVQ 48(R12), R12 - MOVQ start+72(FP), R15 - - // Add start offset to output - ADDQ R15, R13 - ADDQ R15, R14 - ADDQ R15, R12 - - // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_8x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, 
Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 - VMOVDQU Y1, (R14) - ADDQ $0x20, R14 - VMOVDQU Y2, (R12) - ADDQ $0x20, R12 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_8x3_loop - VZEROUPPER - -mulAvxTwo_8x3_end: - RET - // func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x3_64(SB), $0-88 @@ -57064,6 +71582,166 @@ mulGFNI_8x3_64_loop: mulGFNI_8x3_64_end: RET +// func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x3(SB), $0-88 + // Loading 11 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + +mulAvxGFNI_8x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, 
Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x3_loop + VZEROUPPER + +mulAvxGFNI_8x3_end: + RET + // func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x3_64Xor(SB), $0-88 @@ -57232,31 +71910,43 @@ mulGFNI_8x3_64Xor_loop: mulGFNI_8x3_64Xor_end: RET -// func mulAvxTwo_8x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_8x3Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x3Xor(SB), $0-88 + // Loading 11 of 24 tables to registers // Destination kept in GP registers - // Full registers estimated 56 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_8x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 
144(DX), R11 - MOVQ 168(DX), DX - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R14 - MOVQ 48(R12), R12 - MOVQ start+72(FP), R15 + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R13 @@ -57264,212 +71954,128 @@ TEXT ·mulAvxTwo_8x3Xor(SB), NOSPLIT, $0-88 ADDQ R15, R12 // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X3 - VPBROADCASTB X3, Y3 + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + +mulAvxGFNI_8x3Xor_loop: + // Load 3 outputs + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R12), Y13 -mulAvxTwo_8x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R13), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R14), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R12), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, 
Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU Y0, (R13) + VMOVDQU Y11, (R13) ADDQ $0x20, R13 - VMOVDQU Y1, (R14) + VMOVDQU Y12, (R14) ADDQ $0x20, R14 - VMOVDQU Y2, (R12) + VMOVDQU Y13, (R12) ADDQ $0x20, R12 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_8x3Xor_loop + JNZ mulAvxGFNI_8x3Xor_loop VZEROUPPER -mulAvxTwo_8x3Xor_end: +mulAvxGFNI_8x3Xor_end: RET // func mulAvxTwo_8x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -58287,6 +72893,193 @@ mulGFNI_8x4_64_loop: mulGFNI_8x4_64_end: RET +// func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x4(SB), $8-88 + // Loading 10 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulAvxGFNI_8x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 
112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x4_loop + VZEROUPPER + +mulAvxGFNI_8x4_end: + RET + // func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x4_64Xor(SB), $8-88 @@ -58478,6 +73271,203 @@ mulGFNI_8x4_64Xor_loop: mulGFNI_8x4_64Xor_end: RET +// func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x4Xor(SB), $8-88 + // Loading 10 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + 
ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulAvxGFNI_8x4Xor_loop: + // Load 4 outputs + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x4Xor_loop + VZEROUPPER + +mulAvxGFNI_8x4Xor_end: + RET + // func mulAvxTwo_8x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x4Xor(SB), NOSPLIT, $8-88 @@ -59293,6 +74283,224 @@ mulGFNI_8x5_64_loop: mulGFNI_8x5_64_end: RET +// func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x5(SB), $8-88 + // Loading 9 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_8x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_8x5_loop + VZEROUPPER + +mulAvxGFNI_8x5_end: + RET + // func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x5_64Xor(SB), $8-88 @@ -59508,6 +74716,236 @@ mulGFNI_8x5_64Xor_loop: mulGFNI_8x5_64Xor_end: RET +// func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x5Xor(SB), $8-88 + // Loading 9 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 
72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_8x5Xor_loop: + // Load 5 outputs + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 
224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_8x5Xor_loop + VZEROUPPER + +mulAvxGFNI_8x5Xor_end: + RET + // func mulAvxTwo_8x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x5Xor(SB), NOSPLIT, $8-88 @@ -60400,6 +75838,234 @@ mulGFNI_8x6_64_loop: mulGFNI_8x6_64_end: RET +// func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x6(SB), $0-88 + // Loading 8 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + 
VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x6_loop + VZEROUPPER + +mulAvxGFNI_8x6_end: + RET + // func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x6_64Xor(SB), $0-88 @@ -60624,6 +76290,254 @@ mulGFNI_8x6_64Xor_loop: mulGFNI_8x6_64Xor_end: RET +// func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x6Xor(SB), $0-88 + // Loading 8 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x6Xor_loop: + // Load 6 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + 
VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x6Xor_loop + VZEROUPPER + +mulAvxGFNI_8x6Xor_end: + RET + // func mulAvxTwo_8x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x6Xor(SB), NOSPLIT, $0-88 @@ -61610,6 +77524,259 @@ mulGFNI_8x7_64_loop: mulGFNI_8x7_64_end: RET +// func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x7(SB), $0-88 + // Loading 7 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + 
VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 24(R12), 
R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x7_loop + VZEROUPPER + +mulAvxGFNI_8x7_end: + RET + // func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x7_64Xor(SB), $0-88 @@ -61853,6 +78020,282 @@ mulGFNI_8x7_64Xor_loop: mulGFNI_8x7_64Xor_end: RET +// func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x7Xor(SB), $0-88 + // Loading 7 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x7Xor_loop: + // Load 7 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 
+ VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x7Xor_loop + VZEROUPPER + +mulAvxGFNI_8x7Xor_end: + RET + // func mulAvxTwo_8x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x7Xor(SB), NOSPLIT, $0-88 @@ -62941,6 +79384,284 @@ mulGFNI_8x8_64_loop: mulGFNI_8x8_64_end: RET +// func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x8(SB), $0-88 + // Loading 6 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x8_loop + VZEROUPPER + +mulAvxGFNI_8x8_end: + RET + // func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x8_64Xor(SB), $0-88 @@ -63203,6 +79924,310 @@ mulGFNI_8x8_64Xor_loop: mulGFNI_8x8_64Xor_end: RET +// func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x8Xor(SB), $0-88 + // Loading 6 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x8Xor_loop: + // Load 8 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x8Xor_loop + VZEROUPPER + +mulAvxGFNI_8x8Xor_end: + RET + // func mulAvxTwo_8x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x8Xor(SB), NOSPLIT, $0-88 @@ -64393,6 +81418,309 @@ mulGFNI_8x9_64_loop: mulGFNI_8x9_64_end: RET +// func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x9(SB), $0-88 + // Loading 5 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ 
(DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x9_loop + VZEROUPPER + +mulAvxGFNI_8x9_end: + RET + // func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x9_64Xor(SB), $0-88 @@ -64674,6 +82002,338 @@ mulGFNI_8x9_64Xor_loop: mulGFNI_8x9_64Xor_end: RET +// func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x9Xor(SB), $0-88 + // Loading 5 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x9Xor_loop: + // Load 9 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y5 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x9Xor_loop + VZEROUPPER + +mulAvxGFNI_8x9Xor_end: + RET + // func mulAvxTwo_8x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x9Xor(SB), NOSPLIT, $0-88 @@ -65966,6 +83626,334 @@ mulGFNI_8x10_64_loop: mulGFNI_8x10_64_end: RET +// func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x10(SB), $0-88 + // Loading 4 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 
168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + 
VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x10_loop + VZEROUPPER + +mulAvxGFNI_8x10_end: + RET + // func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x10_64Xor(SB), $0-88 @@ -66266,6 +84254,366 @@ mulGFNI_8x10_64Xor_loop: mulGFNI_8x10_64Xor_end: RET +// func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x10Xor(SB), $0-88 + // Loading 4 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x10Xor_loop: + // Load 10 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y4 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 216(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 
24(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x10Xor_loop + VZEROUPPER + +mulAvxGFNI_8x10Xor_end: + RET + // func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_8x10Xor(SB), NOSPLIT, $0-88 @@ -66810,169 +85158,6 @@ mulAvxTwo_8x10Xor_loop: mulAvxTwo_8x10Xor_end: RET -// func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x1(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 22 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x1_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R13 - MOVQ start+72(FP), R14 - - // Add start offset to output - ADDQ R14, R13 - - // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_9x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y0 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - 
VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (R12), Y4 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 8 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 512(CX), Y2 - VMOVDQU 544(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_9x1_loop - VZEROUPPER - -mulAvxTwo_9x1_end: - RET - // func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x1_64(SB), $0-88 @@ -67316,6 +85501,121 @@ mulGFNI_9x1_64_loop: mulGFNI_9x1_64_end: RET +// func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulAvxGFNI_9x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y9 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 + VXORPD Y9, 
Y10, Y9 + + // Store 1 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x1_loop + VZEROUPPER + +mulAvxGFNI_9x1_end: + RET + // func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x1_64Xor(SB), $0-88 @@ -67435,168 +85735,123 @@ mulGFNI_9x1_64Xor_loop: mulGFNI_9x1_64Xor_end: RET -// func mulAvxTwo_9x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x1Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x1Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 22 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x1Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R13 - MOVQ start+72(FP), R14 + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 // Add start offset to output - ADDQ R14, R13 + ADDQ R13, R12 // Add start offset to input - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, DX - MOVQ $0x0000000f, R14 - MOVQ R14, X1 - VPBROADCASTB X1, Y1 + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulAvxGFNI_9x1Xor_loop: + // Load 1 outputs + VMOVDQU (R12), Y9 -mulAvxTwo_9x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (R13), Y0 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y10 + VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 + VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y10, Y10 + VXORPD 
Y9, Y10, Y9 // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 + VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 + VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 + VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 + VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (R12), Y4 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 + VXORPD Y9, Y10, Y9 // Load and process 32 bytes from input 8 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 512(CX), Y2 - VMOVDQU 544(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 + VXORPD Y9, Y10, Y9 // Store 1 outputs - VMOVDQU Y0, (R13) - ADDQ $0x20, R13 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_9x1Xor_loop + JNZ mulAvxGFNI_9x1Xor_loop VZEROUPPER -mulAvxTwo_9x1Xor_end: +mulAvxGFNI_9x1Xor_end: RET // func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -67831,218 +86086,6 @@ mulAvxTwo_9x1_64Xor_loop: mulAvxTwo_9x1_64Xor_end: RET -// func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x2(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 43 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R13 - MOVQ start+72(FP), R15 - - // Add start offset to output - ADDQ R15, R14 - ADDQ R15, R13 - - // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, DX - MOVQ $0x0000000f, R15 
- MOVQ R15, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_9x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (R12), Y5 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 8 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1024(CX), Y3 - VMOVDQU 1056(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1088(CX), Y3 - VMOVDQU 1120(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs 
- VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - VMOVDQU Y1, (R13) - ADDQ $0x20, R13 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_9x2_loop - VZEROUPPER - -mulAvxTwo_9x2_end: - RET - // func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x2_64(SB), $0-88 @@ -68493,6 +86536,151 @@ mulGFNI_9x2_64_loop: mulGFNI_9x2_64_end: RET +// func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x2(SB), $0-88 + // Loading 12 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + +mulAvxGFNI_9x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (DX), 
Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x2_loop + VZEROUPPER + +mulAvxGFNI_9x2_end: + RET + // func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x2_64Xor(SB), $0-88 @@ -68644,218 +86832,155 @@ mulGFNI_9x2_64Xor_loop: mulGFNI_9x2_64Xor_end: RET -// func mulAvxTwo_9x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x2Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x2Xor(SB), $0-88 + // Loading 12 of 18 tables to registers // Destination kept in GP registers - // Full registers estimated 43 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R13 - MOVQ start+72(FP), R15 + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 // Add start offset to output ADDQ R15, R14 ADDQ R15, R13 // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X2 - VPBROADCASTB X2, Y2 + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + +mulAvxGFNI_9x2Xor_loop: + // Load 2 outputs + VMOVDQU (R14), Y12 + VMOVDQU (R13), Y13 -mulAvxTwo_9x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R14), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R13), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND 
Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (R12), Y5 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, 
Y1) + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1024(CX), Y3 - VMOVDQU 1056(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1088(CX), Y3 - VMOVDQU 1120(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 2 outputs - VMOVDQU Y0, (R14) + VMOVDQU Y12, (R14) ADDQ $0x20, R14 - VMOVDQU Y1, (R13) + VMOVDQU Y13, (R13) ADDQ $0x20, R13 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_9x2Xor_loop + JNZ mulAvxGFNI_9x2Xor_loop VZEROUPPER -mulAvxTwo_9x2Xor_end: +mulAvxGFNI_9x2Xor_end: RET // func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -69169,267 +87294,6 @@ mulAvxTwo_9x2_64Xor_loop: mulAvxTwo_9x2_64Xor_end: RET -// func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x3(SB), NOSPLIT, $8-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 62 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x3_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R15 - MOVQ 48(R13), R13 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R14 - ADDQ BP, R15 - ADDQ BP, R13 - - // Add start offset to input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, DX - MOVQ $0x0000000f, BP - MOVQ BP, X3 - VPBROADCASTB X3, Y3 - -mulAvxTwo_9x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), 
Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (R12), Y6 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 8 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1536(CX), Y4 - VMOVDQU 1568(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1600(CX), Y4 - VMOVDQU 1632(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1664(CX), Y4 - VMOVDQU 1696(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - VMOVDQU Y1, (R15) - ADDQ $0x20, R15 - VMOVDQU Y2, (R13) - ADDQ $0x20, R13 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_9x3_loop - VZEROUPPER - -mulAvxTwo_9x3_end: - RET - // func mulAvxTwo_9x3_64(matrix []byte, in 
[][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x3_64(SB), $8-88 @@ -69987,6 +87851,181 @@ mulGFNI_9x3_64_loop: mulGFNI_9x3_64_end: RET +// func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x3(SB), $8-88 + // Loading 11 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + +mulAvxGFNI_9x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x3_loop + VZEROUPPER + +mulAvxGFNI_9x3_end: + RET + // func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x3_64Xor(SB), $0-88 @@ -70170,32 +88209,44 @@ mulGFNI_9x3_64Xor_loop: mulGFNI_9x3_64Xor_end: RET -// func mulAvxTwo_9x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_9x3Xor(SB), NOSPLIT, $8-88 - // Loading no tables to registers +// func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x3Xor(SB), $8-88 + // Loading 11 of 27 tables to registers // Destination kept in GP registers - // Full registers estimated 62 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_9x3Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), DX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R15 - MOVQ 48(R13), R13 - MOVQ start+72(FP), BP + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 @@ -70203,235 +88254,142 @@ TEXT ·mulAvxTwo_9x3Xor(SB), NOSPLIT, $8-88 ADDQ BP, R13 // Add start offset to input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, DX - MOVQ $0x0000000f, BP - MOVQ BP, X3 - VPBROADCASTB X3, Y3 + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + +mulAvxGFNI_9x3Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R13), Y13 -mulAvxTwo_9x3Xor_loop: // Load and process 32 
bytes from input 0 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R14), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R15), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R13), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (R12), Y6 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1536(CX), Y4 - VMOVDQU 1568(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1600(CX), Y4 - VMOVDQU 1632(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1664(CX), Y4 - VMOVDQU 1696(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU Y0, (R14) + VMOVDQU Y11, (R14) ADDQ $0x20, R14 - VMOVDQU Y1, (R15) + VMOVDQU Y12, (R15) ADDQ $0x20, 
R15 - VMOVDQU Y2, (R13) + VMOVDQU Y13, (R13) ADDQ $0x20, R13 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_9x3Xor_loop + JNZ mulAvxGFNI_9x3Xor_loop VZEROUPPER -mulAvxTwo_9x3Xor_end: +mulAvxGFNI_9x3Xor_end: RET // func mulAvxTwo_9x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -71335,6 +89293,215 @@ mulGFNI_9x4_64_loop: mulGFNI_9x4_64_end: RET +// func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x4(SB), $8-88 + // Loading 10 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_9x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_9x4_loop + VZEROUPPER + +mulAvxGFNI_9x4_end: + RET + // func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x4_64Xor(SB), $8-88 @@ -71544,6 +89711,225 @@ mulGFNI_9x4_64Xor_loop: mulGFNI_9x4_64Xor_end: RET +// func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x4Xor(SB), $8-88 + // Loading 10 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ 
BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_9x4Xor_loop: + // Load 4 outputs + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_9x4Xor_loop + VZEROUPPER + +mulAvxGFNI_9x4Xor_end: + RET + // func mulAvxTwo_9x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x4Xor(SB), NOSPLIT, $8-88 @@ -72412,6 +90798,230 @@ mulGFNI_9x5_64_loop: mulGFNI_9x5_64_end: RET +// func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x5(SB), $0-88 + // Loading 9 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x5_loop + VZEROUPPER + +mulAvxGFNI_9x5_end: + RET + // func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT 
·mulGFNI_9x5_64Xor(SB), $0-88 @@ -72633,6 +91243,247 @@ mulGFNI_9x5_64Xor_loop: mulGFNI_9x5_64Xor_end: RET +// func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x5Xor(SB), $0-88 + // Loading 9 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x5Xor_loop: + // Load 5 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x5Xor_loop + VZEROUPPER + +mulAvxGFNI_9x5Xor_end: + RET + // func mulAvxTwo_9x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x5Xor(SB), NOSPLIT, $0-88 @@ -73608,6 +92459,258 @@ mulGFNI_9x6_64_loop: mulGFNI_9x6_64_end: RET +// func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x6(SB), $0-88 + // Loading 8 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 
16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + 
VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x6_loop + VZEROUPPER + +mulAvxGFNI_9x6_end: + RET + // func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x6_64Xor(SB), $0-88 @@ -73850,6 +92953,278 @@ mulGFNI_9x6_64Xor_loop: mulGFNI_9x6_64Xor_end: RET +// func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x6Xor(SB), $0-88 + // Loading 8 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + 
VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x6Xor_loop: + // Load 6 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x6Xor_loop + VZEROUPPER + +mulAvxGFNI_9x6Xor_end: + RET + // func mulAvxTwo_9x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x6Xor(SB), NOSPLIT, $0-88 @@ -74939,6 +94314,286 @@ mulGFNI_9x7_64_loop: mulGFNI_9x7_64_end: RET +// func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x7(SB), $0-88 + // Loading 7 of 63 tables to registers + // Destination kept on 
stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 
+ VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y11, 
(R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x7_loop + VZEROUPPER + +mulAvxGFNI_9x7_end: + RET + // func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x7_64Xor(SB), $0-88 @@ -75202,6 +94857,309 @@ mulGFNI_9x7_64Xor_loop: mulGFNI_9x7_64Xor_end: RET +// func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x7Xor(SB), $0-88 + // Loading 7 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x7Xor_loop: + // Load 7 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x7Xor_loop + VZEROUPPER + +mulAvxGFNI_9x7Xor_end: + RET + // func mulAvxTwo_9x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x7Xor(SB), NOSPLIT, $0-88 @@ -76405,6 +96363,314 @@ mulGFNI_9x8_64_loop: mulGFNI_9x8_64_end: RET +// func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x8(SB), $0-88 + // Loading 6 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x8_loop + VZEROUPPER + +mulAvxGFNI_9x8_end: + RET + // func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x8_64Xor(SB), $0-88 @@ -76689,6 +96955,340 @@ mulGFNI_9x8_64Xor_loop: mulGFNI_9x8_64Xor_end: RET +// func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x8Xor(SB), $0-88 + // Loading 6 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ 
mulAvxGFNI_9x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x8Xor_loop: + // Load 8 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 
208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x8Xor_loop + VZEROUPPER + +mulAvxGFNI_9x8Xor_end: + RET + // func mulAvxTwo_9x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x8Xor(SB), NOSPLIT, $0-88 @@ -78006,6 +98606,342 @@ mulGFNI_9x9_64_loop: mulGFNI_9x9_64_end: RET +// func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x9(SB), $0-88 + // Loading 5 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y10, 
(R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x9_loop + VZEROUPPER + +mulAvxGFNI_9x9_end: + RET + // func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x9_64Xor(SB), $0-88 @@ -78311,6 +99247,371 @@ mulGFNI_9x9_64Xor_loop: mulGFNI_9x9_64Xor_end: RET +// func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x9Xor(SB), $0-88 + // Loading 5 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x9Xor_loop: + // Load 9 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y5 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, 
Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x9Xor_loop + VZEROUPPER + +mulAvxGFNI_9x9Xor_end: + RET + // func mulAvxTwo_9x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x9Xor(SB), NOSPLIT, $0-88 @@ -79742,6 +101043,370 @@ mulGFNI_9x10_64_loop: mulGFNI_9x10_64_end: RET +// func mulAvxGFNI_9x10(matrix []uint64, in 
[][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x10(SB), $0-88 + // Loading 4 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ 
$0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x10_loop + VZEROUPPER + +mulAvxGFNI_9x10_end: + RET + // func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x10_64Xor(SB), $0-88 @@ -80068,6 +101733,402 @@ mulGFNI_9x10_64Xor_loop: mulGFNI_9x10_64Xor_end: RET +// func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x10Xor(SB), $0-88 + // Loading 4 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + 
VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x10Xor_loop: + // Load 10 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y4 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 216(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x10Xor_loop + VZEROUPPER + +mulAvxGFNI_9x10Xor_end: + RET + // func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_9x10Xor(SB), NOSPLIT, $0-88 @@ -80671,183 +102732,6 @@ mulAvxTwo_9x10Xor_loop: mulAvxTwo_9x10Xor_end: RET -// func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, 
AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x1(SB), NOSPLIT, $0-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 24 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x1_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), R13 - MOVQ 216(DX), DX - MOVQ out_base+48(FP), R14 - MOVQ (R14), R14 - MOVQ start+72(FP), R15 - - // Add start offset to output - ADDQ R15, R14 - - // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, R13 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X1 - VPBROADCASTB X1, Y1 - -mulAvxTwo_10x1_loop: - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y0 - - // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (R12), Y4 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 8 to 1 outputs - VMOVDQU (R13), Y4 - ADDQ $0x20, R13 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 512(CX), Y2 - VMOVDQU 544(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Load and process 32 bytes from input 9 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 576(CX), Y2 - VMOVDQU 608(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB 
Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) - - // Store 1 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_10x1_loop - VZEROUPPER - -mulAvxTwo_10x1_end: - RET - // func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x1_64(SB), $0-88 @@ -81221,6 +103105,130 @@ mulGFNI_10x1_64_loop: mulGFNI_10x1_64_end: RET +// func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulAvxGFNI_10x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y10 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VGF2P8AFFINEQB $0x00, Y8, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y9, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Store 1 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x1_loop + VZEROUPPER + +mulAvxGFNI_10x1_end: + RET + // func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x1_64Xor(SB), $0-88 @@ -81349,182 
+103357,132 @@ mulGFNI_10x1_64Xor_loop: mulGFNI_10x1_64Xor_end: RET -// func mulAvxTwo_10x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x1Xor(SB), NOSPLIT, $0-88 - // Loading no tables to registers +// func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x1Xor(SB), $0-88 + // Loading all tables to registers // Destination kept in GP registers - // Full registers estimated 24 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x1Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), R13 - MOVQ 216(DX), DX - MOVQ out_base+48(FP), R14 - MOVQ (R14), R14 - MOVQ start+72(FP), R15 + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 // Add start offset to output - ADDQ R15, R14 + ADDQ R14, R13 // Add start offset to input - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, R12 - ADDQ R15, R13 - ADDQ R15, DX - MOVQ $0x0000000f, R15 - MOVQ R15, X1 - VPBROADCASTB X1, Y1 + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulAvxGFNI_10x1Xor_loop: + // Load 1 outputs + VMOVDQU (R13), Y10 -mulAvxTwo_10x1Xor_loop: // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (BX), Y4 - ADDQ $0x20, BX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU (R14), Y0 - VMOVDQU (CX), Y2 - VMOVDQU 32(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 1 to 1 outputs - VMOVDQU (SI), Y4 - ADDQ $0x20, SI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 64(CX), Y2 - VMOVDQU 96(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 2 to 1 outputs - VMOVDQU (DI), Y4 - ADDQ $0x20, DI - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 128(CX), Y2 - VMOVDQU 160(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 3 to 1 outputs - VMOVDQU (R8), Y4 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 192(CX), Y2 - VMOVDQU 224(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, 
Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 4 to 1 outputs - VMOVDQU (R9), Y4 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 256(CX), Y2 - VMOVDQU 288(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 5 to 1 outputs - VMOVDQU (R10), Y4 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 320(CX), Y2 - VMOVDQU 352(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 6 to 1 outputs - VMOVDQU (R11), Y4 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 384(CX), Y2 - VMOVDQU 416(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 7 to 1 outputs - VMOVDQU (R12), Y4 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 448(CX), Y2 - VMOVDQU 480(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 8 to 1 outputs - VMOVDQU (R13), Y4 - ADDQ $0x20, R13 - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 512(CX), Y2 - VMOVDQU 544(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VGF2P8AFFINEQB $0x00, Y8, Y11, Y11 + VXORPD Y10, Y11, Y10 // Load and process 32 bytes from input 9 to 1 outputs - VMOVDQU (DX), Y4 - ADDQ $0x20, DX - VPSRLQ $0x04, Y4, Y5 - VPAND Y1, Y4, Y4 - VPAND Y1, Y5, Y5 - VMOVDQU 576(CX), Y2 - VMOVDQU 608(CX), Y3 - VPSHUFB Y4, Y2, Y2 - VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y9, Y11, Y11 + VXORPD Y10, Y11, Y10 // Store 1 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_10x1Xor_loop + JNZ mulAvxGFNI_10x1Xor_loop VZEROUPPER -mulAvxTwo_10x1Xor_end: +mulAvxGFNI_10x1Xor_end: RET // func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -81780,237 +103738,6 @@ mulAvxTwo_10x1_64Xor_loop: mulAvxTwo_10x1_64Xor_end: RET -// func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x2(SB), NOSPLIT, $8-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 47 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x2_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), R13 - MOVQ 216(DX), DX - MOVQ out_base+48(FP), R14 - MOVQ (R14), R15 - MOVQ 24(R14), R14 - MOVQ start+72(FP), BP - - // Add start offset to output - ADDQ BP, R15 - ADDQ BP, R14 - - // Add start offset to 
input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, R13 - ADDQ BP, DX - MOVQ $0x0000000f, BP - MOVQ BP, X2 - VPBROADCASTB X2, Y2 - -mulAvxTwo_10x2_loop: - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y0 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y1 - - // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 6 to 2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (R12), Y5 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 8 to 2 outputs - VMOVDQU (R13), Y5 - ADDQ $0x20, R13 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1024(CX), Y3 - VMOVDQU 1056(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, 
Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1088(CX), Y3 - VMOVDQU 1120(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Load and process 32 bytes from input 9 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1152(CX), Y3 - VMOVDQU 1184(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1216(CX), Y3 - VMOVDQU 1248(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) - - // Store 2 outputs - VMOVDQU Y0, (R15) - ADDQ $0x20, R15 - VMOVDQU Y1, (R14) - ADDQ $0x20, R14 - - // Prepare for next loop - DECQ AX - JNZ mulAvxTwo_10x2_loop - VZEROUPPER - -mulAvxTwo_10x2_end: - RET - // func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x2_64(SB), $8-88 @@ -82502,6 +104229,163 @@ mulGFNI_10x2_64_loop: mulGFNI_10x2_64_end: RET +// func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x2(SB), $8-88 + // Loading 12 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + +mulAvxGFNI_10x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 
32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x2_loop + VZEROUPPER + +mulAvxGFNI_10x2_end: + RET + // func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x2_64Xor(SB), $0-88 @@ -82665,237 +104549,167 @@ mulGFNI_10x2_64Xor_loop: mulGFNI_10x2_64Xor_end: RET -// func mulAvxTwo_10x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x2Xor(SB), NOSPLIT, $8-88 - // Loading no tables to registers +// func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x2Xor(SB), $8-88 + // Loading 12 of 20 tables to registers // Destination kept in GP registers - // Full registers estimated 47 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x2Xor_end - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), R13 - MOVQ 216(DX), DX - MOVQ out_base+48(FP), R14 - MOVQ (R14), R15 - MOVQ 24(R14), R14 - MOVQ start+72(FP), BP + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R15 ADDQ BP, R14 // Add start offset to input - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, R13 - ADDQ BP, DX - MOVQ $0x0000000f, BP - MOVQ BP, X2 - VPBROADCASTB X2, Y2 + ADDQ BP, BX + ADDQ BP, SI + ADDQ 
BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + +mulAvxGFNI_10x2Xor_loop: + // Load 2 outputs + VMOVDQU (R15), Y12 + VMOVDQU (R14), Y13 -mulAvxTwo_10x2Xor_loop: // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (BX), Y5 - ADDQ $0x20, BX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU (R15), Y0 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU (R14), Y1 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 2 outputs - VMOVDQU (SI), Y5 - ADDQ $0x20, SI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 128(CX), Y3 - VMOVDQU 160(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 192(CX), Y3 - VMOVDQU 224(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 2 outputs - VMOVDQU (DI), Y5 - ADDQ $0x20, DI - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 256(CX), Y3 - VMOVDQU 288(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 320(CX), Y3 - VMOVDQU 352(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 2 outputs - VMOVDQU (R8), Y5 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 384(CX), Y3 - VMOVDQU 416(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 448(CX), Y3 - VMOVDQU 480(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 2 outputs - VMOVDQU (R9), Y5 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 512(CX), Y3 - VMOVDQU 544(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 576(CX), Y3 - VMOVDQU 608(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 2 outputs - VMOVDQU (R10), Y5 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 640(CX), Y3 - VMOVDQU 672(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 704(CX), Y3 - VMOVDQU 736(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 
2 outputs - VMOVDQU (R11), Y5 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 768(CX), Y3 - VMOVDQU 800(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 832(CX), Y3 - VMOVDQU 864(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 2 outputs - VMOVDQU (R12), Y5 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 896(CX), Y3 - VMOVDQU 928(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 960(CX), Y3 - VMOVDQU 992(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 2 outputs - VMOVDQU (R13), Y5 - ADDQ $0x20, R13 - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1024(CX), Y3 - VMOVDQU 1056(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1088(CX), Y3 - VMOVDQU 1120(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 2 outputs - VMOVDQU (DX), Y5 - ADDQ $0x20, DX - VPSRLQ $0x04, Y5, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y6, Y6 - VMOVDQU 1152(CX), Y3 - VMOVDQU 1184(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - VMOVDQU 1216(CX), Y3 - VMOVDQU 1248(CX), Y4 - VPSHUFB Y5, Y3, Y3 - VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 2 outputs - VMOVDQU Y0, (R15) + VMOVDQU Y12, (R15) ADDQ $0x20, R15 - VMOVDQU Y1, (R14) + VMOVDQU Y13, (R14) ADDQ $0x20, R14 // Prepare for next loop DECQ AX - JNZ mulAvxTwo_10x2Xor_loop + JNZ mulAvxGFNI_10x2Xor_loop VZEROUPPER -mulAvxTwo_10x2Xor_end: +mulAvxGFNI_10x2Xor_end: RET // func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -83238,293 +105052,6 @@ mulAvxTwo_10x2_64Xor_loop: mulAvxTwo_10x2_64Xor_end: RET -// func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x3(SB), NOSPLIT, $8-88 - // Loading no tables to registers - // Destination kept in GP registers - // Full registers estimated 68 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x3_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), R12 - MOVQ 216(AX), AX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R15 - MOVQ 48(R13), R13 - MOVQ start+72(FP), BP - - 
// Add start offset to output - ADDQ BP, R14 - ADDQ BP, R15 - ADDQ BP, R13 - - // Add start offset to input - ADDQ BP, DX - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, AX - MOVQ $0x0000000f, BP - MOVQ BP, X3 - VPBROADCASTB X3, Y3 - MOVQ n+80(FP), BP - SHRQ $0x05, BP - -mulAvxTwo_10x3_loop: - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y0 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y1 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y2 - - // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 
1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 8 to 3 outputs - VMOVDQU (R12), Y6 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1536(CX), Y4 - VMOVDQU 1568(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1600(CX), Y4 - VMOVDQU 1632(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1664(CX), Y4 - VMOVDQU 1696(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Load and process 32 bytes from input 9 to 3 outputs - VMOVDQU (AX), Y6 - ADDQ $0x20, AX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1728(CX), Y4 - VMOVDQU 1760(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1792(CX), Y4 - VMOVDQU 1824(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1856(CX), Y4 - VMOVDQU 1888(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) - - // Store 3 outputs - VMOVDQU Y0, (R14) - ADDQ $0x20, R14 - VMOVDQU Y1, (R15) - ADDQ $0x20, R15 - VMOVDQU Y2, (R13) - ADDQ $0x20, R13 - - // Prepare for next loop - DECQ BP - JNZ mulAvxTwo_10x3_loop - VZEROUPPER - -mulAvxTwo_10x3_end: - RET - // func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x3_64(SB), $8-88 @@ -84139,6 +105666,200 @@ mulGFNI_10x3_64_loop: mulGFNI_10x3_64_end: RET +// func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x3(SB), $8-88 + // Loading 11 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ 
BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_10x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare 
for next loop + DECQ BP + JNZ mulAvxGFNI_10x3_loop + VZEROUPPER + +mulAvxGFNI_10x3_end: + RET + // func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x3_64Xor(SB), $8-88 @@ -84338,33 +106059,45 @@ mulGFNI_10x3_64Xor_loop: mulGFNI_10x3_64Xor_end: RET -// func mulAvxTwo_10x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·mulAvxTwo_10x3Xor(SB), NOSPLIT, $8-88 - // Loading no tables to registers +// func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x3Xor(SB), $8-88 + // Loading 11 of 30 tables to registers // Destination kept in GP registers - // Full registers estimated 68 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x05, AX - TESTQ AX, AX - JZ mulAvxTwo_10x3Xor_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), R12 - MOVQ 216(AX), AX - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R15 - MOVQ 48(R13), R13 - MOVQ start+72(FP), BP + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 @@ -84372,260 +106105,160 @@ TEXT ·mulAvxTwo_10x3Xor(SB), NOSPLIT, $8-88 ADDQ BP, R13 // Add start offset to input - ADDQ BP, DX - ADDQ BP, BX - ADDQ BP, SI - ADDQ BP, DI - ADDQ BP, R8 - ADDQ BP, R9 - ADDQ BP, R10 - ADDQ BP, R11 - ADDQ BP, R12 - ADDQ BP, AX - MOVQ $0x0000000f, BP - MOVQ BP, X3 - VPBROADCASTB X3, Y3 - MOVQ n+80(FP), BP - SHRQ $0x05, BP + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_10x3Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R13), Y13 -mulAvxTwo_10x3Xor_loop: // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y6 - ADDQ $0x20, DX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU (R14), Y0 - VMOVDQU (CX), Y4 - VMOVDQU 32(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU (R15), Y1 - VMOVDQU 64(CX), Y4 - VMOVDQU 96(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU (R13), Y2 - VMOVDQU 128(CX), Y4 - VMOVDQU 160(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + 
VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y6 - ADDQ $0x20, BX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 192(CX), Y4 - VMOVDQU 224(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 256(CX), Y4 - VMOVDQU 288(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 320(CX), Y4 - VMOVDQU 352(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y6 - ADDQ $0x20, SI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 384(CX), Y4 - VMOVDQU 416(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 448(CX), Y4 - VMOVDQU 480(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 512(CX), Y4 - VMOVDQU 544(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 3 to 3 outputs - VMOVDQU (DI), Y6 - ADDQ $0x20, DI - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 576(CX), Y4 - VMOVDQU 608(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 640(CX), Y4 - VMOVDQU 672(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 704(CX), Y4 - VMOVDQU 736(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 4 to 3 outputs - VMOVDQU (R8), Y6 - ADDQ $0x20, R8 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 768(CX), Y4 - VMOVDQU 800(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 832(CX), Y4 - VMOVDQU 864(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 896(CX), Y4 - VMOVDQU 928(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 5 to 3 outputs - VMOVDQU (R9), Y6 - ADDQ $0x20, R9 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 960(CX), Y4 - VMOVDQU 992(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1024(CX), Y4 - VMOVDQU 1056(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1088(CX), Y4 - VMOVDQU 1120(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + 
VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 6 to 3 outputs - VMOVDQU (R10), Y6 - ADDQ $0x20, R10 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1152(CX), Y4 - VMOVDQU 1184(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1216(CX), Y4 - VMOVDQU 1248(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1280(CX), Y4 - VMOVDQU 1312(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 7 to 3 outputs - VMOVDQU (R11), Y6 - ADDQ $0x20, R11 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1344(CX), Y4 - VMOVDQU 1376(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1408(CX), Y4 - VMOVDQU 1440(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1472(CX), Y4 - VMOVDQU 1504(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 8 to 3 outputs - VMOVDQU (R12), Y6 - ADDQ $0x20, R12 - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1536(CX), Y4 - VMOVDQU 1568(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1600(CX), Y4 - VMOVDQU 1632(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1664(CX), Y4 - VMOVDQU 1696(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Load and process 32 bytes from input 9 to 3 outputs - VMOVDQU (AX), Y6 - ADDQ $0x20, AX - VPSRLQ $0x04, Y6, Y7 - VPAND Y3, Y6, Y6 - VPAND Y3, Y7, Y7 - VMOVDQU 1728(CX), Y4 - VMOVDQU 1760(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) - VMOVDQU 1792(CX), Y4 - VMOVDQU 1824(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) - VMOVDQU 1856(CX), Y4 - VMOVDQU 1888(CX), Y5 - VPSHUFB Y6, Y4, Y4 - VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU Y0, 
(R14) + VMOVDQU Y11, (R14) ADDQ $0x20, R14 - VMOVDQU Y1, (R15) + VMOVDQU Y12, (R15) ADDQ $0x20, R15 - VMOVDQU Y2, (R13) + VMOVDQU Y13, (R13) ADDQ $0x20, R13 // Prepare for next loop DECQ BP - JNZ mulAvxTwo_10x3Xor_loop + JNZ mulAvxGFNI_10x3Xor_loop VZEROUPPER -mulAvxTwo_10x3Xor_end: +mulAvxGFNI_10x3Xor_end: RET // func mulAvxTwo_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -85589,6 +107222,220 @@ mulGFNI_10x4_64_loop: mulGFNI_10x4_64_end: RET +// func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x4(SB), $8-88 + // Loading 10 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), 
Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x4_loop + VZEROUPPER + +mulAvxGFNI_10x4_end: + RET + // func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x4_64Xor(SB), $8-88 @@ -85803,6 +107650,234 @@ mulGFNI_10x4_64Xor_loop: mulGFNI_10x4_64Xor_end: RET +// func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x4Xor(SB), $8-88 + // Loading 10 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), 
R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x4Xor_loop: + // Load 4 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 
224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x4Xor_loop + VZEROUPPER + +mulAvxGFNI_10x4Xor_end: + RET + // func mulAvxTwo_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x4Xor(SB), NOSPLIT, $8-88 @@ -86743,6 +108818,251 @@ mulGFNI_10x5_64_loop: mulGFNI_10x5_64_end: RET +// func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x5(SB), $8-88 + // Loading 9 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x5_loop + VZEROUPPER + +mulAvxGFNI_10x5_end: + RET + // func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x5_64Xor(SB), $8-88 @@ -86980,6 +109300,268 @@ mulGFNI_10x5_64Xor_loop: mulGFNI_10x5_64Xor_end: RET +// func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x5Xor(SB), $8-88 + // Loading 9 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x5Xor_loop: + // Load 5 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD 
Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x5Xor_loop + VZEROUPPER + +mulAvxGFNI_10x5Xor_end: + RET + // func mulAvxTwo_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x5Xor(SB), NOSPLIT, $8-88 @@ -88046,6 +110628,282 @@ mulGFNI_10x6_64_loop: mulGFNI_10x6_64_end: RET +// func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x6(SB), $8-88 + // Loading 8 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + 
VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x6_loop + VZEROUPPER + +mulAvxGFNI_10x6_end: + RET + // func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x6_64Xor(SB), $8-88 @@ -88306,6 +111164,302 @@ mulGFNI_10x6_64Xor_loop: mulGFNI_10x6_64Xor_end: RET +// func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x6Xor(SB), $8-88 + // Loading 8 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x6Xor_loop: + // Load 6 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + 
VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x6Xor_loop + VZEROUPPER + +mulAvxGFNI_10x6Xor_end: + RET + // func mulAvxTwo_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x6Xor(SB), NOSPLIT, $8-88 @@ -89498,6 +112652,313 @@ mulGFNI_10x7_64_loop: mulGFNI_10x7_64_end: RET +// func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x7(SB), $8-88 + // Loading 7 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + 
VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x7_loop + VZEROUPPER + +mulAvxGFNI_10x7_end: + RET + // func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x7_64Xor(SB), $8-88 @@ -89781,6 +113242,336 @@ mulGFNI_10x7_64Xor_loop: mulGFNI_10x7_64Xor_end: RET +// func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x7Xor(SB), $8-88 + // Loading 7 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x7Xor_loop: + // Load 7 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and 
process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + 
+ // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x7Xor_loop + VZEROUPPER + +mulAvxGFNI_10x7Xor_end: + RET + // func mulAvxTwo_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x7Xor(SB), NOSPLIT, $8-88 @@ -91099,6 +114890,344 @@ mulGFNI_10x8_64_loop: mulGFNI_10x8_64_end: RET +// func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x8(SB), $8-88 + // Loading 6 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 
168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 
280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x8_loop + VZEROUPPER + +mulAvxGFNI_10x8_end: + RET + // func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x8_64Xor(SB), $8-88 @@ -91405,6 +115534,370 @@ mulGFNI_10x8_64Xor_loop: mulGFNI_10x8_64Xor_end: RET +// func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x8Xor(SB), $8-88 + // Loading 6 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x8Xor_loop: + // Load 8 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x8Xor_loop + VZEROUPPER + +mulAvxGFNI_10x8Xor_end: + RET + // func mulAvxTwo_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x8Xor(SB), NOSPLIT, $8-88 @@ -92849,6 +117342,375 @@ mulGFNI_10x9_64_loop: mulGFNI_10x9_64_end: RET +// func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x9(SB), $8-88 + // Loading 5 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 
48(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x9_loop + VZEROUPPER + +mulAvxGFNI_10x9_end: + RET + // func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x9_64Xor(SB), $8-88 @@ -93178,6 +118040,404 @@ mulGFNI_10x9_64Xor_loop: mulGFNI_10x9_64Xor_end: RET +// func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x9Xor(SB), $8-88 + // Loading 5 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x9Xor_loop: + // Load 9 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y5 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x9Xor_loop + VZEROUPPER + +mulAvxGFNI_10x9Xor_end: + RET + // func mulAvxTwo_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x9Xor(SB), NOSPLIT, $8-88 @@ -94748,6 +120008,406 @@ mulGFNI_10x10_64_loop: mulGFNI_10x10_64_end: RET +// func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x10(SB), $8-88 + // Loading 4 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 720(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 728(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 736(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 744(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 752(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 760(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 768(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 776(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 784(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 792(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x10_loop + VZEROUPPER + +mulAvxGFNI_10x10_end: + RET + // func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x10_64Xor(SB), $8-88 @@ -95100,6 +120760,438 @@ mulGFNI_10x10_64Xor_loop: mulGFNI_10x10_64Xor_end: RET +// func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x10Xor(SB), $8-88 + // Loading 4 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x10Xor_loop: + // Load 10 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y4 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y5 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 216(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 720(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 728(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 736(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 744(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 752(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 760(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 768(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 776(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 784(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 792(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x10Xor_loop + VZEROUPPER + +mulAvxGFNI_10x10Xor_end: + RET + // func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 TEXT ·mulAvxTwo_10x10Xor(SB), NOSPLIT, $8-88 diff --git a/galois_gen_none.go b/galois_gen_none.go index 02c3cf49..1bb268a3 100644 --- a/galois_gen_none.go +++ b/galois_gen_none.go @@ -23,3 +23,11 @@ func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { panic("codegen not available") } + +func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + panic("codegen not available") +} + +func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + panic("codegen not available") +} diff --git a/galois_gen_nopshufb_amd64.go b/galois_gen_nopshufb_amd64.go index b07f3f34..298bf504 100644 
--- a/galois_gen_nopshufb_amd64.go +++ b/galois_gen_nopshufb_amd64.go @@ -21,1100 +21,2200 @@ func avx2XorSlice_64(in []byte, out []byte) //go:noescape func mulGFNI_1x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x1 takes 1 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x1_64Xor takes 1 inputs and produces 1 outputs. // //go:noescape func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x1Xor takes 1 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x2_64 takes 1 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x2 takes 1 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x2_64Xor takes 1 inputs and produces 2 outputs. // //go:noescape func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x2Xor takes 1 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x3_64 takes 1 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x3 takes 1 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x3_64Xor takes 1 inputs and produces 3 outputs. // //go:noescape func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x3Xor takes 1 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x4_64 takes 1 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x4 takes 1 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x4_64Xor takes 1 inputs and produces 4 outputs. // //go:noescape func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x4Xor takes 1 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x5_64 takes 1 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x5 takes 1 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x5_64Xor takes 1 inputs and produces 5 outputs. // //go:noescape func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x5Xor takes 1 inputs and produces 5 outputs. 
+// +//go:noescape +func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x6_64 takes 1 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x6 takes 1 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x6_64Xor takes 1 inputs and produces 6 outputs. // //go:noescape func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x6Xor takes 1 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x7_64 takes 1 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x7 takes 1 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x7_64Xor takes 1 inputs and produces 7 outputs. // //go:noescape func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x7Xor takes 1 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x8_64 takes 1 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x8 takes 1 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x8_64Xor takes 1 inputs and produces 8 outputs. // //go:noescape func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x8Xor takes 1 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x9_64 takes 1 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x9 takes 1 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x9_64Xor takes 1 inputs and produces 9 outputs. // //go:noescape func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x9Xor takes 1 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x10_64 takes 1 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x10 takes 1 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_1x10_64Xor takes 1 inputs and produces 10 outputs. 
// //go:noescape func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_1x10Xor takes 1 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x1_64 takes 2 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x1 takes 2 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x1_64Xor takes 2 inputs and produces 1 outputs. // //go:noescape func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x1Xor takes 2 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x2_64 takes 2 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x2 takes 2 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x2_64Xor takes 2 inputs and produces 2 outputs. // //go:noescape func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x2Xor takes 2 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x3_64 takes 2 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x3 takes 2 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x3_64Xor takes 2 inputs and produces 3 outputs. // //go:noescape func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x3Xor takes 2 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x4_64 takes 2 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x4 takes 2 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x4_64Xor takes 2 inputs and produces 4 outputs. // //go:noescape func mulGFNI_2x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x4Xor takes 2 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x5_64 takes 2 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x5 takes 2 inputs and produces 5 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x5_64Xor takes 2 inputs and produces 5 outputs. // //go:noescape func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x5Xor takes 2 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x6_64 takes 2 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x6 takes 2 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x6_64Xor takes 2 inputs and produces 6 outputs. // //go:noescape func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x6Xor takes 2 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x7_64 takes 2 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x7 takes 2 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x7_64Xor takes 2 inputs and produces 7 outputs. // //go:noescape func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x7Xor takes 2 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x8_64 takes 2 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x8 takes 2 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x8_64Xor takes 2 inputs and produces 8 outputs. // //go:noescape func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x8Xor takes 2 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x9_64 takes 2 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x9 takes 2 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x9_64Xor takes 2 inputs and produces 9 outputs. // //go:noescape func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x9Xor takes 2 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x10_64 takes 2 inputs and produces 10 outputs. // The output is initialized to 0. 
// //go:noescape func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x10 takes 2 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_2x10_64Xor takes 2 inputs and produces 10 outputs. // //go:noescape func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_2x10Xor takes 2 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x1_64 takes 3 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x1 takes 3 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x1_64Xor takes 3 inputs and produces 1 outputs. // //go:noescape func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x1Xor takes 3 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x2_64 takes 3 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x2 takes 3 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x2_64Xor takes 3 inputs and produces 2 outputs. // //go:noescape func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x2Xor takes 3 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x3_64 takes 3 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x3 takes 3 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x3_64Xor takes 3 inputs and produces 3 outputs. // //go:noescape func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x3Xor takes 3 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x4_64 takes 3 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x4 takes 3 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x4_64Xor takes 3 inputs and produces 4 outputs. // //go:noescape func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x4Xor takes 3 inputs and produces 4 outputs. 
+// +//go:noescape +func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x5_64 takes 3 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x5 takes 3 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x5_64Xor takes 3 inputs and produces 5 outputs. // //go:noescape func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x5Xor takes 3 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x6_64 takes 3 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x6 takes 3 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x6_64Xor takes 3 inputs and produces 6 outputs. // //go:noescape func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x6Xor takes 3 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x7_64 takes 3 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x7 takes 3 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x7_64Xor takes 3 inputs and produces 7 outputs. // //go:noescape func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x7Xor takes 3 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x8_64 takes 3 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x8 takes 3 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x8_64Xor takes 3 inputs and produces 8 outputs. // //go:noescape func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x8Xor takes 3 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x9_64 takes 3 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x9 takes 3 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x9_64Xor takes 3 inputs and produces 9 outputs. 
// //go:noescape func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x9Xor takes 3 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x10_64 takes 3 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x10 takes 3 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_3x10_64Xor takes 3 inputs and produces 10 outputs. // //go:noescape func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_3x10Xor takes 3 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x1_64 takes 4 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x1 takes 4 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x1_64Xor takes 4 inputs and produces 1 outputs. // //go:noescape func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x1Xor takes 4 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x2_64 takes 4 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x2 takes 4 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x2_64Xor takes 4 inputs and produces 2 outputs. // //go:noescape func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x2Xor takes 4 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x3_64 takes 4 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x3 takes 4 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x3_64Xor takes 4 inputs and produces 3 outputs. // //go:noescape func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x3Xor takes 4 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x4_64 takes 4 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x4 takes 4 inputs and produces 4 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x4_64Xor takes 4 inputs and produces 4 outputs. // //go:noescape func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x4Xor takes 4 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x5_64 takes 4 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x5 takes 4 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x5_64Xor takes 4 inputs and produces 5 outputs. // //go:noescape func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x5Xor takes 4 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x6_64 takes 4 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x6 takes 4 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x6_64Xor takes 4 inputs and produces 6 outputs. // //go:noescape func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x6Xor takes 4 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x7_64 takes 4 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x7 takes 4 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x7_64Xor takes 4 inputs and produces 7 outputs. // //go:noescape func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x7Xor takes 4 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x8_64 takes 4 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x8 takes 4 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x8_64Xor takes 4 inputs and produces 8 outputs. // //go:noescape func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x8Xor takes 4 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x9_64 takes 4 inputs and produces 9 outputs. // The output is initialized to 0. 
// //go:noescape func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x9 takes 4 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x9_64Xor takes 4 inputs and produces 9 outputs. // //go:noescape func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x9Xor takes 4 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x10_64 takes 4 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x10 takes 4 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_4x10_64Xor takes 4 inputs and produces 10 outputs. // //go:noescape func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_4x10Xor takes 4 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x1_64 takes 5 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x1 takes 5 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x1_64Xor takes 5 inputs and produces 1 outputs. // //go:noescape func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x1Xor takes 5 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x2_64 takes 5 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x2 takes 5 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x2_64Xor takes 5 inputs and produces 2 outputs. // //go:noescape func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x2Xor takes 5 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x3_64 takes 5 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x3 takes 5 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x3_64Xor takes 5 inputs and produces 3 outputs. // //go:noescape func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x3Xor takes 5 inputs and produces 3 outputs. 
+// +//go:noescape +func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x4_64 takes 5 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x4 takes 5 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x4_64Xor takes 5 inputs and produces 4 outputs. // //go:noescape func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x4Xor takes 5 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x5_64 takes 5 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x5 takes 5 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x5_64Xor takes 5 inputs and produces 5 outputs. // //go:noescape func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x5Xor takes 5 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x6_64 takes 5 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x6 takes 5 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x6_64Xor takes 5 inputs and produces 6 outputs. // //go:noescape func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x6Xor takes 5 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x7_64 takes 5 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x7 takes 5 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x7_64Xor takes 5 inputs and produces 7 outputs. // //go:noescape func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x7Xor takes 5 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x8_64 takes 5 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x8 takes 5 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x8_64Xor takes 5 inputs and produces 8 outputs. 
// //go:noescape func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x8Xor takes 5 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x9_64 takes 5 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x9 takes 5 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x9_64Xor takes 5 inputs and produces 9 outputs. // //go:noescape func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x9Xor takes 5 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x10_64 takes 5 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x10 takes 5 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_5x10_64Xor takes 5 inputs and produces 10 outputs. // //go:noescape func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_5x10Xor takes 5 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x1_64 takes 6 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x1 takes 6 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x1_64Xor takes 6 inputs and produces 1 outputs. // //go:noescape func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x1Xor takes 6 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x2_64 takes 6 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x2 takes 6 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x2_64Xor takes 6 inputs and produces 2 outputs. // //go:noescape func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x2Xor takes 6 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x3_64 takes 6 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x3 takes 6 inputs and produces 3 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x3_64Xor takes 6 inputs and produces 3 outputs. // //go:noescape func mulGFNI_6x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x3Xor takes 6 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x4_64 takes 6 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x4 takes 6 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x4_64Xor takes 6 inputs and produces 4 outputs. // //go:noescape func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x4Xor takes 6 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x5_64 takes 6 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x5 takes 6 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x5_64Xor takes 6 inputs and produces 5 outputs. // //go:noescape func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x5Xor takes 6 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x6_64 takes 6 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x6 takes 6 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x6_64Xor takes 6 inputs and produces 6 outputs. // //go:noescape func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x6Xor takes 6 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x7_64 takes 6 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x7 takes 6 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x7_64Xor takes 6 inputs and produces 7 outputs. // //go:noescape func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x7Xor takes 6 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x8_64 takes 6 inputs and produces 8 outputs. // The output is initialized to 0. 
// //go:noescape func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x8 takes 6 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x8_64Xor takes 6 inputs and produces 8 outputs. // //go:noescape func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x8Xor takes 6 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x9_64 takes 6 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x9 takes 6 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x9_64Xor takes 6 inputs and produces 9 outputs. // //go:noescape func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x9Xor takes 6 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x10_64 takes 6 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x10 takes 6 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_6x10_64Xor takes 6 inputs and produces 10 outputs. // //go:noescape func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_6x10Xor takes 6 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x1_64 takes 7 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x1 takes 7 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x1_64Xor takes 7 inputs and produces 1 outputs. // //go:noescape func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x1Xor takes 7 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x2_64 takes 7 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x2 takes 7 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x2_64Xor takes 7 inputs and produces 2 outputs. // //go:noescape func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x2Xor takes 7 inputs and produces 2 outputs. 
+// +//go:noescape +func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x3_64 takes 7 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x3 takes 7 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x3_64Xor takes 7 inputs and produces 3 outputs. // //go:noescape func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x3Xor takes 7 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x4_64 takes 7 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape -func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + +// mulAvxGFNI_7x4 takes 7 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // mulGFNI_7x4_64Xor takes 7 inputs and produces 4 outputs. // //go:noescape func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x4Xor takes 7 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x5_64 takes 7 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x5 takes 7 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x5_64Xor takes 7 inputs and produces 5 outputs. // //go:noescape func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x5Xor takes 7 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x6_64 takes 7 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x6 takes 7 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x6_64Xor takes 7 inputs and produces 6 outputs. // //go:noescape func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x6Xor takes 7 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x7_64 takes 7 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x7 takes 7 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x7_64Xor takes 7 inputs and produces 7 outputs. 
// //go:noescape func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x7Xor takes 7 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x8_64 takes 7 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x8 takes 7 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x8_64Xor takes 7 inputs and produces 8 outputs. // //go:noescape func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x8Xor takes 7 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x9_64 takes 7 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x9 takes 7 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x9_64Xor takes 7 inputs and produces 9 outputs. // //go:noescape func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x9Xor takes 7 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x10_64 takes 7 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x10 takes 7 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_7x10_64Xor takes 7 inputs and produces 10 outputs. // //go:noescape func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_7x10Xor takes 7 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x1_64 takes 8 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x1 takes 8 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x1_64Xor takes 8 inputs and produces 1 outputs. // //go:noescape func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x1Xor takes 8 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x2_64 takes 8 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x2 takes 8 inputs and produces 2 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x2_64Xor takes 8 inputs and produces 2 outputs. // //go:noescape func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x2Xor takes 8 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x3_64 takes 8 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x3 takes 8 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x3_64Xor takes 8 inputs and produces 3 outputs. // //go:noescape func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x3Xor takes 8 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x4_64 takes 8 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x4 takes 8 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x4_64Xor takes 8 inputs and produces 4 outputs. // //go:noescape func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x4Xor takes 8 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x5_64 takes 8 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x5 takes 8 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x5_64Xor takes 8 inputs and produces 5 outputs. // //go:noescape func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x5Xor takes 8 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x6_64 takes 8 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x6 takes 8 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x6_64Xor takes 8 inputs and produces 6 outputs. // //go:noescape func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x6Xor takes 8 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x7_64 takes 8 inputs and produces 7 outputs. // The output is initialized to 0. 
// //go:noescape func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x7 takes 8 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x7_64Xor takes 8 inputs and produces 7 outputs. // //go:noescape func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x7Xor takes 8 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x8_64 takes 8 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x8 takes 8 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x8_64Xor takes 8 inputs and produces 8 outputs. // //go:noescape func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x8Xor takes 8 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x9_64 takes 8 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x9 takes 8 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x9_64Xor takes 8 inputs and produces 9 outputs. // //go:noescape func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x9Xor takes 8 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_8x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x10_64 takes 8 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x10 takes 8 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_8x10_64Xor takes 8 inputs and produces 10 outputs. // //go:noescape func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_8x10Xor takes 8 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x1_64 takes 9 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x1 takes 9 inputs and produces 1 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x1_64Xor takes 9 inputs and produces 1 outputs. // //go:noescape func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x1Xor takes 9 inputs and produces 1 outputs. 
+// +//go:noescape +func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x2_64 takes 9 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x2 takes 9 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x2_64Xor takes 9 inputs and produces 2 outputs. // //go:noescape func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x2Xor takes 9 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x3_64 takes 9 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x3 takes 9 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x3_64Xor takes 9 inputs and produces 3 outputs. // //go:noescape func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x3Xor takes 9 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x4_64 takes 9 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x4 takes 9 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x4_64Xor takes 9 inputs and produces 4 outputs. // //go:noescape func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x4Xor takes 9 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x5_64 takes 9 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x5 takes 9 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x5_64Xor takes 9 inputs and produces 5 outputs. // //go:noescape func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x5Xor takes 9 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x6_64 takes 9 inputs and produces 6 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x6 takes 9 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x6_64Xor takes 9 inputs and produces 6 outputs. 
// //go:noescape func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x6Xor takes 9 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x7_64 takes 9 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x7 takes 9 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x7_64Xor takes 9 inputs and produces 7 outputs. // //go:noescape func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x7Xor takes 9 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x8_64 takes 9 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x8 takes 9 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x8_64Xor takes 9 inputs and produces 8 outputs. // //go:noescape func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x8Xor takes 9 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x9_64 takes 9 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x9 takes 9 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x9_64Xor takes 9 inputs and produces 9 outputs. // //go:noescape func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x9Xor takes 9 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x10_64 takes 9 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x10 takes 9 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_9x10_64Xor takes 9 inputs and produces 10 outputs. // //go:noescape func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_9x10Xor takes 9 inputs and produces 10 outputs. +// +//go:noescape +func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x1_64 takes 10 inputs and produces 1 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x1 takes 10 inputs and produces 1 outputs. +// The output is initialized to 0. 
+// +//go:noescape +func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x1_64Xor takes 10 inputs and produces 1 outputs. // //go:noescape func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x1Xor takes 10 inputs and produces 1 outputs. +// +//go:noescape +func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x2_64 takes 10 inputs and produces 2 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x2 takes 10 inputs and produces 2 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x2_64Xor takes 10 inputs and produces 2 outputs. // //go:noescape func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x2Xor takes 10 inputs and produces 2 outputs. +// +//go:noescape +func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x3_64 takes 10 inputs and produces 3 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x3 takes 10 inputs and produces 3 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x3_64Xor takes 10 inputs and produces 3 outputs. // //go:noescape func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x3Xor takes 10 inputs and produces 3 outputs. +// +//go:noescape +func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x4_64 takes 10 inputs and produces 4 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x4 takes 10 inputs and produces 4 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x4_64Xor takes 10 inputs and produces 4 outputs. // //go:noescape func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x4Xor takes 10 inputs and produces 4 outputs. +// +//go:noescape +func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x5_64 takes 10 inputs and produces 5 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x5 takes 10 inputs and produces 5 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x5_64Xor takes 10 inputs and produces 5 outputs. // //go:noescape func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x5Xor takes 10 inputs and produces 5 outputs. +// +//go:noescape +func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x6_64 takes 10 inputs and produces 6 outputs. // The output is initialized to 0. 
// //go:noescape func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x6 takes 10 inputs and produces 6 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x6_64Xor takes 10 inputs and produces 6 outputs. // //go:noescape func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x6Xor takes 10 inputs and produces 6 outputs. +// +//go:noescape +func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x7_64 takes 10 inputs and produces 7 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x7 takes 10 inputs and produces 7 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x7_64Xor takes 10 inputs and produces 7 outputs. // //go:noescape func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x7Xor takes 10 inputs and produces 7 outputs. +// +//go:noescape +func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x8_64 takes 10 inputs and produces 8 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x8 takes 10 inputs and produces 8 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x8_64Xor takes 10 inputs and produces 8 outputs. // //go:noescape func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x8Xor takes 10 inputs and produces 8 outputs. +// +//go:noescape +func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x9_64 takes 10 inputs and produces 9 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x9 takes 10 inputs and produces 9 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x9_64Xor takes 10 inputs and produces 9 outputs. // //go:noescape func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x9Xor takes 10 inputs and produces 9 outputs. +// +//go:noescape +func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x10_64 takes 10 inputs and produces 10 outputs. // The output is initialized to 0. // //go:noescape func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x10 takes 10 inputs and produces 10 outputs. +// The output is initialized to 0. +// +//go:noescape +func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + // mulGFNI_10x10_64Xor takes 10 inputs and produces 10 outputs. // //go:noescape func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// mulAvxGFNI_10x10Xor takes 10 inputs and produces 10 outputs. 
+// +//go:noescape +func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) + //go:noescape func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) diff --git a/galois_gen_nopshufb_amd64.s b/galois_gen_nopshufb_amd64.s index 574dfe9b..5782759c 100644 --- a/galois_gen_nopshufb_amd64.s +++ b/galois_gen_nopshufb_amd64.s @@ -153,6 +153,49 @@ mulGFNI_1x1_64_loop: mulGFNI_1x1_64_end: RET +// func mulAvxGFNI_1x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x1_end + VBROADCASTSD (CX), Y0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulAvxGFNI_1x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y1 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y1, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x1_loop + VZEROUPPER + +mulAvxGFNI_1x1_end: + RET + // func mulGFNI_1x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x1_64Xor(SB), $0-88 @@ -200,6 +243,53 @@ mulGFNI_1x1_64Xor_loop: mulGFNI_1x1_64Xor_end: RET +// func mulAvxGFNI_1x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 4 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x1Xor_end + VBROADCASTSD (CX), Y0 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + +mulAvxGFNI_1x1Xor_loop: + // Load 1 outputs + VMOVDQU (DX), Y1 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y2 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y2, Y2 + VXORPD Y1, Y2, Y1 + + // Store 1 outputs + VMOVDQU Y1, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x1Xor_loop + VZEROUPPER + +mulAvxGFNI_1x1Xor_end: + RET + // func mulGFNI_1x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x2_64(SB), $0-88 @@ -249,6 +339,55 @@ mulGFNI_1x2_64_loop: mulGFNI_1x2_64_end: RET +// func mulAvxGFNI_1x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + 
+mulAvxGFNI_1x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y2 + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + VMOVDQU Y3, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x2_loop + VZEROUPPER + +mulAvxGFNI_1x2_end: + RET + // func mulGFNI_1x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x2_64Xor(SB), $0-88 @@ -304,6 +443,61 @@ mulGFNI_1x2_64Xor_loop: mulGFNI_1x2_64Xor_end: RET +// func mulAvxGFNI_1x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + +mulAvxGFNI_1x2Xor_loop: + // Load 2 outputs + VMOVDQU (BX), Y2 + VMOVDQU (DX), Y3 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y5 + VXORPD Y2, Y5, Y2 + VGF2P8AFFINEQB $0x00, Y1, Y4, Y5 + VXORPD Y3, Y5, Y3 + + // Store 2 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + VMOVDQU Y3, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x2Xor_loop + VZEROUPPER + +mulAvxGFNI_1x2Xor_end: + RET + // func mulGFNI_1x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x3_64(SB), $0-88 @@ -359,6 +553,61 @@ mulGFNI_1x3_64_loop: mulGFNI_1x3_64_end: RET +// func mulAvxGFNI_1x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulAvxGFNI_1x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y3 + VGF2P8AFFINEQB $0x00, Y1, Y5, Y4 + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x3_loop + VZEROUPPER + +mulAvxGFNI_1x3_end: + RET + // func mulGFNI_1x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x3_64Xor(SB), $0-88 @@ -422,6 +671,69 @@ mulGFNI_1x3_64Xor_loop: mulGFNI_1x3_64Xor_end: RET +// func mulAvxGFNI_1x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n 
int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + +mulAvxGFNI_1x3Xor_loop: + // Load 3 outputs + VMOVDQU (BX), Y3 + VMOVDQU (SI), Y4 + VMOVDQU (DX), Y5 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y7 + VXORPD Y3, Y7, Y3 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 3 outputs + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x3Xor_loop + VZEROUPPER + +mulAvxGFNI_1x3Xor_end: + RET + // func mulGFNI_1x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x4_64(SB), $0-88 @@ -483,6 +795,67 @@ mulGFNI_1x4_64_loop: mulGFNI_1x4_64_end: RET +// func mulAvxGFNI_1x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x4(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulAvxGFNI_1x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y7, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y7, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + + // Store 4 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x4_loop + VZEROUPPER + +mulAvxGFNI_1x4_end: + RET + // func mulGFNI_1x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x4_64Xor(SB), $0-88 @@ -554,6 +927,77 @@ mulGFNI_1x4_64Xor_loop: mulGFNI_1x4_64Xor_end: RET +// func mulAvxGFNI_1x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x4Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), 
CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, DX + + // Add start offset to input + ADDQ R8, CX + +mulAvxGFNI_1x4Xor_loop: + // Load 4 outputs + VMOVDQU (BX), Y4 + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU (DX), Y7 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y9 + VXORPD Y4, Y9, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y9 + VXORPD Y5, Y9, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 4 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x4Xor_loop + VZEROUPPER + +mulAvxGFNI_1x4Xor_end: + RET + // func mulGFNI_1x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x5_64(SB), $0-88 @@ -621,6 +1065,73 @@ mulGFNI_1x5_64_loop: mulGFNI_1x5_64_end: RET +// func mulAvxGFNI_1x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x5(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulAvxGFNI_1x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y9, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + + // Store 5 outputs + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x5_loop + VZEROUPPER + +mulAvxGFNI_1x5_end: + RET + // func mulGFNI_1x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x5_64Xor(SB), $0-88 @@ -700,6 +1211,85 @@ mulGFNI_1x5_64Xor_loop: mulGFNI_1x5_64Xor_end: RET +// func mulAvxGFNI_1x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x5Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), 
BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, DX + + // Add start offset to input + ADDQ R9, CX + +mulAvxGFNI_1x5Xor_loop: + // Load 5 outputs + VMOVDQU (BX), Y5 + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (DX), Y9 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y11 + VXORPD Y5, Y11, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y11 + VXORPD Y6, Y11, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y7, Y11, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 5 outputs + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x5Xor_loop + VZEROUPPER + +mulAvxGFNI_1x5Xor_end: + RET + // func mulGFNI_1x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x6_64(SB), $0-88 @@ -773,6 +1363,79 @@ mulGFNI_1x6_64_loop: mulGFNI_1x6_64_end: RET +// func mulAvxGFNI_1x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x6(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulAvxGFNI_1x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y11, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y11, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + + // Store 6 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x6_loop + VZEROUPPER + +mulAvxGFNI_1x6_end: + RET + // func mulGFNI_1x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x6_64Xor(SB), $0-88 @@ -860,6 +1523,93 @@ mulGFNI_1x6_64Xor_loop: mulGFNI_1x6_64Xor_end: RET +// func mulAvxGFNI_1x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x6Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x6Xor_end + VBROADCASTSD 
(CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DX + + // Add start offset to input + ADDQ R10, CX + +mulAvxGFNI_1x6Xor_loop: + // Load 6 outputs + VMOVDQU (BX), Y6 + VMOVDQU (SI), Y7 + VMOVDQU (DI), Y8 + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (DX), Y11 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y6, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y7, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 6 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x6Xor_loop + VZEROUPPER + +mulAvxGFNI_1x6Xor_end: + RET + // func mulGFNI_1x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x7_64(SB), $0-88 @@ -939,6 +1689,85 @@ mulGFNI_1x7_64_loop: mulGFNI_1x7_64_end: RET +// func mulAvxGFNI_1x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x7(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulAvxGFNI_1x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (CX), Y13 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y13, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y13, Y13 + + // Store 7 outputs + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x7_loop + VZEROUPPER + +mulAvxGFNI_1x7_end: + RET + // func mulGFNI_1x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start 
int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x7_64Xor(SB), $0-88 @@ -1034,6 +1863,101 @@ mulGFNI_1x7_64Xor_loop: mulGFNI_1x7_64Xor_end: RET +// func mulAvxGFNI_1x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x7Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DX + + // Add start offset to input + ADDQ R11, CX + +mulAvxGFNI_1x7Xor_loop: + // Load 7 outputs + VMOVDQU (BX), Y7 + VMOVDQU (SI), Y8 + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (DX), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x7Xor_loop + VZEROUPPER + +mulAvxGFNI_1x7Xor_end: + RET + // func mulGFNI_1x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x8_64(SB), $0-88 @@ -1119,6 +2043,91 @@ mulGFNI_1x8_64_loop: mulGFNI_1x8_64_end: RET +// func mulAvxGFNI_1x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x8(SB), $0-88 + // Loading 6 of 8 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + +mulAvxGFNI_1x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + 
VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y13, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 56(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 8 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x8_loop + VZEROUPPER + +mulAvxGFNI_1x8_end: + RET + // func mulGFNI_1x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x8_64Xor(SB), $0-88 @@ -1222,6 +2231,109 @@ mulGFNI_1x8_64Xor_loop: mulGFNI_1x8_64Xor_end: RET +// func mulAvxGFNI_1x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x8Xor(SB), $0-88 + // Loading 6 of 8 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + +mulAvxGFNI_1x8Xor_loop: + // Load 8 outputs + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x8Xor_loop + VZEROUPPER + +mulAvxGFNI_1x8Xor_end: + RET + // func mulGFNI_1x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x9_64(SB), $0-88 @@ -1313,6 +2425,97 @@ mulGFNI_1x9_64_loop: mulGFNI_1x9_64_end: RET +// 
func mulAvxGFNI_1x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x9(SB), $0-88 + // Loading 5 of 9 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + +mulAvxGFNI_1x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y13, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 64(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 9 outputs + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x9_loop + VZEROUPPER + +mulAvxGFNI_1x9_end: + RET + // func mulGFNI_1x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_1x9_64Xor(SB), $0-88 @@ -1424,45 +2627,38 @@ mulGFNI_1x9_64Xor_loop: mulGFNI_1x9_64Xor_end: RET -// func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX512DQ, AVX512F, GFNI -TEXT ·mulGFNI_1x10_64(SB), $0-88 - // Loading all tables to registers +// func mulAvxGFNI_1x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x9Xor(SB), $0-88 + // Loading 5 of 9 tables to registers // Destination kept in GP registers - // Full registers estimated 22 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulGFNI_1x10_64_end - VBROADCASTF32X2 (CX), Z0 - VBROADCASTF32X2 8(CX), Z1 - VBROADCASTF32X2 16(CX), Z2 - VBROADCASTF32X2 24(CX), Z3 - VBROADCASTF32X2 32(CX), Z4 - VBROADCASTF32X2 40(CX), Z5 - VBROADCASTF32X2 48(CX), Z6 - VBROADCASTF32X2 56(CX), Z7 - VBROADCASTF32X2 64(CX), Z8 - VBROADCASTF32X2 72(CX), Z9 - MOVQ in_base+24(FP), CX - MOVQ (CX), CX - MOVQ out_base+48(FP), DX - MOVQ out_base+48(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), R8 - MOVQ 96(DX), R9 - MOVQ 120(DX), R10 - MOVQ 144(DX), R11 - MOVQ 168(DX), R12 - MOVQ 192(DX), R13 - MOVQ 216(DX), DX - MOVQ start+72(FP), R14 + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ 
AX, AX + JZ mulAvxGFNI_1x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 // Add start offset to output - ADDQ R14, BX ADDQ R14, SI ADDQ R14, DI ADDQ R14, R8 @@ -1471,59 +2667,80 @@ TEXT ·mulGFNI_1x10_64(SB), $0-88 ADDQ R14, R11 ADDQ R14, R12 ADDQ R14, R13 - ADDQ R14, DX + ADDQ R14, BX // Add start offset to input - ADDQ R14, CX + ADDQ R14, DX -mulGFNI_1x10_64_loop: - // Load and process 64 bytes from input 0 to 10 outputs - VMOVDQU64 (CX), Z19 - ADDQ $0x40, CX - VGF2P8AFFINEQB $0x00, Z0, Z19, Z10 - VGF2P8AFFINEQB $0x00, Z1, Z19, Z11 - VGF2P8AFFINEQB $0x00, Z2, Z19, Z12 - VGF2P8AFFINEQB $0x00, Z3, Z19, Z13 - VGF2P8AFFINEQB $0x00, Z4, Z19, Z14 - VGF2P8AFFINEQB $0x00, Z5, Z19, Z15 - VGF2P8AFFINEQB $0x00, Z6, Z19, Z16 - VGF2P8AFFINEQB $0x00, Z7, Z19, Z17 - VGF2P8AFFINEQB $0x00, Z8, Z19, Z18 - VGF2P8AFFINEQB $0x00, Z9, Z19, Z19 +mulAvxGFNI_1x9Xor_loop: + // Load 9 outputs + VMOVDQU (SI), Y5 + VMOVDQU (DI), Y6 + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 - // Store 10 outputs - VMOVDQU64 Z10, (BX) - ADDQ $0x40, BX - VMOVDQU64 Z11, (SI) - ADDQ $0x40, SI - VMOVDQU64 Z12, (DI) - ADDQ $0x40, DI - VMOVDQU64 Z13, (R8) - ADDQ $0x40, R8 - VMOVDQU64 Z14, (R9) - ADDQ $0x40, R9 - VMOVDQU64 Z15, (R10) - ADDQ $0x40, R10 - VMOVDQU64 Z16, (R11) - ADDQ $0x40, R11 - VMOVDQU64 Z17, (R12) - ADDQ $0x40, R12 - VMOVDQU64 Z18, (R13) - ADDQ $0x40, R13 - VMOVDQU64 Z19, (DX) - ADDQ $0x40, DX + // Store 9 outputs + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX // Prepare for next loop DECQ AX - JNZ mulGFNI_1x10_64_loop + JNZ mulAvxGFNI_1x9Xor_loop VZEROUPPER -mulGFNI_1x10_64_end: +mulAvxGFNI_1x9Xor_end: RET -// func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// func mulGFNI_1x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI -TEXT ·mulGFNI_1x10_64Xor(SB), $0-88 +TEXT ·mulGFNI_1x10_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 22 YMM used @@ -1531,7 +2748,7 @@ TEXT ·mulGFNI_1x10_64Xor(SB), 
$0-88 MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX - JZ mulGFNI_1x10_64Xor_end + JZ mulGFNI_1x10_64_end VBROADCASTF32X2 (CX), Z0 VBROADCASTF32X2 8(CX), Z1 VBROADCASTF32X2 16(CX), Z2 @@ -1573,42 +2790,236 @@ TEXT ·mulGFNI_1x10_64Xor(SB), $0-88 // Add start offset to input ADDQ R14, CX -mulGFNI_1x10_64Xor_loop: - // Load 10 outputs - VMOVDQU64 (BX), Z10 - VMOVDQU64 (SI), Z11 - VMOVDQU64 (DI), Z12 - VMOVDQU64 (R8), Z13 - VMOVDQU64 (R9), Z14 - VMOVDQU64 (R10), Z15 - VMOVDQU64 (R11), Z16 - VMOVDQU64 (R12), Z17 - VMOVDQU64 (R13), Z18 - VMOVDQU64 (DX), Z19 - +mulGFNI_1x10_64_loop: // Load and process 64 bytes from input 0 to 10 outputs - VMOVDQU64 (CX), Z20 + VMOVDQU64 (CX), Z19 ADDQ $0x40, CX - VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 - VXORPD Z10, Z21, Z10 - VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 - VXORPD Z11, Z21, Z11 - VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 - VXORPD Z12, Z21, Z12 - VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 - VXORPD Z13, Z21, Z13 - VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 - VXORPD Z14, Z21, Z14 - VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 - VXORPD Z15, Z21, Z15 - VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 - VXORPD Z16, Z21, Z16 - VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 - VXORPD Z17, Z21, Z17 - VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 - VXORPD Z18, Z21, Z18 - VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 - VXORPD Z19, Z21, Z19 + VGF2P8AFFINEQB $0x00, Z0, Z19, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z19, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z19, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z19, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z19, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z19, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z19, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z19, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z19, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z19, Z19 + + // Store 10 outputs + VMOVDQU64 Z10, (BX) + ADDQ $0x40, BX + VMOVDQU64 Z11, (SI) + ADDQ $0x40, SI + VMOVDQU64 Z12, (DI) + ADDQ $0x40, DI + VMOVDQU64 Z13, (R8) + ADDQ $0x40, R8 + VMOVDQU64 Z14, (R9) + ADDQ $0x40, R9 + VMOVDQU64 Z15, (R10) + ADDQ $0x40, R10 + VMOVDQU64 Z16, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z17, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z18, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z19, (DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_1x10_64_loop + VZEROUPPER + +mulGFNI_1x10_64_end: + RET + +// func mulAvxGFNI_1x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x10(SB), $0-88 + // Loading 4 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + +mulAvxGFNI_1x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y13, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y13, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y13, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y13, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y13, Y8 
+ VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y13, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y13, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y13, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y13, Y12 + VBROADCASTSD 72(CX), Y14 + VGF2P8AFFINEQB $0x00, Y14, Y13, Y13 + + // Store 10 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x10_loop + VZEROUPPER + +mulAvxGFNI_1x10_end: + RET + +// func mulGFNI_1x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_1x10_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_1x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DX + + // Add start offset to input + ADDQ R14, CX + +mulGFNI_1x10_64Xor_loop: + // Load 10 outputs + VMOVDQU64 (BX), Z10 + VMOVDQU64 (SI), Z11 + VMOVDQU64 (DI), Z12 + VMOVDQU64 (R8), Z13 + VMOVDQU64 (R9), Z14 + VMOVDQU64 (R10), Z15 + VMOVDQU64 (R11), Z16 + VMOVDQU64 (R12), Z17 + VMOVDQU64 (R13), Z18 + VMOVDQU64 (DX), Z19 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (CX), Z20 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z0, Z20, Z21 + VXORPD Z10, Z21, Z10 + VGF2P8AFFINEQB $0x00, Z1, Z20, Z21 + VXORPD Z11, Z21, Z11 + VGF2P8AFFINEQB $0x00, Z2, Z20, Z21 + VXORPD Z12, Z21, Z12 + VGF2P8AFFINEQB $0x00, Z3, Z20, Z21 + VXORPD Z13, Z21, Z13 + VGF2P8AFFINEQB $0x00, Z4, Z20, Z21 + VXORPD Z14, Z21, Z14 + VGF2P8AFFINEQB $0x00, Z5, Z20, Z21 + VXORPD Z15, Z21, Z15 + VGF2P8AFFINEQB $0x00, Z6, Z20, Z21 + VXORPD Z16, Z21, Z16 + VGF2P8AFFINEQB $0x00, Z7, Z20, Z21 + VXORPD Z17, Z21, Z17 + VGF2P8AFFINEQB $0x00, Z8, Z20, Z21 + VXORPD Z18, Z21, Z18 + VGF2P8AFFINEQB $0x00, Z9, Z20, Z21 + VXORPD Z19, Z21, Z19 // Store 10 outputs VMOVDQU64 Z10, (BX) @@ -1640,6 +3051,125 @@ mulGFNI_1x10_64Xor_loop: mulGFNI_1x10_64Xor_end: RET +// func mulAvxGFNI_1x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_1x10Xor(SB), $0-88 + // Loading 4 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_1x10Xor_end + VBROADCASTSD (CX), Y0 + 
VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + +mulAvxGFNI_1x10Xor_loop: + // Load 10 outputs + VMOVDQU (SI), Y4 + VMOVDQU (DI), Y5 + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (BX), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_1x10Xor_loop + VZEROUPPER + +mulAvxGFNI_1x10Xor_end: + RET + // func mulGFNI_2x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x1_64(SB), $0-88 @@ -1692,6 +3222,58 @@ mulGFNI_2x1_64_loop: mulGFNI_2x1_64_end: RET +// func mulAvxGFNI_2x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulAvxGFNI_2x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y3 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y2 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Store 1 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ 
mulAvxGFNI_2x1_loop + VZEROUPPER + +mulAvxGFNI_2x1_end: + RET + // func mulGFNI_2x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x1_64Xor(SB), $0-88 @@ -1748,6 +3330,62 @@ mulGFNI_2x1_64Xor_loop: mulGFNI_2x1_64Xor_end: RET +// func mulAvxGFNI_2x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 5 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + +mulAvxGFNI_2x1Xor_loop: + // Load 1 outputs + VMOVDQU (BX), Y2 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y3 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y3 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y1, Y3, Y3 + VXORPD Y2, Y3, Y2 + + // Store 1 outputs + VMOVDQU Y2, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x1Xor_loop + VZEROUPPER + +mulAvxGFNI_2x1Xor_end: + RET + // func mulGFNI_2x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x2_64(SB), $0-88 @@ -1809,6 +3447,67 @@ mulGFNI_2x2_64_loop: mulGFNI_2x2_64_end: RET +// func mulAvxGFNI_2x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulAvxGFNI_2x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y5 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y3, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 2 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x2_loop + VZEROUPPER + +mulAvxGFNI_2x2_end: + RET + // func mulGFNI_2x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x2_64Xor(SB), $0-88 @@ -1876,6 +3575,73 @@ mulGFNI_2x2_64Xor_loop: mulGFNI_2x2_64Xor_end: RET +// func mulAvxGFNI_2x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination 
kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + +mulAvxGFNI_2x2Xor_loop: + // Load 2 outputs + VMOVDQU (SI), Y4 + VMOVDQU (BX), Y5 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y6, Y7 + VXORPD Y4, Y7, Y4 + VGF2P8AFFINEQB $0x00, Y3, Y6, Y7 + VXORPD Y5, Y7, Y5 + + // Store 2 outputs + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x2Xor_loop + VZEROUPPER + +mulAvxGFNI_2x2Xor_end: + RET + // func mulGFNI_2x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x3_64(SB), $0-88 @@ -1946,6 +3712,76 @@ mulGFNI_2x3_64_loop: mulGFNI_2x3_64_end: RET +// func mulAvxGFNI_2x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulAvxGFNI_2x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y8 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Store 3 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x3_loop + VZEROUPPER + +mulAvxGFNI_2x3_end: + RET + // func mulGFNI_2x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x3_64Xor(SB), $0-88 @@ -2024,6 +3860,84 @@ mulGFNI_2x3_64Xor_loop: mulGFNI_2x3_64Xor_end: RET +// func mulAvxGFNI_2x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers 
estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, CX + +mulAvxGFNI_2x3Xor_loop: + // Load 3 outputs + VMOVDQU (SI), Y6 + VMOVDQU (DI), Y7 + VMOVDQU (BX), Y8 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y9, Y10 + VXORPD Y6, Y10, Y6 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y10 + VXORPD Y7, Y10, Y7 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y10 + VXORPD Y8, Y10, Y8 + + // Store 3 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x3Xor_loop + VZEROUPPER + +mulAvxGFNI_2x3Xor_end: + RET + // func mulGFNI_2x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x4_64(SB), $0-88 @@ -2103,6 +4017,85 @@ mulGFNI_2x4_64_loop: mulGFNI_2x4_64_end: RET +// func mulAvxGFNI_2x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x4(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulAvxGFNI_2x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y11 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 4 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x4_loop + VZEROUPPER + +mulAvxGFNI_2x4_end: + RET + // func mulGFNI_2x4_64Xor(matrix 
[]uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x4_64Xor(SB), $0-88 @@ -2192,6 +4185,95 @@ mulGFNI_2x4_64Xor_loop: mulGFNI_2x4_64Xor_end: RET +// func mulAvxGFNI_2x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x4Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, CX + +mulAvxGFNI_2x4Xor_loop: + // Load 4 outputs + VMOVDQU (SI), Y8 + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (BX), Y11 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y8, Y13, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 4 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x4Xor_loop + VZEROUPPER + +mulAvxGFNI_2x4Xor_end: + RET + // func mulGFNI_2x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x5_64(SB), $0-88 @@ -2280,6 +4362,94 @@ mulGFNI_2x5_64_loop: mulGFNI_2x5_64_end: RET +// func mulAvxGFNI_2x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x5(SB), $0-88 + // Loading 9 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + +mulAvxGFNI_2x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ 
$0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x5_loop + VZEROUPPER + +mulAvxGFNI_2x5_end: + RET + // func mulGFNI_2x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x5_64Xor(SB), $0-88 @@ -2380,6 +4550,106 @@ mulGFNI_2x5_64Xor_loop: mulGFNI_2x5_64Xor_end: RET +// func mulAvxGFNI_2x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x5Xor(SB), $0-88 + // Loading 9 of 10 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + +mulAvxGFNI_2x5Xor_loop: + // Load 5 outputs + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x5Xor_loop + VZEROUPPER + +mulAvxGFNI_2x5Xor_end: + RET + // func mulGFNI_2x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n 
int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x6_64(SB), $0-88 @@ -2477,6 +4747,103 @@ mulGFNI_2x6_64_loop: mulGFNI_2x6_64_end: RET +// func mulAvxGFNI_2x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x6(SB), $0-88 + // Loading 8 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + +mulAvxGFNI_2x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x6_loop + VZEROUPPER + +mulAvxGFNI_2x6_end: + RET + // func mulGFNI_2x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x6_64Xor(SB), $0-88 @@ -2588,6 +4955,117 @@ mulGFNI_2x6_64Xor_loop: mulGFNI_2x6_64Xor_end: RET +// func mulAvxGFNI_2x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x6Xor(SB), $0-88 + // Loading 8 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + 
ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + +mulAvxGFNI_2x6Xor_loop: + // Load 6 outputs + VMOVDQU (DI), Y8 + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x6Xor_loop + VZEROUPPER + +mulAvxGFNI_2x6Xor_end: + RET + // func mulGFNI_2x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x7_64(SB), $0-88 @@ -2694,6 +5172,112 @@ mulGFNI_2x7_64_loop: mulGFNI_2x7_64_end: RET +// func mulAvxGFNI_2x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x7(SB), $0-88 + // Loading 7 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + +mulAvxGFNI_2x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, 
Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x7_loop + VZEROUPPER + +mulAvxGFNI_2x7_end: + RET + // func mulGFNI_2x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x7_64Xor(SB), $0-88 @@ -2816,6 +5400,128 @@ mulGFNI_2x7_64Xor_loop: mulGFNI_2x7_64Xor_end: RET +// func mulAvxGFNI_2x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x7Xor(SB), $0-88 + // Loading 7 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + +mulAvxGFNI_2x7Xor_loop: + // Load 7 outputs + VMOVDQU (DI), Y7 + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, 
(DI) + ADDQ $0x20, DI + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x7Xor_loop + VZEROUPPER + +mulAvxGFNI_2x7Xor_end: + RET + // func mulGFNI_2x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x8_64(SB), $0-88 @@ -2931,6 +5637,121 @@ mulGFNI_2x8_64_loop: mulGFNI_2x8_64_end: RET +// func mulAvxGFNI_2x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x8(SB), $0-88 + // Loading 6 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + +mulAvxGFNI_2x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x8_loop + VZEROUPPER + +mulAvxGFNI_2x8_end: + RET + // func mulGFNI_2x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x8_64Xor(SB), $0-88 @@ -3064,6 +5885,139 @@ mulGFNI_2x8_64Xor_loop: mulGFNI_2x8_64Xor_end: RET 
+// func mulAvxGFNI_2x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x8Xor(SB), $0-88 + // Loading 6 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + +mulAvxGFNI_2x8Xor_loop: + // Load 8 outputs + VMOVDQU (DI), Y6 + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x8Xor_loop + VZEROUPPER + +mulAvxGFNI_2x8Xor_end: + RET + // func mulGFNI_2x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x9_64(SB), $0-88 @@ -3188,6 +6142,130 @@ mulGFNI_2x9_64_loop: mulGFNI_2x9_64_end: RET +// func mulAvxGFNI_2x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x9(SB), $0-88 + // Loading 5 of 18 tables to registers + // Destination kept in GP registers 
+ // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + +mulAvxGFNI_2x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x9_loop + VZEROUPPER + +mulAvxGFNI_2x9_end: + RET + // func mulGFNI_2x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x9_64Xor(SB), $0-88 @@ -3332,6 +6410,150 @@ mulGFNI_2x9_64Xor_loop: mulGFNI_2x9_64Xor_end: RET +// func mulAvxGFNI_2x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x9Xor(SB), $0-88 + // Loading 5 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ 
out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + +mulAvxGFNI_2x9Xor_loop: + // Load 9 outputs + VMOVDQU (DI), Y5 + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x9Xor_loop + VZEROUPPER + +mulAvxGFNI_2x9Xor_end: + RET + // func mulGFNI_2x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x10_64(SB), $0-88 @@ -3465,6 +6687,139 @@ mulGFNI_2x10_64_loop: mulGFNI_2x10_64_end: RET +// func mulAvxGFNI_2x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x10(SB), $8-88 + // Loading 4 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + 
MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + +mulAvxGFNI_2x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x10_loop + VZEROUPPER + +mulAvxGFNI_2x10_end: + RET + // func mulGFNI_2x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_2x10_64Xor(SB), $0-88 @@ -3620,6 +6975,161 @@ mulGFNI_2x10_64Xor_loop: mulGFNI_2x10_64Xor_end: RET +// func mulAvxGFNI_2x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_2x10Xor(SB), $8-88 + // Loading 4 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_2x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 
72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + +mulAvxGFNI_2x10Xor_loop: + // Load 10 outputs + VMOVDQU (DI), Y4 + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_2x10Xor_loop + VZEROUPPER + +mulAvxGFNI_2x10Xor_end: + RET + // func mulGFNI_3x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x1_64(SB), $0-88 @@ -3681,6 +7191,67 @@ mulGFNI_3x1_64_loop: mulGFNI_3x1_64_end: RET +// func mulAvxGFNI_3x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + 
JZ mulAvxGFNI_3x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulAvxGFNI_3x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y3 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Store 1 outputs + VMOVDQU Y3, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x1_loop + VZEROUPPER + +mulAvxGFNI_3x1_end: + RET + // func mulGFNI_3x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x1_64Xor(SB), $0-88 @@ -3746,6 +7317,71 @@ mulGFNI_3x1_64Xor_loop: mulGFNI_3x1_64Xor_end: RET +// func mulAvxGFNI_3x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + +mulAvxGFNI_3x1Xor_loop: + // Load 1 outputs + VMOVDQU (SI), Y3 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y2, Y4, Y4 + VXORPD Y3, Y4, Y3 + + // Store 1 outputs + VMOVDQU Y3, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x1Xor_loop + VZEROUPPER + +mulAvxGFNI_3x1Xor_end: + RET + // func mulGFNI_3x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x2_64(SB), $0-88 @@ -3819,6 +7455,79 @@ mulGFNI_3x2_64_loop: mulGFNI_3x2_64_end: RET +// func mulAvxGFNI_3x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ 
out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulAvxGFNI_3x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y7 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 2 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x2_loop + VZEROUPPER + +mulAvxGFNI_3x2_end: + RET + // func mulGFNI_3x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x2_64Xor(SB), $0-88 @@ -3898,6 +7607,85 @@ mulGFNI_3x2_64Xor_loop: mulGFNI_3x2_64Xor_end: RET +// func mulAvxGFNI_3x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, CX + +mulAvxGFNI_3x2Xor_loop: + // Load 2 outputs + VMOVDQU (DI), Y6 + VMOVDQU (SI), Y7 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y8, Y9 + VXORPD Y6, Y9, Y6 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y9 + VXORPD Y7, Y9, Y7 + + // Store 2 outputs + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x2Xor_loop + VZEROUPPER + +mulAvxGFNI_3x2Xor_end: + RET + // func mulGFNI_3x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x3_64(SB), $0-88 @@ -3983,6 +7771,91 @@ mulGFNI_3x3_64_loop: mulGFNI_3x3_64_end: RET +// func mulAvxGFNI_3x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x3(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX 
+ SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulAvxGFNI_3x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y11 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 3 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x3_loop + VZEROUPPER + +mulAvxGFNI_3x3_end: + RET + // func mulGFNI_3x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x3_64Xor(SB), $0-88 @@ -4076,6 +7949,99 @@ mulGFNI_3x3_64Xor_loop: mulGFNI_3x3_64Xor_end: RET +// func mulAvxGFNI_3x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x3Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, CX + +mulAvxGFNI_3x3Xor_loop: + // Load 3 outputs + VMOVDQU (DI), Y9 + VMOVDQU (R8), Y10 + VMOVDQU (SI), Y11 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes 
from input 2 to 3 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y9, Y13, Y9 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 3 outputs + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x3Xor_loop + VZEROUPPER + +mulAvxGFNI_3x3Xor_end: + RET + // func mulGFNI_3x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x4_64(SB), $0-88 @@ -4173,6 +8139,103 @@ mulGFNI_3x4_64_loop: mulGFNI_3x4_64_end: RET +// func mulAvxGFNI_3x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x4(SB), $0-88 + // Loading 10 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + +mulAvxGFNI_3x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x4_loop + VZEROUPPER + +mulAvxGFNI_3x4_end: + RET + // func mulGFNI_3x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x4_64Xor(SB), $0-88 @@ -4280,6 +8343,113 @@ mulGFNI_3x4_64Xor_loop: mulGFNI_3x4_64Xor_end: RET +// func mulAvxGFNI_3x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x4Xor(SB), $0-88 + // Loading 10 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + 
MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + +mulAvxGFNI_3x4Xor_loop: + // Load 4 outputs + VMOVDQU (R8), Y10 + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R8) + ADDQ $0x20, R8 + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x4Xor_loop + VZEROUPPER + +mulAvxGFNI_3x4Xor_end: + RET + // func mulGFNI_3x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x5_64(SB), $0-88 @@ -4389,6 +8559,115 @@ mulGFNI_3x5_64_loop: mulGFNI_3x5_64_end: RET +// func mulAvxGFNI_3x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x5(SB), $0-88 + // Loading 9 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + +mulAvxGFNI_3x5_loop: + // Load and process 32 bytes from input 
0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x5_loop + VZEROUPPER + +mulAvxGFNI_3x5_end: + RET + // func mulGFNI_3x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x5_64Xor(SB), $0-88 @@ -4510,6 +8789,127 @@ mulGFNI_3x5_64Xor_loop: mulGFNI_3x5_64Xor_end: RET +// func mulAvxGFNI_3x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x5Xor(SB), $0-88 + // Loading 9 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + +mulAvxGFNI_3x5Xor_loop: + // Load 5 outputs + VMOVDQU (R8), Y9 + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 
+ VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R8) + ADDQ $0x20, R8 + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x5Xor_loop + VZEROUPPER + +mulAvxGFNI_3x5Xor_end: + RET + // func mulGFNI_3x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x6_64(SB), $0-88 @@ -4631,6 +9031,127 @@ mulGFNI_3x6_64_loop: mulGFNI_3x6_64_end: RET +// func mulAvxGFNI_3x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x6(SB), $0-88 + // Loading 8 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + +mulAvxGFNI_3x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x6_loop + VZEROUPPER + +mulAvxGFNI_3x6_end: + RET + // func mulGFNI_3x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x6_64Xor(SB), $0-88 @@ -4766,6 +9287,141 @@ mulGFNI_3x6_64Xor_loop: mulGFNI_3x6_64Xor_end: RET +// func mulAvxGFNI_3x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x6Xor(SB), $0-88 + // Loading 8 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + +mulAvxGFNI_3x6Xor_loop: + // Load 6 outputs + VMOVDQU (R8), Y8 + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x6Xor_loop + VZEROUPPER + +mulAvxGFNI_3x6Xor_end: + RET + // func mulGFNI_3x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x7_64(SB), $0-88 @@ -4899,6 +9555,139 @@ mulGFNI_3x7_64_loop: mulGFNI_3x7_64_end: RET +// func mulAvxGFNI_3x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x7(SB), $0-88 + // Loading 7 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + +mulAvxGFNI_3x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x7_loop + VZEROUPPER + +mulAvxGFNI_3x7_end: + RET + // func mulGFNI_3x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x7_64Xor(SB), $0-88 @@ -5048,6 +9837,155 @@ mulGFNI_3x7_64Xor_loop: mulGFNI_3x7_64Xor_end: RET +// func mulAvxGFNI_3x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x7Xor(SB), $0-88 + // Loading 7 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + +mulAvxGFNI_3x7Xor_loop: + // Load 7 outputs + VMOVDQU (R8), Y7 + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x7Xor_loop + VZEROUPPER + +mulAvxGFNI_3x7Xor_end: + RET + // func mulGFNI_3x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x8_64(SB), $0-88 @@ -5191,6 +10129,151 @@ mulGFNI_3x8_64_loop: mulGFNI_3x8_64_end: RET +// func mulAvxGFNI_3x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x8(SB), $0-88 + // Loading 6 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulAvxGFNI_3x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 
136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x8_loop + VZEROUPPER + +mulAvxGFNI_3x8_end: + RET + // func mulGFNI_3x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x8_64Xor(SB), $0-88 @@ -5352,6 +10435,169 @@ mulGFNI_3x8_64Xor_loop: mulGFNI_3x8_64Xor_end: RET +// func mulAvxGFNI_3x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x8Xor(SB), $0-88 + // Loading 6 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + +mulAvxGFNI_3x8Xor_loop: + // Load 8 outputs + VMOVDQU (R8), Y6 + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 
88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x8Xor_loop + VZEROUPPER + +mulAvxGFNI_3x8Xor_end: + RET + // func mulGFNI_3x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x9_64(SB), $8-88 @@ -5503,6 +10749,163 @@ mulGFNI_3x9_64_loop: mulGFNI_3x9_64_end: RET +// func mulAvxGFNI_3x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x9(SB), $8-88 + // Loading 5 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulAvxGFNI_3x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from 
input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x9_loop + VZEROUPPER + +mulAvxGFNI_3x9_end: + RET + // func mulGFNI_3x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x9_64Xor(SB), $8-88 @@ -5674,6 +11077,183 @@ mulGFNI_3x9_64Xor_loop: mulGFNI_3x9_64Xor_end: RET +// func mulAvxGFNI_3x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x9Xor(SB), $8-88 + // Loading 5 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + +mulAvxGFNI_3x9Xor_loop: + // Load 
9 outputs + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_3x9Xor_loop + VZEROUPPER + +mulAvxGFNI_3x9Xor_end: + RET + // func mulGFNI_3x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x10_64(SB), $8-88 @@ -5837,6 +11417,179 @@ mulGFNI_3x10_64_loop: mulGFNI_3x10_64_end: RET +// func mulAvxGFNI_3x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x10(SB), $8-88 
+ // Loading 4 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_3x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_3x10_loop + VZEROUPPER + +mulAvxGFNI_3x10_end: + RET + // func mulGFNI_3x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_3x10_64Xor(SB), $8-88 @@ -6022,6 +11775,201 @@ mulGFNI_3x10_64Xor_loop: mulGFNI_3x10_64Xor_end: RET +// func mulAvxGFNI_3x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_3x10Xor(SB), $8-88 + // Loading 4 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_3x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_3x10Xor_loop: + // Load 10 outputs + VMOVDQU (DI), Y4 + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (SI), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_3x10Xor_loop + VZEROUPPER + +mulAvxGFNI_3x10Xor_end: + RET + // func mulGFNI_4x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x1_64(SB), $0-88 @@ -6092,6 +12040,76 @@ mulGFNI_4x1_64_loop: mulGFNI_4x1_64_end: RET +// func mulAvxGFNI_4x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulAvxGFNI_4x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y4 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX), Y5 + ADDQ 
$0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Store 1 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x1_loop + VZEROUPPER + +mulAvxGFNI_4x1_end: + RET + // func mulGFNI_4x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x1_64Xor(SB), $0-88 @@ -6166,6 +12184,80 @@ mulGFNI_4x1_64Xor_loop: mulGFNI_4x1_64Xor_end: RET +// func mulAvxGFNI_4x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 7 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + +mulAvxGFNI_4x1Xor_loop: + // Load 1 outputs + VMOVDQU (DI), Y4 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX), Y5 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y3, Y5, Y5 + VXORPD Y4, Y5, Y4 + + // Store 1 outputs + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x1Xor_loop + VZEROUPPER + +mulAvxGFNI_4x1Xor_end: + RET + // func mulGFNI_4x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x2_64(SB), $0-88 @@ -6251,6 +12343,91 @@ mulGFNI_4x2_64_loop: mulGFNI_4x2_64_end: RET +// func mulAvxGFNI_4x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulAvxGFNI_4x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y9 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, 
BX + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 2 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x2_loop + VZEROUPPER + +mulAvxGFNI_4x2_end: + RET + // func mulGFNI_4x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x2_64Xor(SB), $0-88 @@ -6342,6 +12519,97 @@ mulGFNI_4x2_64Xor_loop: mulGFNI_4x2_64Xor_end: RET +// func mulAvxGFNI_4x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, CX + +mulAvxGFNI_4x2Xor_loop: + // Load 2 outputs + VMOVDQU (R8), Y8 + VMOVDQU (DI), Y9 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y10, Y11 + VXORPD Y8, Y11, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y11 + VXORPD Y9, Y11, Y9 + + // Store 2 outputs + VMOVDQU Y8, (R8) + ADDQ $0x20, R8 + VMOVDQU Y9, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x2Xor_loop + VZEROUPPER + +mulAvxGFNI_4x2Xor_end: + RET + // func mulGFNI_4x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x3_64(SB), $0-88 @@ -6442,6 +12710,106 @@ mulGFNI_4x3_64_loop: mulGFNI_4x3_64_end: RET +// func mulAvxGFNI_4x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x3(SB), $0-88 + // Loading 11 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 
YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + +mulAvxGFNI_4x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x3_loop + VZEROUPPER + +mulAvxGFNI_4x3_end: + RET + // func mulGFNI_4x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x3_64Xor(SB), $0-88 @@ -6550,6 +12918,114 @@ mulGFNI_4x3_64Xor_loop: mulGFNI_4x3_64Xor_end: RET +// func mulAvxGFNI_4x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x3Xor(SB), $0-88 + // Loading 11 of 12 tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + +mulAvxGFNI_4x3Xor_loop: + // Load 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU (R10), Y12 + VMOVDQU (R8), Y13 + + // Load 
and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R9) + ADDQ $0x20, R9 + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x3Xor_loop + VZEROUPPER + +mulAvxGFNI_4x3Xor_end: + RET + // func mulGFNI_4x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x4_64(SB), $0-88 @@ -6665,6 +13141,121 @@ mulGFNI_4x4_64_loop: mulGFNI_4x4_64_end: RET +// func mulAvxGFNI_4x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x4(SB), $0-88 + // Loading 10 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + +mulAvxGFNI_4x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x4_loop + VZEROUPPER + +mulAvxGFNI_4x4_end: + RET + // func mulGFNI_4x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x4_64Xor(SB), $0-88 @@ -6790,6 +13381,131 @@ mulGFNI_4x4_64Xor_loop: mulGFNI_4x4_64Xor_end: RET +// func mulAvxGFNI_4x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x4Xor(SB), $0-88 + // Loading 10 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + +mulAvxGFNI_4x4Xor_loop: + // Load 4 outputs + VMOVDQU (R9), Y10 + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 
104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x4Xor_loop + VZEROUPPER + +mulAvxGFNI_4x4Xor_end: + RET + // func mulGFNI_4x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x5_64(SB), $0-88 @@ -6920,6 +13636,136 @@ mulGFNI_4x5_64_loop: mulGFNI_4x5_64_end: RET +// func mulAvxGFNI_4x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x5(SB), $0-88 + // Loading 9 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + +mulAvxGFNI_4x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x5_loop + VZEROUPPER + +mulAvxGFNI_4x5_end: + RET + // func mulGFNI_4x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x5_64Xor(SB), $0-88 @@ -7062,6 +13908,148 @@ mulGFNI_4x5_64Xor_loop: mulGFNI_4x5_64Xor_end: RET +// func mulAvxGFNI_4x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x5Xor(SB), $0-88 + // Loading 9 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + +mulAvxGFNI_4x5Xor_loop: + // Load 5 outputs + VMOVDQU (R9), Y9 + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R9) + ADDQ $0x20, R9 + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x5Xor_loop + VZEROUPPER + +mulAvxGFNI_4x5Xor_end: + RET + // func mulGFNI_4x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x6_64(SB), $0-88 @@ -7207,6 +14195,151 @@ mulGFNI_4x6_64_loop: mulGFNI_4x6_64_end: RET +// func mulAvxGFNI_4x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x6(SB), $0-88 + // Loading 8 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + +mulAvxGFNI_4x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, 
Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x6_loop + VZEROUPPER + +mulAvxGFNI_4x6_end: + RET + // func mulGFNI_4x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x6_64Xor(SB), $0-88 @@ -7366,6 +14499,165 @@ mulGFNI_4x6_64Xor_loop: mulGFNI_4x6_64Xor_end: RET +// func mulAvxGFNI_4x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x6Xor(SB), $0-88 + // Loading 8 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + +mulAvxGFNI_4x6Xor_loop: + // Load 6 outputs + VMOVDQU (R9), Y8 + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R9) + ADDQ $0x20, R9 + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x6Xor_loop + VZEROUPPER + +mulAvxGFNI_4x6Xor_end: + RET + // func mulGFNI_4x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x7_64(SB), $0-88 @@ -7521,6 +14813,166 @@ mulGFNI_4x7_64_loop: mulGFNI_4x7_64_end: RET +// func mulAvxGFNI_4x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x7(SB), $0-88 + // Loading 7 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulAvxGFNI_4x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x7_loop + VZEROUPPER + +mulAvxGFNI_4x7_end: + RET + // func mulGFNI_4x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x7_64Xor(SB), $0-88 @@ -7692,6 +15144,182 @@ mulGFNI_4x7_64Xor_loop: mulGFNI_4x7_64Xor_end: RET +// func mulAvxGFNI_4x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x7Xor(SB), $0-88 + // Loading 7 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + +mulAvxGFNI_4x7Xor_loop: + // Load 7 outputs + VMOVDQU (R9), Y7 + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + 
VMOVDQU (R14), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R9) + ADDQ $0x20, R9 + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x7Xor_loop + VZEROUPPER + +mulAvxGFNI_4x7Xor_end: + RET + // func mulGFNI_4x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x8_64(SB), $8-88 @@ -7857,6 +15485,181 @@ mulGFNI_4x8_64_loop: mulGFNI_4x8_64_end: RET +// func mulAvxGFNI_4x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x8(SB), $8-88 + // Loading 6 of 32 tables to registers + // Destination kept in GP registers + // Full 
registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulAvxGFNI_4x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 
240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x8_loop + VZEROUPPER + +mulAvxGFNI_4x8_end: + RET + // func mulGFNI_4x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x8_64Xor(SB), $8-88 @@ -8040,6 +15843,199 @@ mulGFNI_4x8_64Xor_loop: mulGFNI_4x8_64Xor_end: RET +// func mulAvxGFNI_4x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x8Xor(SB), $8-88 + // Loading 6 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + +mulAvxGFNI_4x8Xor_loop: + // Load 8 outputs + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_4x8Xor_loop + VZEROUPPER + +mulAvxGFNI_4x8Xor_end: + RET + // func mulGFNI_4x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x9_64(SB), $8-88 @@ -8219,6 +16215,200 @@ mulGFNI_4x9_64_loop: mulGFNI_4x9_64_end: RET +// func mulAvxGFNI_4x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x9(SB), $8-88 + // Loading 5 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_4x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), 
Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_4x9_loop + 
VZEROUPPER + +mulAvxGFNI_4x9_end: + RET + // func mulGFNI_4x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x9_64Xor(SB), $8-88 @@ -8418,6 +16608,220 @@ mulGFNI_4x9_64Xor_loop: mulGFNI_4x9_64Xor_end: RET +// func mulAvxGFNI_4x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x9Xor(SB), $8-88 + // Loading 5 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_4x9Xor_loop: + // Load 9 outputs + VMOVDQU (R8), Y5 + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (DI), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_4x9Xor_loop + VZEROUPPER + +mulAvxGFNI_4x9Xor_end: + RET + // func mulGFNI_4x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_4x10_64(SB), $0-88 @@ -8576,51 +16980,415 @@ mulGFNI_4x10_64_loop: // Prepare for next loop ADDQ $0x40, R9 DECQ AX - JNZ mulGFNI_4x10_64_loop + JNZ mulGFNI_4x10_64_loop + VZEROUPPER + +mulGFNI_4x10_64_end: + RET + +// func mulAvxGFNI_4x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x10(SB), $0-88 + // Loading 4 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulAvxGFNI_4x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + 
VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y10, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y11, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y12, (R10)(R9*1) + 
MOVQ 216(R8), R10 + VMOVDQU Y13, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxGFNI_4x10_loop + VZEROUPPER + +mulAvxGFNI_4x10_end: + RET + +// func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_4x10_64Xor(SB), $0-88 + // Loading 20 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_4x10_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + +mulGFNI_4x10_64Xor_loop: + // Load 10 outputs + MOVQ (R8), R10 + VMOVDQU64 (R10)(R9*1), Z20 + MOVQ 24(R8), R10 + VMOVDQU64 (R10)(R9*1), Z21 + MOVQ 48(R8), R10 + VMOVDQU64 (R10)(R9*1), Z22 + MOVQ 72(R8), R10 + VMOVDQU64 (R10)(R9*1), Z23 + MOVQ 96(R8), R10 + VMOVDQU64 (R10)(R9*1), Z24 + MOVQ 120(R8), R10 + VMOVDQU64 (R10)(R9*1), Z25 + MOVQ 144(R8), R10 + VMOVDQU64 (R10)(R9*1), Z26 + MOVQ 168(R8), R10 + VMOVDQU64 (R10)(R9*1), Z27 + MOVQ 192(R8), R10 + VMOVDQU64 (R10)(R9*1), Z28 + MOVQ 216(R8), R10 + VMOVDQU64 (R10)(R9*1), Z29 + + // Load and process 64 bytes from input 0 to 10 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 1 to 10 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 10 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST 
$0x00, 168(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 10 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z20, Z31, Z20 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z21, Z31, Z21 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z22, Z31, Z22 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z23, Z31, Z23 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU64 Z20, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU64 Z21, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU64 Z22, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU64 Z23, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU64 Z24, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU64 Z25, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU64 Z26, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU64 Z27, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU64 Z28, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU64 Z29, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x40, R9 + DECQ AX + JNZ mulGFNI_4x10_64Xor_loop VZEROUPPER -mulGFNI_4x10_64_end: +mulGFNI_4x10_64Xor_end: RET -// func mulGFNI_4x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX512DQ, AVX512F, GFNI -TEXT ·mulGFNI_4x10_64Xor(SB), $0-88 - // Loading 20 of 40 tables to registers +// func mulAvxGFNI_4x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_4x10Xor(SB), $0-88 + // Loading 4 of 40 tables to registers // Destination kept on stack // Full registers estimated 52 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulGFNI_4x10_64Xor_end - VBROADCASTF32X2 (CX), Z0 - VBROADCASTF32X2 8(CX), Z1 - VBROADCASTF32X2 16(CX), Z2 - VBROADCASTF32X2 24(CX), Z3 - VBROADCASTF32X2 32(CX), Z4 - VBROADCASTF32X2 40(CX), Z5 - VBROADCASTF32X2 48(CX), Z6 - VBROADCASTF32X2 56(CX), Z7 - VBROADCASTF32X2 64(CX), Z8 - VBROADCASTF32X2 72(CX), Z9 - VBROADCASTF32X2 80(CX), Z10 - VBROADCASTF32X2 88(CX), Z11 - VBROADCASTF32X2 96(CX), Z12 - VBROADCASTF32X2 104(CX), Z13 - VBROADCASTF32X2 112(CX), Z14 - VBROADCASTF32X2 120(CX), Z15 - VBROADCASTF32X2 128(CX), Z16 - VBROADCASTF32X2 136(CX), Z17 - VBROADCASTF32X2 144(CX), Z18 - VBROADCASTF32X2 152(CX), Z19 - MOVQ in_base+24(FP), DX - MOVQ (DX), BX - MOVQ 24(DX), SI - MOVQ 48(DX), DI - MOVQ 72(DX), DX - MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 - MOVQ start+72(FP), R9 + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_4x10Xor_end + VBROADCASTSD (CX), Y0 + 
VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 // Add start offset to input ADDQ R9, BX @@ -8628,154 +17396,190 @@ TEXT ·mulGFNI_4x10_64Xor(SB), $0-88 ADDQ R9, DI ADDQ R9, DX -mulGFNI_4x10_64Xor_loop: +mulAvxGFNI_4x10Xor_loop: // Load 10 outputs - MOVQ (R8), R10 - VMOVDQU64 (R10)(R9*1), Z20 - MOVQ 24(R8), R10 - VMOVDQU64 (R10)(R9*1), Z21 - MOVQ 48(R8), R10 - VMOVDQU64 (R10)(R9*1), Z22 - MOVQ 72(R8), R10 - VMOVDQU64 (R10)(R9*1), Z23 - MOVQ 96(R8), R10 - VMOVDQU64 (R10)(R9*1), Z24 - MOVQ 120(R8), R10 - VMOVDQU64 (R10)(R9*1), Z25 - MOVQ 144(R8), R10 - VMOVDQU64 (R10)(R9*1), Z26 - MOVQ 168(R8), R10 - VMOVDQU64 (R10)(R9*1), Z27 - MOVQ 192(R8), R10 - VMOVDQU64 (R10)(R9*1), Z28 - MOVQ 216(R8), R10 - VMOVDQU64 (R10)(R9*1), Z29 - - // Load and process 64 bytes from input 0 to 10 outputs - VMOVDQU64 (BX), Z30 - ADDQ $0x40, BX - VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 - VXORPD Z20, Z31, Z20 - VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 - VXORPD Z21, Z31, Z21 - VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 - VXORPD Z22, Z31, Z22 - VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 - VXORPD Z23, Z31, Z23 - VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 1 to 10 outputs - VMOVDQU64 (SI), Z30 - ADDQ $0x40, SI - VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 - VXORPD Z20, Z31, Z20 - VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 - VXORPD Z21, Z31, Z21 - VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 - VXORPD Z22, Z31, Z22 - VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 - VXORPD Z23, Z31, Z23 - VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 2 to 10 outputs - VMOVDQU64 (DI), Z30 - ADDQ $0x40, DI - VGF2P8AFFINEQB.BCST $0x00, 160(CX), Z30, Z31 - VXORPD Z20, Z31, Z20 - VGF2P8AFFINEQB.BCST $0x00, 168(CX), Z30, Z31 - VXORPD Z21, Z31, Z21 - VGF2P8AFFINEQB.BCST $0x00, 176(CX), Z30, Z31 - VXORPD Z22, Z31, Z22 - VGF2P8AFFINEQB.BCST $0x00, 184(CX), Z30, Z31 - VXORPD Z23, Z31, Z23 - VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 3 to 10 outputs - VMOVDQU64 (DX), Z30 - ADDQ $0x40, DX - VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 - VXORPD Z20, Z31, Z20 - VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 - VXORPD Z21, Z31, Z21 - VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 - VXORPD Z22, Z31, Z22 - VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 - VXORPD Z23, Z31, Z23 - VGF2P8AFFINEQB.BCST $0x00, 272(CX), 
Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 - VXORPD Z29, Z31, Z29 + MOVQ (R8), R10 + VMOVDQU (R10)(R9*1), Y4 + MOVQ 24(R8), R10 + VMOVDQU (R10)(R9*1), Y5 + MOVQ 48(R8), R10 + VMOVDQU (R10)(R9*1), Y6 + MOVQ 72(R8), R10 + VMOVDQU (R10)(R9*1), Y7 + MOVQ 96(R8), R10 + VMOVDQU (R10)(R9*1), Y8 + MOVQ 120(R8), R10 + VMOVDQU (R10)(R9*1), Y9 + MOVQ 144(R8), R10 + VMOVDQU (R10)(R9*1), Y10 + MOVQ 168(R8), R10 + VMOVDQU (R10)(R9*1), Y11 + MOVQ 192(R8), R10 + VMOVDQU (R10)(R9*1), Y12 + MOVQ 216(R8), R10 + VMOVDQU (R10)(R9*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 10 outputs - MOVQ (R8), R10 - VMOVDQU64 Z20, (R10)(R9*1) - MOVQ 24(R8), R10 - VMOVDQU64 Z21, (R10)(R9*1) - MOVQ 48(R8), R10 - VMOVDQU64 Z22, (R10)(R9*1) - MOVQ 72(R8), R10 - VMOVDQU64 Z23, (R10)(R9*1) - MOVQ 96(R8), R10 - VMOVDQU64 Z24, (R10)(R9*1) - MOVQ 120(R8), R10 - VMOVDQU64 Z25, (R10)(R9*1) - MOVQ 144(R8), R10 - VMOVDQU64 Z26, (R10)(R9*1) - MOVQ 168(R8), R10 - VMOVDQU64 Z27, (R10)(R9*1) - MOVQ 192(R8), R10 - VMOVDQU64 Z28, (R10)(R9*1) - MOVQ 216(R8), R10 - VMOVDQU64 Z29, (R10)(R9*1) + MOVQ (R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y10, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y11, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y12, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y13, (R10)(R9*1) // Prepare for next loop - ADDQ $0x40, R9 + ADDQ $0x20, R9 DECQ AX - JNZ mulGFNI_4x10_64Xor_loop + JNZ mulAvxGFNI_4x10Xor_loop VZEROUPPER -mulGFNI_4x10_64Xor_end: +mulAvxGFNI_4x10Xor_end: RET // func mulGFNI_5x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) @@ -8857,6 +17661,85 @@ mulGFNI_5x1_64_loop: mulGFNI_5x1_64_end: RET +// func mulAvxGFNI_5x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulAvxGFNI_5x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y5 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes 
from input 2 to 1 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Store 1 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x1_loop + VZEROUPPER + +mulAvxGFNI_5x1_end: + RET + // func mulGFNI_5x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x1_64Xor(SB), $0-88 @@ -8940,6 +17823,89 @@ mulGFNI_5x1_64Xor_loop: mulGFNI_5x1_64Xor_end: RET +// func mulAvxGFNI_5x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + +mulAvxGFNI_5x1Xor_loop: + // Load 1 outputs + VMOVDQU (R8), Y5 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y4, Y6, Y6 + VXORPD Y5, Y6, Y5 + + // Store 1 outputs + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x1Xor_loop + VZEROUPPER + +mulAvxGFNI_5x1Xor_end: + RET + // func mulGFNI_5x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x2_64(SB), $0-88 @@ -9037,6 +18003,103 @@ mulGFNI_5x2_64_loop: mulGFNI_5x2_64_end: RET +// func mulAvxGFNI_5x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + 
MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulAvxGFNI_5x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y11 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 2 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x2_loop + VZEROUPPER + +mulAvxGFNI_5x2_end: + RET + // func mulGFNI_5x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x2_64Xor(SB), $0-88 @@ -9140,6 +18203,109 @@ mulGFNI_5x2_64Xor_loop: mulGFNI_5x2_64Xor_end: RET +// func mulAvxGFNI_5x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, CX + +mulAvxGFNI_5x2Xor_loop: + // Load 2 outputs + VMOVDQU (R9), Y10 + VMOVDQU (R8), Y11 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y12, 
Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y12, Y13 + VXORPD Y10, Y13, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y12, Y13 + VXORPD Y11, Y13, Y11 + + // Store 2 outputs + VMOVDQU Y10, (R9) + ADDQ $0x20, R9 + VMOVDQU Y11, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x2Xor_loop + VZEROUPPER + +mulAvxGFNI_5x2Xor_end: + RET + // func mulGFNI_5x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x3_64(SB), $0-88 @@ -9255,6 +18421,121 @@ mulGFNI_5x3_64_loop: mulGFNI_5x3_64_end: RET +// func mulAvxGFNI_5x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x3(SB), $0-88 + // Loading 11 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + +mulAvxGFNI_5x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R10) + ADDQ 
$0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x3_loop + VZEROUPPER + +mulAvxGFNI_5x3_end: + RET + // func mulGFNI_5x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x3_64Xor(SB), $0-88 @@ -9378,6 +18659,129 @@ mulGFNI_5x3_64Xor_loop: mulGFNI_5x3_64Xor_end: RET +// func mulAvxGFNI_5x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x3Xor(SB), $0-88 + // Loading 11 of 15 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + +mulAvxGFNI_5x3Xor_loop: + // Load 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU (R11), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R10) + ADDQ $0x20, R10 + VMOVDQU Y12, (R11) + ADDQ $0x20, R11 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x3Xor_loop + VZEROUPPER + +mulAvxGFNI_5x3Xor_end: + RET + // func mulGFNI_5x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x4_64(SB), $0-88 @@ 
-9511,6 +18915,139 @@ mulGFNI_5x4_64_loop: mulGFNI_5x4_64_end: RET +// func mulAvxGFNI_5x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x4(SB), $0-88 + // Loading 10 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + +mulAvxGFNI_5x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x4_loop + VZEROUPPER + +mulAvxGFNI_5x4_end: + RET + // func mulGFNI_5x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x4_64Xor(SB), $0-88 @@ -9654,6 +19191,149 @@ 
mulGFNI_5x4_64Xor_loop: mulGFNI_5x4_64Xor_end: RET +// func mulAvxGFNI_5x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x4Xor(SB), $0-88 + // Loading 10 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, DX + +mulAvxGFNI_5x4Xor_loop: + // Load 4 outputs + VMOVDQU (R10), Y10 + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R10) + ADDQ $0x20, R10 + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x4Xor_loop + VZEROUPPER + +mulAvxGFNI_5x4Xor_end: + RET + // func 
mulGFNI_5x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x5_64(SB), $0-88 @@ -9805,6 +19485,157 @@ mulGFNI_5x5_64_loop: mulGFNI_5x5_64_end: RET +// func mulAvxGFNI_5x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x5(SB), $0-88 + // Loading 9 of 25 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + +mulAvxGFNI_5x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x5_loop + VZEROUPPER + +mulAvxGFNI_5x5_end: + RET + // func mulGFNI_5x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x5_64Xor(SB), $0-88 @@ -9968,6 +19799,169 @@ mulGFNI_5x5_64Xor_loop: mulGFNI_5x5_64Xor_end: RET +// func mulAvxGFNI_5x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x5Xor(SB), $0-88 + // Loading 9 of 25 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + +mulAvxGFNI_5x5Xor_loop: + // Load 5 outputs + VMOVDQU (R10), Y9 + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, 
Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R10) + ADDQ $0x20, R10 + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x5Xor_loop + VZEROUPPER + +mulAvxGFNI_5x5Xor_end: + RET + // func mulGFNI_5x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x6_64(SB), $0-88 @@ -10131,6 +20125,175 @@ mulGFNI_5x6_64_loop: mulGFNI_5x6_64_end: RET +// func mulAvxGFNI_5x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x6(SB), $0-88 + // Loading 8 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulAvxGFNI_5x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x6_loop + VZEROUPPER + +mulAvxGFNI_5x6_end: + RET + // func mulGFNI_5x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x6_64Xor(SB), $0-88 @@ -10308,6 +20471,189 @@ mulGFNI_5x6_64Xor_loop: mulGFNI_5x6_64Xor_end: RET +// func mulAvxGFNI_5x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x6Xor(SB), $0-88 + // Loading 8 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + +mulAvxGFNI_5x6Xor_loop: + // Load 6 outputs + VMOVDQU (R10), Y8 + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + 
VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R10) + ADDQ $0x20, R10 + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x6Xor_loop + VZEROUPPER + +mulAvxGFNI_5x6Xor_end: + RET + // func mulGFNI_5x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x7_64(SB), $8-88 @@ -10483,6 +20829,193 @@ mulGFNI_5x7_64_loop: mulGFNI_5x7_64_end: RET +// 
func mulAvxGFNI_5x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x7(SB), $8-88 + // Loading 7 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulAvxGFNI_5x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x7_loop + VZEROUPPER + +mulAvxGFNI_5x7_end: + RET + // func mulGFNI_5x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x7_64Xor(SB), $8-88 @@ -10674,6 +21207,209 @@ mulGFNI_5x7_64Xor_loop: mulGFNI_5x7_64Xor_end: RET +// func mulAvxGFNI_5x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x7Xor(SB), $8-88 + // Loading 7 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + +mulAvxGFNI_5x7Xor_loop: + // Load 7 outputs + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_5x7Xor_loop + VZEROUPPER + +mulAvxGFNI_5x7Xor_end: + RET + // func mulGFNI_5x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x8_64(SB), $8-88 @@ -10865,6 +21601,215 @@ mulGFNI_5x8_64_loop: mulGFNI_5x8_64_end: RET +// func mulAvxGFNI_5x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x8(SB), $8-88 + // Loading 6 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x8_end + VBROADCASTSD 
(CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_5x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_5x8_loop + VZEROUPPER + +mulAvxGFNI_5x8_end: + RET + // func mulGFNI_5x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x8_64Xor(SB), $8-88 @@ -11074,6 +22019,233 @@ mulGFNI_5x8_64Xor_loop: mulGFNI_5x8_64Xor_end: RET +// func mulAvxGFNI_5x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x8Xor(SB), $8-88 + // Loading 6 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_5x8Xor_loop: + // Load 8 outputs + VMOVDQU (R9), Y6 + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R8), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + 
VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + 
ADDQ $0x20, R15 + VMOVDQU Y13, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_5x8Xor_loop + VZEROUPPER + +mulAvxGFNI_5x8Xor_end: + RET + // func mulGFNI_5x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x9_64(SB), $0-88 @@ -11254,6 +22426,210 @@ mulGFNI_5x9_64_loop: mulGFNI_5x9_64_end: RET +// func mulAvxGFNI_5x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x9(SB), $0-88 + // Loading 5 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from 
input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x9_loop + VZEROUPPER + +mulAvxGFNI_5x9_end: + RET + // func mulGFNI_5x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x9_64Xor(SB), $0-88 @@ -11463,6 +22839,239 @@ mulGFNI_5x9_64Xor_loop: mulGFNI_5x9_64Xor_end: RET +// func mulAvxGFNI_5x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x9Xor(SB), $0-88 + // Loading 5 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x9Xor_loop: + // Load 9 outputs + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y5 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + MOVQ 72(R9), R11 + 
VMOVDQU (R11)(R10*1), Y8 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y10 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y11 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y12 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x9Xor_loop + VZEROUPPER + +mulAvxGFNI_5x9Xor_end: + RET + // func mulGFNI_5x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x10_64(SB), $0-88 @@ -11653,6 +23262,226 @@ mulGFNI_5x10_64_loop: mulGFNI_5x10_64_end: RET +// func mulAvxGFNI_5x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x10(SB), $0-88 + // Loading 4 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, 
Y15, Y13 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x10_loop + VZEROUPPER + +mulAvxGFNI_5x10_end: + RET + // func mulGFNI_5x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_5x10_64Xor(SB), $0-88 @@ -11875,6 +23704,258 @@ mulGFNI_5x10_64Xor_loop: mulGFNI_5x10_64Xor_end: RET +// func mulAvxGFNI_5x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_5x10Xor(SB), $0-88 + // Loading 4 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_5x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + +mulAvxGFNI_5x10Xor_loop: + // Load 10 outputs + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y4 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y5 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y10 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y11 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y12 + MOVQ 216(R9), R11 + VMOVDQU (R11)(R10*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + 
VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU 
Y7, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y10, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y11, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y12, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y13, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxGFNI_5x10Xor_loop + VZEROUPPER + +mulAvxGFNI_5x10Xor_end: + RET + // func mulGFNI_6x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x1_64(SB), $0-88 @@ -11963,6 +24044,94 @@ mulGFNI_6x1_64_loop: mulGFNI_6x1_64_end: RET +// func mulAvxGFNI_6x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulAvxGFNI_6x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y6 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Store 1 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x1_loop + VZEROUPPER + +mulAvxGFNI_6x1_end: + RET + // func mulGFNI_6x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x1_64Xor(SB), $0-88 @@ -12055,6 +24224,98 @@ mulGFNI_6x1_64Xor_loop: mulGFNI_6x1_64Xor_end: RET +// func mulAvxGFNI_6x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 9 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + 
MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + +mulAvxGFNI_6x1Xor_loop: + // Load 1 outputs + VMOVDQU (R9), Y6 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX), Y7 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y5, Y7, Y7 + VXORPD Y6, Y7, Y6 + + // Store 1 outputs + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x1Xor_loop + VZEROUPPER + +mulAvxGFNI_6x1Xor_end: + RET + // func mulGFNI_6x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x2_64(SB), $0-88 @@ -12164,6 +24425,115 @@ mulGFNI_6x2_64_loop: mulGFNI_6x2_64_end: RET +// func mulAvxGFNI_6x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x2(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulAvxGFNI_6x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, 
Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x2_loop + VZEROUPPER + +mulAvxGFNI_6x2_end: + RET + // func mulGFNI_6x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x2_64Xor(SB), $0-88 @@ -12279,6 +24649,121 @@ mulGFNI_6x2_64Xor_loop: mulGFNI_6x2_64Xor_end: RET +// func mulAvxGFNI_6x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x2Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, CX + +mulAvxGFNI_6x2Xor_loop: + // Load 2 outputs + VMOVDQU (R10), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R10) + ADDQ $0x20, R10 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + 
DECQ AX + JNZ mulAvxGFNI_6x2Xor_loop + VZEROUPPER + +mulAvxGFNI_6x2Xor_end: + RET + // func mulGFNI_6x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x3_64(SB), $0-88 @@ -12409,6 +24894,136 @@ mulGFNI_6x3_64_loop: mulGFNI_6x3_64_end: RET +// func mulAvxGFNI_6x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x3(SB), $0-88 + // Loading 11 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + +mulAvxGFNI_6x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x3_loop + VZEROUPPER + +mulAvxGFNI_6x3_end: + RET + // func mulGFNI_6x3_64Xor(matrix []uint64, in 
[][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x3_64Xor(SB), $0-88 @@ -12547,6 +25162,144 @@ mulGFNI_6x3_64Xor_loop: mulGFNI_6x3_64Xor_end: RET +// func mulAvxGFNI_6x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x3Xor(SB), $0-88 + // Loading 11 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + +mulAvxGFNI_6x3Xor_loop: + // Load 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU (R12), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R11) + ADDQ $0x20, R11 + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x3Xor_loop + VZEROUPPER + +mulAvxGFNI_6x3Xor_end: + 
RET + // func mulGFNI_6x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x4_64(SB), $0-88 @@ -12698,6 +25451,157 @@ mulGFNI_6x4_64_loop: mulGFNI_6x4_64_end: RET +// func mulAvxGFNI_6x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x4(SB), $0-88 + // Loading 10 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + +mulAvxGFNI_6x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x4_loop + VZEROUPPER + +mulAvxGFNI_6x4_end: + RET + // func mulGFNI_6x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x4_64Xor(SB), $0-88 @@ -12859,6 +25763,167 @@ mulGFNI_6x4_64Xor_loop: mulGFNI_6x4_64Xor_end: RET +// func mulAvxGFNI_6x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x4Xor(SB), $0-88 + // Loading 10 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + +mulAvxGFNI_6x4Xor_loop: + // Load 4 outputs + VMOVDQU (R11), Y10 + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 
outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R11) + ADDQ $0x20, R11 + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x4Xor_loop + VZEROUPPER + +mulAvxGFNI_6x4Xor_end: + RET + // func mulGFNI_6x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x5_64(SB), $0-88 @@ -13026,6 +26091,178 @@ mulGFNI_6x5_64_loop: mulGFNI_6x5_64_end: RET +// func mulAvxGFNI_6x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x5(SB), $0-88 + // Loading 9 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + +mulAvxGFNI_6x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x5_loop + VZEROUPPER + +mulAvxGFNI_6x5_end: + RET + // func mulGFNI_6x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x5_64Xor(SB), $0-88 @@ -13205,6 +26442,190 @@ mulGFNI_6x5_64Xor_loop: mulGFNI_6x5_64Xor_end: RET +// func mulAvxGFNI_6x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x5Xor(SB), $0-88 + // Loading 9 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 37 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + 
ADDQ R15, R9 + ADDQ R15, DX + +mulAvxGFNI_6x5Xor_loop: + // Load 5 outputs + VMOVDQU (R11), Y9 + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R11) + ADDQ $0x20, R11 + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x5Xor_loop + VZEROUPPER + +mulAvxGFNI_6x5Xor_end: + RET + // func mulGFNI_6x6_64(matrix []uint64, in [][]byte, out [][]byte, start 
int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x6_64(SB), $8-88 @@ -13386,6 +26807,199 @@ mulGFNI_6x6_64_loop: mulGFNI_6x6_64_end: RET +// func mulAvxGFNI_6x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x6(SB), $8-88 + // Loading 8 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulAvxGFNI_6x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + 
VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x6_loop + VZEROUPPER + +mulAvxGFNI_6x6_end: + RET + // func mulGFNI_6x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x6_64Xor(SB), $8-88 @@ -13581,6 +27195,213 @@ mulGFNI_6x6_64Xor_loop: mulGFNI_6x6_64Xor_end: RET +// func mulAvxGFNI_6x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x6Xor(SB), $8-88 + // Loading 8 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + +mulAvxGFNI_6x6Xor_loop: + // Load 6 outputs + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and 
process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_6x6Xor_loop + VZEROUPPER + +mulAvxGFNI_6x6Xor_end: + RET + // func mulGFNI_6x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT 
·mulGFNI_6x7_64(SB), $8-88 @@ -13780,6 +27601,224 @@ mulGFNI_6x7_64_loop: mulGFNI_6x7_64_end: RET +// func mulAvxGFNI_6x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x7(SB), $8-88 + // Loading 7 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_6x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, 
Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_6x7_loop + VZEROUPPER + +mulAvxGFNI_6x7_end: + RET + // func mulGFNI_6x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x7_64Xor(SB), $8-88 @@ -13995,6 +28034,240 @@ mulGFNI_6x7_64Xor_loop: mulGFNI_6x7_64Xor_end: RET +// func mulAvxGFNI_6x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x7Xor(SB), $8-88 + // Loading 7 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 51 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + + // Reload length to save a register + MOVQ 
n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_6x7Xor_loop: + // Load 7 outputs + VMOVDQU (R10), Y7 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R9), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_6x7Xor_loop + VZEROUPPER + +mulAvxGFNI_6x7Xor_end: + RET + // func mulGFNI_6x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x8_64(SB), $0-88 @@ -14187,6 +28460,224 @@ mulGFNI_6x8_64_loop: mulGFNI_6x8_64_end: RET +// func mulAvxGFNI_6x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x8(SB), $0-88 + // Loading 6 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU 
(DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for 
next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x8_loop + VZEROUPPER + +mulAvxGFNI_6x8_end: + RET + // func mulGFNI_6x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x8_64Xor(SB), $0-88 @@ -14405,6 +28896,250 @@ mulGFNI_6x8_64Xor_loop: mulGFNI_6x8_64Xor_end: RET +// func mulAvxGFNI_6x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x8Xor(SB), $0-88 + // Loading 6 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x8Xor_loop: + // Load 8 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x8Xor_loop + VZEROUPPER + +mulAvxGFNI_6x8Xor_end: + RET + // func mulGFNI_6x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x9_64(SB), $0-88 @@ -14609,6 +29344,243 @@ mulGFNI_6x9_64_loop: mulGFNI_6x9_64_end: RET +// func mulAvxGFNI_6x9(matrix []uint64, in [][]byte, out [][]byte, start int, n 
int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x9(SB), $0-88 + // Loading 5 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x9_loop + VZEROUPPER + +mulAvxGFNI_6x9_end: + RET + // func mulGFNI_6x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x9_64Xor(SB), $0-88 @@ -14842,6 +29814,272 @@ mulGFNI_6x9_64Xor_loop: mulGFNI_6x9_64Xor_end: RET +// func mulAvxGFNI_6x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x9Xor(SB), $0-88 + // Loading 5 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 
24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x9Xor_loop: + // Load 9 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y5 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + 
VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x9Xor_loop + VZEROUPPER + +mulAvxGFNI_6x9Xor_end: + RET + // func mulGFNI_6x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x10_64(SB), $0-88 @@ -15058,6 +30296,262 @@ mulGFNI_6x10_64_loop: mulGFNI_6x10_64_end: RET +// func mulAvxGFNI_6x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x10(SB), $0-88 + // Loading 4 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ 
mulAvxGFNI_6x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y9, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x10_loop + VZEROUPPER + +mulAvxGFNI_6x10_end: + RET + // func mulGFNI_6x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_6x10_64Xor(SB), $0-88 @@ -15306,6 +30800,294 @@ mulGFNI_6x10_64Xor_loop: mulGFNI_6x10_64Xor_end: RET +// func mulAvxGFNI_6x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_6x10Xor(SB), $0-88 + // Loading 4 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 72 
YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_6x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + +mulAvxGFNI_6x10Xor_loop: + // Load 10 outputs + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y4 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y5 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y10 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y11 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y12 + MOVQ 216(R10), R12 + VMOVDQU (R12)(R11*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 
+ VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y9, 
(R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y10, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y11, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y12, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y13, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxGFNI_6x10Xor_loop + VZEROUPPER + +mulAvxGFNI_6x10Xor_end: + RET + // func mulGFNI_7x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x1_64(SB), $0-88 @@ -15403,6 +31185,103 @@ mulGFNI_7x1_64_loop: mulGFNI_7x1_64_end: RET +// func mulAvxGFNI_7x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulAvxGFNI_7x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y7 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Store 1 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x1_loop + VZEROUPPER + +mulAvxGFNI_7x1_end: + RET + // func mulGFNI_7x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x1_64Xor(SB), $0-88 @@ -15504,6 +31383,107 @@ mulGFNI_7x1_64Xor_loop: mulGFNI_7x1_64Xor_end: RET +// func mulAvxGFNI_7x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + 
VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), CX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, DX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, CX + +mulAvxGFNI_7x1Xor_loop: + // Load 1 outputs + VMOVDQU (R10), Y7 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y6, Y8, Y8 + VXORPD Y7, Y8, Y7 + + // Store 1 outputs + VMOVDQU Y7, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x1Xor_loop + VZEROUPPER + +mulAvxGFNI_7x1Xor_end: + RET + // func mulGFNI_7x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x2_64(SB), $0-88 @@ -15625,6 +31605,127 @@ mulGFNI_7x2_64_loop: mulGFNI_7x2_64_end: RET +// func mulAvxGFNI_7x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x2(SB), $0-88 + // Loading 12 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + +mulAvxGFNI_7x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + 
+ // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x2_loop + VZEROUPPER + +mulAvxGFNI_7x2_end: + RET + // func mulGFNI_7x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x2_64Xor(SB), $0-88 @@ -15752,6 +31853,133 @@ mulGFNI_7x2_64Xor_loop: mulGFNI_7x2_64Xor_end: RET +// func mulAvxGFNI_7x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x2Xor(SB), $0-88 + // Loading 12 of 14 tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + +mulAvxGFNI_7x2Xor_loop: + // Load 2 outputs + VMOVDQU (R12), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs 
+ VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R12) + ADDQ $0x20, R12 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x2Xor_loop + VZEROUPPER + +mulAvxGFNI_7x2Xor_end: + RET + // func mulGFNI_7x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x3_64(SB), $0-88 @@ -15897,6 +32125,151 @@ mulGFNI_7x3_64_loop: mulGFNI_7x3_64_end: RET +// func mulAvxGFNI_7x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x3(SB), $0-88 + // Loading 11 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + +mulAvxGFNI_7x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x3_loop + VZEROUPPER + +mulAvxGFNI_7x3_end: + RET + // func mulGFNI_7x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x3_64Xor(SB), $0-88 @@ -16050,6 +32423,159 @@ mulGFNI_7x3_64Xor_loop: mulGFNI_7x3_64Xor_end: RET +// func mulAvxGFNI_7x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x3Xor(SB), $0-88 + // Loading 11 of 21 tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + +mulAvxGFNI_7x3Xor_loop: + // Load 3 outputs + VMOVDQU (R12), Y11 + VMOVDQU (R13), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, 
Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R12) + ADDQ $0x20, R12 + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x3Xor_loop + VZEROUPPER + +mulAvxGFNI_7x3Xor_end: + RET + // func mulGFNI_7x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x4_64(SB), $0-88 @@ -16217,6 +32743,175 @@ mulGFNI_7x4_64_loop: mulGFNI_7x4_64_end: RET +// func mulAvxGFNI_7x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x4(SB), $0-88 + // Loading 10 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulAvxGFNI_7x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), 
Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x4_loop + VZEROUPPER + +mulAvxGFNI_7x4_end: + RET + // func mulGFNI_7x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x4_64Xor(SB), $0-88 @@ -16394,6 +33089,185 @@ mulGFNI_7x4_64Xor_loop: mulGFNI_7x4_64Xor_end: RET +// func mulAvxGFNI_7x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x4Xor(SB), $0-88 + // Loading 10 of 28 tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + 
VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + +mulAvxGFNI_7x4Xor_loop: + // Load 4 outputs + VMOVDQU (R12), Y10 + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 
216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R12) + ADDQ $0x20, R12 + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x4Xor_loop + VZEROUPPER + +mulAvxGFNI_7x4Xor_end: + RET + // func mulGFNI_7x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x5_64(SB), $8-88 @@ -16577,6 +33451,199 @@ mulGFNI_7x5_64_loop: mulGFNI_7x5_64_end: RET +// func mulAvxGFNI_7x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x5(SB), $8-88 + // Loading 9 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulAvxGFNI_7x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x5_loop + VZEROUPPER + +mulAvxGFNI_7x5_end: + RET + // func mulGFNI_7x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x5_64Xor(SB), $8-88 @@ -16772,6 +33839,211 @@ mulGFNI_7x5_64Xor_loop: mulGFNI_7x5_64Xor_end: RET +// func mulAvxGFNI_7x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x5Xor(SB), $8-88 + // Loading 9 of 35 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + +mulAvxGFNI_7x5Xor_loop: + // Load 5 outputs + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R11), Y13 + + // Load and 
process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 
+ VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_7x5Xor_loop + VZEROUPPER + +mulAvxGFNI_7x5Xor_end: + RET + // func mulGFNI_7x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x6_64(SB), $8-88 @@ -16846,16 +34118,454 @@ TEXT ·mulGFNI_7x6_64(SB), $8-88 MOVQ n+80(FP), BP SHRQ $0x06, BP -mulGFNI_7x6_64_loop: +mulGFNI_7x6_64_loop: + // Load and process 64 bytes from input 0 to 6 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + + // Load and process 64 bytes from input 1 to 6 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 6 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 6 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 6 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 6 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 6 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 + VXORPD Z25, Z31, Z25 + 
VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 6 outputs + VMOVDQU64 Z24, (R11) + ADDQ $0x40, R11 + VMOVDQU64 Z25, (R12) + ADDQ $0x40, R12 + VMOVDQU64 Z26, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_7x6_64_loop + VZEROUPPER + +mulGFNI_7x6_64_end: + RET + +// func mulAvxGFNI_7x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x6(SB), $8-88 + // Loading 8 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_7x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 
144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_7x6_loop + VZEROUPPER + +mulAvxGFNI_7x6_end: + RET + +// func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_7x6_64Xor(SB), $8-88 + // Loading 24 of 42 tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_7x6_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 
104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_7x6_64Xor_loop: + // Load 6 outputs + VMOVDQU64 (R11), Z24 + VMOVDQU64 (R12), Z25 + VMOVDQU64 (R13), Z26 + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R10), Z29 + // Load and process 64 bytes from input 0 to 6 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, Z30, Z24 - VGF2P8AFFINEQB $0x00, Z1, Z30, Z25 - VGF2P8AFFINEQB $0x00, Z2, Z30, Z26 - VGF2P8AFFINEQB $0x00, Z3, Z30, Z27 - VGF2P8AFFINEQB $0x00, Z4, Z30, Z28 - VGF2P8AFFINEQB $0x00, Z5, Z30, Z29 + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z24, Z31, Z24 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z25, Z31, Z25 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z26, Z31, Z26 + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 6 outputs VMOVDQU64 (BX), Z30 @@ -16969,64 +34679,48 @@ mulGFNI_7x6_64_loop: // Prepare for next loop DECQ BP - JNZ mulGFNI_7x6_64_loop + JNZ mulGFNI_7x6_64Xor_loop VZEROUPPER -mulGFNI_7x6_64_end: +mulGFNI_7x6_64Xor_end: RET -// func mulGFNI_7x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX512DQ, AVX512F, GFNI -TEXT ·mulGFNI_7x6_64Xor(SB), $8-88 - // Loading 24 of 42 tables to registers +// func mulAvxGFNI_7x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x6Xor(SB), $8-88 + // Loading 8 of 42 tables to registers // Destination kept in GP registers // Full registers estimated 50 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulGFNI_7x6_64Xor_end - VBROADCASTF32X2 (CX), Z0 - VBROADCASTF32X2 8(CX), Z1 - VBROADCASTF32X2 16(CX), Z2 - VBROADCASTF32X2 24(CX), Z3 - VBROADCASTF32X2 32(CX), Z4 - VBROADCASTF32X2 40(CX), Z5 - VBROADCASTF32X2 48(CX), Z6 - VBROADCASTF32X2 56(CX), Z7 - VBROADCASTF32X2 64(CX), Z8 - VBROADCASTF32X2 72(CX), Z9 - VBROADCASTF32X2 80(CX), Z10 - VBROADCASTF32X2 88(CX), Z11 - VBROADCASTF32X2 96(CX), Z12 - VBROADCASTF32X2 104(CX), Z13 - VBROADCASTF32X2 112(CX), Z14 - VBROADCASTF32X2 120(CX), Z15 - VBROADCASTF32X2 128(CX), Z16 - VBROADCASTF32X2 136(CX), Z17 - VBROADCASTF32X2 144(CX), Z18 - VBROADCASTF32X2 152(CX), Z19 - VBROADCASTF32X2 160(CX), Z20 - VBROADCASTF32X2 168(CX), Z21 - VBROADCASTF32X2 176(CX), Z22 - VBROADCASTF32X2 184(CX), Z23 - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - 
MOVQ 120(AX), R9 - MOVQ 144(AX), AX - MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 - MOVQ (R10), R11 - MOVQ 24(R10), R12 - MOVQ 48(R10), R13 - MOVQ 72(R10), R14 - MOVQ 96(R10), R15 - MOVQ 120(R10), R10 - MOVQ start+72(FP), BP + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R11 @@ -17047,149 +34741,183 @@ TEXT ·mulGFNI_7x6_64Xor(SB), $8-88 // Reload length to save a register MOVQ n+80(FP), BP - SHRQ $0x06, BP + SHRQ $0x05, BP -mulGFNI_7x6_64Xor_loop: +mulAvxGFNI_7x6Xor_loop: // Load 6 outputs - VMOVDQU64 (R11), Z24 - VMOVDQU64 (R12), Z25 - VMOVDQU64 (R13), Z26 - VMOVDQU64 (R14), Z27 - VMOVDQU64 (R15), Z28 - VMOVDQU64 (R10), Z29 - - // Load and process 64 bytes from input 0 to 6 outputs - VMOVDQU64 (DX), Z30 - ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 1 to 6 outputs - VMOVDQU64 (BX), Z30 - ADDQ $0x40, BX - VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 2 to 6 outputs - VMOVDQU64 (SI), Z30 - ADDQ $0x40, SI - VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 3 to 6 outputs - VMOVDQU64 (DI), Z30 - ADDQ $0x40, DI - VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 4 to 6 outputs - VMOVDQU64 (R8), Z30 - ADDQ $0x40, R8 - VGF2P8AFFINEQB.BCST $0x00, 192(CX), Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB.BCST $0x00, 200(CX), Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB.BCST $0x00, 208(CX), Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB.BCST $0x00, 
224(CX), Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 5 to 6 outputs - VMOVDQU64 (R9), Z30 - ADDQ $0x40, R9 - VGF2P8AFFINEQB.BCST $0x00, 240(CX), Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB.BCST $0x00, 248(CX), Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB.BCST $0x00, 256(CX), Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB.BCST $0x00, 264(CX), Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB.BCST $0x00, 272(CX), Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB.BCST $0x00, 280(CX), Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 6 to 6 outputs - VMOVDQU64 (AX), Z30 - ADDQ $0x40, AX - VGF2P8AFFINEQB.BCST $0x00, 288(CX), Z30, Z31 - VXORPD Z24, Z31, Z24 - VGF2P8AFFINEQB.BCST $0x00, 296(CX), Z30, Z31 - VXORPD Z25, Z31, Z25 - VGF2P8AFFINEQB.BCST $0x00, 304(CX), Z30, Z31 - VXORPD Z26, Z31, Z26 - VGF2P8AFFINEQB.BCST $0x00, 312(CX), Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB.BCST $0x00, 320(CX), Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB.BCST $0x00, 328(CX), Z30, Z31 - VXORPD Z29, Z31, Z29 + VMOVDQU (R11), Y8 + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R10), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and 
process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 6 outputs - VMOVDQU64 Z24, (R11) - ADDQ $0x40, R11 - VMOVDQU64 Z25, (R12) - ADDQ $0x40, R12 - VMOVDQU64 Z26, (R13) - ADDQ $0x40, R13 - VMOVDQU64 Z27, (R14) - ADDQ $0x40, R14 - VMOVDQU64 Z28, (R15) - ADDQ $0x40, R15 - VMOVDQU64 Z29, (R10) - ADDQ $0x40, R10 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R10) + ADDQ $0x20, R10 // Prepare for next loop DECQ BP - JNZ mulGFNI_7x6_64Xor_loop + JNZ mulAvxGFNI_7x6Xor_loop VZEROUPPER -mulGFNI_7x6_64Xor_end: +mulAvxGFNI_7x6Xor_end: RET // func mulGFNI_7x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) @@ -17392,6 +35120,232 @@ mulGFNI_7x7_64_loop: mulGFNI_7x7_64_end: RET +// func mulAvxGFNI_7x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x7(SB), $0-88 + // Loading 7 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + 
ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x7_loop + VZEROUPPER + +mulAvxGFNI_7x7_end: + RET + // func mulGFNI_7x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x7_64Xor(SB), $0-88 @@ -17615,6 +35569,255 @@ mulGFNI_7x7_64Xor_loop: mulGFNI_7x7_64Xor_end: RET +// func mulAvxGFNI_7x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x7Xor(SB), $0-88 + // Loading 7 of 49 tables to registers + // Destination kept on stack + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x7Xor_loop: + // Load 7 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x7Xor_loop + VZEROUPPER + +mulAvxGFNI_7x7Xor_end: + RET + // func mulGFNI_7x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x8_64(SB), $0-88 @@ -17829,6 +36032,254 @@ mulGFNI_7x8_64_loop: mulGFNI_7x8_64_end: RET +// func mulAvxGFNI_7x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x8(SB), $0-88 + // Loading 6 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 
392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x8_loop + VZEROUPPER + +mulAvxGFNI_7x8_end: + RET + // func mulGFNI_7x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x8_64Xor(SB), $0-88 @@ -18069,6 +36520,280 @@ mulGFNI_7x8_64Xor_loop: mulGFNI_7x8_64Xor_end: RET +// func mulAvxGFNI_7x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x8Xor(SB), $0-88 + // Loading 6 of 56 tables to registers + // Destination kept on stack + // Full registers estimated 66 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x8Xor_loop: + // Load 8 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x8Xor_loop + VZEROUPPER + +mulAvxGFNI_7x8Xor_end: + RET + // func mulGFNI_7x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x9_64(SB), $0-88 @@ -18297,6 +37022,276 @@ mulGFNI_7x9_64_loop: mulGFNI_7x9_64_end: RET +// func mulAvxGFNI_7x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x9(SB), $0-88 + // Loading 5 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, 
Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, 
Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x9_loop + VZEROUPPER + +mulAvxGFNI_7x9_end: + RET + // func mulGFNI_7x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x9_64Xor(SB), $0-88 @@ -18554,6 +37549,305 @@ mulGFNI_7x9_64Xor_loop: mulGFNI_7x9_64Xor_end: RET +// func mulAvxGFNI_7x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x9Xor(SB), $0-88 + // Loading 5 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x9Xor_loop: + // Load 9 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y5 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 
192(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 
288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x9Xor_loop + VZEROUPPER + +mulAvxGFNI_7x9Xor_end: + RET + // func mulGFNI_7x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x10_64(SB), $0-88 @@ -18796,6 +38090,298 @@ mulGFNI_7x10_64_loop: mulGFNI_7x10_64_end: RET +// func mulAvxGFNI_7x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x10(SB), $0-88 + // Loading 4 of 70 tables to 
registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x10_loop + VZEROUPPER + +mulAvxGFNI_7x10_end: + RET + // func mulGFNI_7x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_7x10_64Xor(SB), $0-88 @@ -19070,6 +38656,330 @@ mulGFNI_7x10_64Xor_loop: mulGFNI_7x10_64Xor_end: RET +// func mulAvxGFNI_7x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_7x10Xor(SB), $0-88 + // Loading 4 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_7x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + +mulAvxGFNI_7x10Xor_loop: + // Load 10 outputs + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y4 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y10 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y11 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y12 + MOVQ 216(R11), R13 + VMOVDQU (R13)(R12*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from 
input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y10, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y11, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y12, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y13, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxGFNI_7x10Xor_loop + VZEROUPPER + +mulAvxGFNI_7x10Xor_end: + RET + // func mulGFNI_8x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x1_64(SB), $0-88 @@ -19116,11 +39026,227 @@ TEXT ·mulGFNI_8x1_64(SB), $0-88 ADDQ R12, R10 ADDQ R12, CX -mulGFNI_8x1_64_loop: +mulGFNI_8x1_64_loop: + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU64 (DX), Z9 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z9, Z8 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU64 (BX), Z9 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z1, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU64 (SI), Z9 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z2, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU64 (DI), Z9 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z3, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU64 (R8), Z9 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z4, Z9, 
Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU64 (R9), Z9 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z5, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU64 (R10), Z9 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z6, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU64 (CX), Z9 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z7, Z9, Z9 + VXORPD Z8, Z9, Z8 + + // Store 1 outputs + VMOVDQU64 Z8, (R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_8x1_64_loop + VZEROUPPER + +mulGFNI_8x1_64_end: + RET + +// func mulAvxGFNI_8x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulAvxGFNI_8x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y8 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Store 1 outputs + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x1_loop + VZEROUPPER + +mulAvxGFNI_8x1_end: + RET + +// func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_8x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_8x1_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + 
VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, DX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, CX + +mulGFNI_8x1_64Xor_loop: + // Load 1 outputs + VMOVDQU64 (R11), Z8 + // Load and process 64 bytes from input 0 to 1 outputs VMOVDQU64 (DX), Z9 ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, Z9, Z8 + VGF2P8AFFINEQB $0x00, Z0, Z9, Z9 + VXORPD Z8, Z9, Z8 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU64 (BX), Z9 @@ -19170,44 +39296,44 @@ mulGFNI_8x1_64_loop: // Prepare for next loop DECQ AX - JNZ mulGFNI_8x1_64_loop + JNZ mulGFNI_8x1_64Xor_loop VZEROUPPER -mulGFNI_8x1_64_end: +mulGFNI_8x1_64Xor_end: RET -// func mulGFNI_8x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX512DQ, AVX512F, GFNI -TEXT ·mulGFNI_8x1_64Xor(SB), $0-88 +// func mulAvxGFNI_8x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x1Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers // Full registers estimated 11 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulGFNI_8x1_64Xor_end - VBROADCASTF32X2 (CX), Z0 - VBROADCASTF32X2 8(CX), Z1 - VBROADCASTF32X2 16(CX), Z2 - VBROADCASTF32X2 24(CX), Z3 - VBROADCASTF32X2 32(CX), Z4 - VBROADCASTF32X2 40(CX), Z5 - VBROADCASTF32X2 48(CX), Z6 - VBROADCASTF32X2 56(CX), Z7 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), R8 - MOVQ 120(CX), R9 - MOVQ 144(CX), R10 - MOVQ 168(CX), CX - MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 - MOVQ (R11), R11 - MOVQ start+72(FP), R12 + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), CX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 // Add start offset to output ADDQ R12, R11 @@ -19222,68 +39348,68 @@ TEXT ·mulGFNI_8x1_64Xor(SB), $0-88 ADDQ R12, R10 ADDQ R12, CX -mulGFNI_8x1_64Xor_loop: +mulAvxGFNI_8x1Xor_loop: // Load 1 outputs - VMOVDQU64 (R11), Z8 - - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU64 (DX), Z9 - ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, Z9, Z9 - VXORPD Z8, Z9, Z8 - - // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU64 (BX), Z9 - ADDQ $0x40, BX - VGF2P8AFFINEQB $0x00, Z1, Z9, Z9 - VXORPD Z8, Z9, Z8 - - // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU64 (SI), Z9 - ADDQ $0x40, SI - VGF2P8AFFINEQB $0x00, Z2, Z9, Z9 - VXORPD Z8, Z9, Z8 - - // Load and process 64 bytes from input 3 to 1 outputs - VMOVDQU64 (DI), Z9 - ADDQ $0x40, DI - VGF2P8AFFINEQB 
$0x00, Z3, Z9, Z9 - VXORPD Z8, Z9, Z8 - - // Load and process 64 bytes from input 4 to 1 outputs - VMOVDQU64 (R8), Z9 - ADDQ $0x40, R8 - VGF2P8AFFINEQB $0x00, Z4, Z9, Z9 - VXORPD Z8, Z9, Z8 - - // Load and process 64 bytes from input 5 to 1 outputs - VMOVDQU64 (R9), Z9 - ADDQ $0x40, R9 - VGF2P8AFFINEQB $0x00, Z5, Z9, Z9 - VXORPD Z8, Z9, Z8 - - // Load and process 64 bytes from input 6 to 1 outputs - VMOVDQU64 (R10), Z9 - ADDQ $0x40, R10 - VGF2P8AFFINEQB $0x00, Z6, Z9, Z9 - VXORPD Z8, Z9, Z8 - - // Load and process 64 bytes from input 7 to 1 outputs - VMOVDQU64 (CX), Z9 - ADDQ $0x40, CX - VGF2P8AFFINEQB $0x00, Z7, Z9, Z9 - VXORPD Z8, Z9, Z8 + VMOVDQU (R11), Y8 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y9, Y9 + VXORPD Y8, Y9, Y8 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y7, Y9, Y9 + VXORPD Y8, Y9, Y8 // Store 1 outputs - VMOVDQU64 Z8, (R11) - ADDQ $0x40, R11 + VMOVDQU Y8, (R11) + ADDQ $0x20, R11 // Prepare for next loop DECQ AX - JNZ mulGFNI_8x1_64Xor_loop + JNZ mulAvxGFNI_8x1Xor_loop VZEROUPPER -mulGFNI_8x1_64Xor_end: +mulAvxGFNI_8x1Xor_end: RET // func mulGFNI_8x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) @@ -19419,6 +39545,139 @@ mulGFNI_8x2_64_loop: mulGFNI_8x2_64_end: RET +// func mulAvxGFNI_8x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x2(SB), $0-88 + // Loading 12 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + +mulAvxGFNI_8x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + 
VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x2_loop + VZEROUPPER + +mulAvxGFNI_8x2_end: + RET + // func mulGFNI_8x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x2_64Xor(SB), $0-88 @@ -19558,6 +39817,145 @@ mulGFNI_8x2_64Xor_loop: mulGFNI_8x2_64Xor_end: RET +// func mulAvxGFNI_8x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x2Xor(SB), $0-88 + // Loading 12 of 16 tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + +mulAvxGFNI_8x2Xor_loop: + // Load 2 outputs + VMOVDQU (R13), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU 
(BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R13) + ADDQ $0x20, R13 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x2Xor_loop + VZEROUPPER + +mulAvxGFNI_8x2Xor_end: + RET + // func mulGFNI_8x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x3_64(SB), $0-88 @@ -19718,6 +40116,166 @@ mulGFNI_8x3_64_loop: mulGFNI_8x3_64_end: RET +// func mulAvxGFNI_8x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x3(SB), $0-88 + // Loading 11 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + +mulAvxGFNI_8x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + 
VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x3_loop + VZEROUPPER + +mulAvxGFNI_8x3_end: + RET + // func mulGFNI_8x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x3_64Xor(SB), $0-88 @@ -19886,6 +40444,174 @@ mulGFNI_8x3_64Xor_loop: mulGFNI_8x3_64Xor_end: RET +// func mulAvxGFNI_8x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x3Xor(SB), $0-88 + // Loading 11 of 24 tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + 
VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + +mulAvxGFNI_8x3Xor_loop: + // Load 3 outputs + VMOVDQU (R13), Y11 + VMOVDQU (R14), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R13) + ADDQ $0x20, R13 + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x3Xor_loop + VZEROUPPER + +mulAvxGFNI_8x3Xor_end: + RET + // func 
mulGFNI_8x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x4_64(SB), $8-88 @@ -20067,6 +40793,193 @@ mulGFNI_8x4_64_loop: mulGFNI_8x4_64_end: RET +// func mulAvxGFNI_8x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x4(SB), $8-88 + // Loading 10 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulAvxGFNI_8x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x4_loop + VZEROUPPER + +mulAvxGFNI_8x4_end: + RET + // func mulGFNI_8x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x4_64Xor(SB), $8-88 @@ -20258,6 +41171,203 @@ mulGFNI_8x4_64Xor_loop: mulGFNI_8x4_64Xor_end: RET +// func mulAvxGFNI_8x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x4Xor(SB), $8-88 + // Loading 10 of 32 tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + +mulAvxGFNI_8x4Xor_loop: + // Load 4 outputs + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + 
VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_8x4Xor_loop + VZEROUPPER + +mulAvxGFNI_8x4Xor_end: + RET + // func mulGFNI_8x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x5_64(SB), $8-88 @@ -20461,6 +41571,224 @@ mulGFNI_8x5_64_loop: mulGFNI_8x5_64_end: RET +// func mulAvxGFNI_8x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x5(SB), $8-88 + // Loading 9 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 
YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_8x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + 
VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_8x5_loop + VZEROUPPER + +mulAvxGFNI_8x5_end: + RET + // func mulGFNI_8x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x5_64Xor(SB), $8-88 @@ -20676,6 +42004,236 @@ mulGFNI_8x5_64Xor_loop: mulGFNI_8x5_64Xor_end: RET +// func mulAvxGFNI_8x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x5Xor(SB), $8-88 + // Loading 9 of 40 tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_8x5Xor_loop: + // Load 5 outputs + VMOVDQU (R12), Y9 + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R11), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + 
VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_8x5Xor_loop + VZEROUPPER + +mulAvxGFNI_8x5Xor_end: + RET + // func mulGFNI_8x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x6_64(SB), $0-88 @@ -20880,6 +42438,234 @@ mulGFNI_8x6_64_loop: mulGFNI_8x6_64_end: RET +// func mulAvxGFNI_8x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x6(SB), $0-88 + // Loading 8 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ 
$0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x6_loop + VZEROUPPER + +mulAvxGFNI_8x6_end: + RET + // func mulGFNI_8x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, 
start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x6_64Xor(SB), $0-88 @@ -21104,6 +42890,254 @@ mulGFNI_8x6_64Xor_loop: mulGFNI_8x6_64Xor_end: RET +// func mulAvxGFNI_8x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x6Xor(SB), $0-88 + // Loading 8 of 48 tables to registers + // Destination kept on stack + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x6Xor_loop: + // Load 6 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x6Xor_loop + VZEROUPPER + +mulAvxGFNI_8x6Xor_end: + RET + // func mulGFNI_8x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x7_64(SB), $0-88 @@ -21324,6 +43358,259 @@ mulGFNI_8x7_64_loop: mulGFNI_8x7_64_end: RET +// func mulAvxGFNI_8x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x7(SB), $0-88 + // Loading 7 of 56 tables to 
registers + // Destination kept on stack + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + 
VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x7_loop + VZEROUPPER + +mulAvxGFNI_8x7_end: + RET + // func mulGFNI_8x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x7_64Xor(SB), $0-88 @@ -21567,6 +43854,282 @@ mulGFNI_8x7_64Xor_loop: mulGFNI_8x7_64Xor_end: RET +// func mulAvxGFNI_8x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x7Xor(SB), $0-88 + // Loading 7 of 56 tables to registers + // Destination kept on stack + // Full registers 
estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x7Xor_loop: + // Load 7 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, 
Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x7Xor_loop + VZEROUPPER + +mulAvxGFNI_8x7Xor_end: + RET + // func 
mulGFNI_8x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x8_64(SB), $0-88 @@ -21803,6 +44366,284 @@ mulGFNI_8x8_64_loop: mulGFNI_8x8_64_end: RET +// func mulAvxGFNI_8x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x8(SB), $0-88 + // Loading 6 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x8_loop + VZEROUPPER + +mulAvxGFNI_8x8_end: + RET + // func mulGFNI_8x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x8_64Xor(SB), $0-88 @@ -22065,6 +44906,310 @@ mulGFNI_8x8_64Xor_loop: mulGFNI_8x8_64Xor_end: RET +// func mulAvxGFNI_8x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x8Xor(SB), $0-88 + // Loading 6 of 64 tables to registers + // Destination kept on stack + // Full registers estimated 74 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x8Xor_loop: + // Load 8 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x8Xor_loop + VZEROUPPER + +mulAvxGFNI_8x8Xor_end: + RET + // func mulGFNI_8x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x9_64(SB), $0-88 @@ -22317,6 +45462,309 @@ mulGFNI_8x9_64_loop: mulGFNI_8x9_64_end: RET +// func mulAvxGFNI_8x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x9(SB), $0-88 + // Loading 5 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + 
VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x9_loop + VZEROUPPER + +mulAvxGFNI_8x9_end: + RET + // func mulGFNI_8x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x9_64Xor(SB), $0-88 @@ -22598,6 +46046,338 @@ mulGFNI_8x9_64Xor_loop: mulGFNI_8x9_64Xor_end: RET +// func mulAvxGFNI_8x9Xor(matrix 
[]uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x9Xor(SB), $0-88 + // Loading 5 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x9Xor_loop: + // Load 9 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y5 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, 
Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x9Xor_loop + VZEROUPPER + +mulAvxGFNI_8x9Xor_end: + RET + // func mulGFNI_8x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x10_64(SB), $0-88 @@ -22866,6 +46646,334 @@ mulGFNI_8x10_64_loop: mulGFNI_8x10_64_end: RET +// func mulAvxGFNI_8x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x10(SB), $0-88 + // Loading 4 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and 
process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y9, 
(R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x10_loop + VZEROUPPER + +mulAvxGFNI_8x10_end: + RET + // func mulGFNI_8x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_8x10_64Xor(SB), $0-88 @@ -23166,6 +47274,366 @@ mulGFNI_8x10_64Xor_loop: mulGFNI_8x10_64Xor_end: RET +// func mulAvxGFNI_8x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_8x10Xor(SB), $0-88 + // Loading 4 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_8x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + +mulAvxGFNI_8x10Xor_loop: + // Load 10 outputs + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y4 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y10 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y11 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y12 + MOVQ 216(R12), R14 + VMOVDQU (R14)(R13*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + 
VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y10, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y11, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y12, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y13, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxGFNI_8x10Xor_loop + VZEROUPPER + +mulAvxGFNI_8x10Xor_end: + RET + // func mulGFNI_9x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x1_64(SB), $0-88 @@ -23281,6 +47749,121 @@ mulGFNI_9x1_64_loop: mulGFNI_9x1_64_end: RET +// func mulAvxGFNI_9x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT 
·mulAvxGFNI_9x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulAvxGFNI_9x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y9 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Store 1 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x1_loop + VZEROUPPER + +mulAvxGFNI_9x1_end: + RET + // func mulGFNI_9x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x1_64Xor(SB), $0-88 @@ -23400,6 +47983,125 @@ mulGFNI_9x1_64Xor_loop: mulGFNI_9x1_64Xor_end: RET +// func mulAvxGFNI_9x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + 
MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, DX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, CX + +mulAvxGFNI_9x1Xor_loop: + // Load 1 outputs + VMOVDQU (R12), Y9 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y8, Y10, Y10 + VXORPD Y9, Y10, Y9 + + // Store 1 outputs + VMOVDQU Y9, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x1Xor_loop + VZEROUPPER + +mulAvxGFNI_9x1Xor_end: + RET + // func mulGFNI_9x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x2_64(SB), $0-88 @@ -23545,6 +48247,151 @@ mulGFNI_9x2_64_loop: mulGFNI_9x2_64_end: RET +// func mulAvxGFNI_9x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x2(SB), $0-88 + // Loading 12 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + +mulAvxGFNI_9x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + 
ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x2_loop + VZEROUPPER + +mulAvxGFNI_9x2_end: + RET + // func mulGFNI_9x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x2_64Xor(SB), $0-88 @@ -23696,6 +48543,157 @@ mulGFNI_9x2_64Xor_loop: mulGFNI_9x2_64Xor_end: RET +// func mulAvxGFNI_9x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x2Xor(SB), $0-88 + // Loading 12 of 18 tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + 
ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + +mulAvxGFNI_9x2Xor_loop: + // Load 2 outputs + VMOVDQU (R14), Y12 + VMOVDQU (R13), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R14) + ADDQ $0x20, R14 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x2Xor_loop + VZEROUPPER + +mulAvxGFNI_9x2Xor_end: + RET + // func mulGFNI_9x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x3_64(SB), $0-88 @@ -23767,13 +48765,371 @@ TEXT ·mulGFNI_9x3_64(SB), $0-88 ADDQ R15, R11 ADDQ R15, CX -mulGFNI_9x3_64_loop: +mulGFNI_9x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, 
Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (CX), Z30 + ADDQ $0x40, CX + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R13) + ADDQ $0x40, R13 + VMOVDQU64 Z28, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z29, (R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulGFNI_9x3_64_loop + VZEROUPPER + +mulGFNI_9x3_64_end: + RET + +// func mulAvxGFNI_9x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x3(SB), $8-88 + // Loading 11 of 27 tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + +mulAvxGFNI_9x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ 
$0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_9x3_loop + VZEROUPPER + +mulAvxGFNI_9x3_end: + RET + +// func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_9x3_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_9x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + 
VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), CX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, DX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, CX + +mulGFNI_9x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R13), Z27 + VMOVDQU64 (R14), Z28 + VMOVDQU64 (R12), Z29 + // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 - VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 - VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (BX), Z30 @@ -23865,193 +49221,193 @@ mulGFNI_9x3_64_loop: // Prepare for next loop DECQ AX - JNZ mulGFNI_9x3_64_loop + JNZ mulGFNI_9x3_64Xor_loop VZEROUPPER -mulGFNI_9x3_64_end: +mulGFNI_9x3_64Xor_end: RET -// func mulGFNI_9x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX512DQ, AVX512F, GFNI -TEXT ·mulGFNI_9x3_64Xor(SB), $0-88 - // Loading all tables to registers +// func mulAvxGFNI_9x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x3Xor(SB), $8-88 + // Loading 11 of 27 tables to registers // Destination kept in GP registers // Full registers estimated 32 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulGFNI_9x3_64Xor_end - VBROADCASTF32X2 (CX), Z0 - VBROADCASTF32X2 8(CX), Z1 - VBROADCASTF32X2 16(CX), Z2 - VBROADCASTF32X2 24(CX), Z3 - VBROADCASTF32X2 32(CX), Z4 - VBROADCASTF32X2 40(CX), Z5 - VBROADCASTF32X2 48(CX), Z6 - VBROADCASTF32X2 56(CX), Z7 - VBROADCASTF32X2 64(CX), Z8 - VBROADCASTF32X2 72(CX), Z9 - VBROADCASTF32X2 80(CX), Z10 - VBROADCASTF32X2 88(CX), Z11 - VBROADCASTF32X2 96(CX), Z12 - VBROADCASTF32X2 104(CX), Z13 - VBROADCASTF32X2 112(CX), Z14 - VBROADCASTF32X2 120(CX), Z15 - VBROADCASTF32X2 128(CX), Z16 - VBROADCASTF32X2 136(CX), Z17 - VBROADCASTF32X2 144(CX), Z18 - VBROADCASTF32X2 152(CX), Z19 - VBROADCASTF32X2 160(CX), Z20 - VBROADCASTF32X2 168(CX), Z21 - VBROADCASTF32X2 176(CX), Z22 - VBROADCASTF32X2 184(CX), Z23 - VBROADCASTF32X2 192(CX), Z24 - VBROADCASTF32X2 200(CX), Z25 - VBROADCASTF32X2 208(CX), Z26 - MOVQ in_base+24(FP), CX - MOVQ (CX), DX - MOVQ 24(CX), BX - MOVQ 48(CX), SI - MOVQ 72(CX), DI - MOVQ 96(CX), R8 - MOVQ 120(CX), R9 - MOVQ 144(CX), R10 - MOVQ 168(CX), R11 - MOVQ 192(CX), CX - MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 - MOVQ (R12), R13 - MOVQ 24(R12), R14 - MOVQ 48(R12), R12 - MOVQ 
start+72(FP), R15 + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP // Add start offset to output - ADDQ R15, R13 - ADDQ R15, R14 - ADDQ R15, R12 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 // Add start offset to input - ADDQ R15, DX - ADDQ R15, BX - ADDQ R15, SI - ADDQ R15, DI - ADDQ R15, R8 - ADDQ R15, R9 - ADDQ R15, R10 - ADDQ R15, R11 - ADDQ R15, CX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX -mulGFNI_9x3_64Xor_loop: +mulAvxGFNI_9x3Xor_loop: // Load 3 outputs - VMOVDQU64 (R13), Z27 - VMOVDQU64 (R14), Z28 - VMOVDQU64 (R12), Z29 - - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU64 (DX), Z30 - ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU64 (BX), Z30 - ADDQ $0x40, BX - VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU64 (SI), Z30 - ADDQ $0x40, SI - VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 3 to 3 outputs - VMOVDQU64 (DI), Z30 - ADDQ $0x40, DI - VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 4 to 3 outputs - VMOVDQU64 (R8), Z30 - ADDQ $0x40, R8 - VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 5 to 3 outputs - VMOVDQU64 (R9), Z30 - ADDQ $0x40, R9 - VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 6 to 3 outputs - VMOVDQU64 (R10), Z30 - ADDQ $0x40, R10 - VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 7 to 3 outputs - VMOVDQU64 (R11), Z30 - ADDQ $0x40, R11 - VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and 
process 64 bytes from input 8 to 3 outputs - VMOVDQU64 (CX), Z30 - ADDQ $0x40, CX - VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 - VXORPD Z29, Z31, Z29 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R13), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU64 Z27, (R13) - ADDQ $0x40, R13 - VMOVDQU64 Z28, (R14) - ADDQ $0x40, R14 - VMOVDQU64 Z29, (R12) - ADDQ $0x40, R12 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 // Prepare for 
next loop DECQ AX - JNZ mulGFNI_9x3_64Xor_loop + JNZ mulAvxGFNI_9x3Xor_loop VZEROUPPER -mulGFNI_9x3_64Xor_end: +mulAvxGFNI_9x3Xor_end: RET // func mulGFNI_9x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) @@ -24253,6 +49609,215 @@ mulGFNI_9x4_64_loop: mulGFNI_9x4_64_end: RET +// func mulAvxGFNI_9x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x4(SB), $8-88 + // Loading 10 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_9x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 
outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_9x4_loop + VZEROUPPER + +mulAvxGFNI_9x4_end: + RET + // func mulGFNI_9x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x4_64Xor(SB), $8-88 @@ -24462,6 +50027,225 @@ mulGFNI_9x4_64Xor_loop: mulGFNI_9x4_64Xor_end: RET +// func mulAvxGFNI_9x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x4Xor(SB), $8-88 + // Loading 10 of 36 tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + + // Reload length 
to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_9x4Xor_loop: + // Load 4 outputs + VMOVDQU (R13), Y10 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R12), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from 
input 8 to 4 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_9x4Xor_loop + VZEROUPPER + +mulAvxGFNI_9x4Xor_end: + RET + // func mulGFNI_9x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x5_64(SB), $0-88 @@ -24666,6 +50450,230 @@ mulGFNI_9x5_64_loop: mulGFNI_9x5_64_end: RET +// func mulAvxGFNI_9x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x5(SB), $0-88 + // Loading 9 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x5_loop + VZEROUPPER + +mulAvxGFNI_9x5_end: + RET + // func mulGFNI_9x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x5_64Xor(SB), $0-88 @@ -24887,6 +50895,247 @@ mulGFNI_9x5_64Xor_loop: mulGFNI_9x5_64Xor_end: RET 
+// func mulAvxGFNI_9x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x5Xor(SB), $0-88 + // Loading 9 of 45 tables to registers + // Destination kept on stack + // Full registers estimated 52 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x5Xor_loop: + // Load 5 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x5Xor_loop + VZEROUPPER + +mulAvxGFNI_9x5Xor_end: + RET + // func mulGFNI_9x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x6_64(SB), $0-88 @@ -25109,6 +51358,258 @@ mulGFNI_9x6_64_loop: mulGFNI_9x6_64_end: RET +// func mulAvxGFNI_9x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x6(SB), $0-88 + // Loading 8 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 
56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x6_loop + VZEROUPPER + +mulAvxGFNI_9x6_end: + RET + // func mulGFNI_9x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x6_64Xor(SB), $0-88 @@ -25351,6 +51852,278 @@ mulGFNI_9x6_64Xor_loop: mulGFNI_9x6_64Xor_end: RET +// func mulAvxGFNI_9x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x6Xor(SB), $0-88 + // Loading 8 of 54 tables to registers + // Destination kept on stack + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 
96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x6Xor_loop: + // Load 6 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD 
Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x6Xor_loop + VZEROUPPER + +mulAvxGFNI_9x6Xor_end: + RET + // func mulGFNI_9x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x7_64(SB), $0-88 @@ -25591,6 +52364,286 @@ mulGFNI_9x7_64_loop: mulGFNI_9x7_64_end: RET +// func mulAvxGFNI_9x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x7(SB), $0-88 + // Loading 7 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ 
mulAvxGFNI_9x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop 
+ ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x7_loop + VZEROUPPER + +mulAvxGFNI_9x7_end: + RET + // func mulGFNI_9x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x7_64Xor(SB), $0-88 @@ -25854,6 +52907,309 @@ mulGFNI_9x7_64Xor_loop: mulGFNI_9x7_64Xor_end: RET +// func mulAvxGFNI_9x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x7Xor(SB), $0-88 + // Loading 7 of 63 tables to registers + // Destination kept on stack + // Full registers estimated 72 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x7Xor_loop: + // Load 7 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, 
Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x7Xor_loop + VZEROUPPER + +mulAvxGFNI_9x7Xor_end: + RET + // func mulGFNI_9x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x8_64(SB), $0-88 @@ -26112,6 +53468,314 @@ mulGFNI_9x8_64_loop: mulGFNI_9x8_64_end: RET +// func mulAvxGFNI_9x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x8(SB), $0-88 + // Loading 6 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x8_loop + VZEROUPPER + +mulAvxGFNI_9x8_end: + RET + // func mulGFNI_9x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x8_64Xor(SB), $0-88 @@ -26396,6 +54060,340 @@ mulGFNI_9x8_64Xor_loop: mulGFNI_9x8_64Xor_end: RET +// func mulAvxGFNI_9x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x8Xor(SB), $0-88 + // Loading 6 of 72 tables to registers + // Destination kept on stack + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), 
Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x8Xor_loop: + // Load 8 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 
224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // 
Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x8Xor_loop + VZEROUPPER + +mulAvxGFNI_9x8Xor_end: + RET + // func mulGFNI_9x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x9_64(SB), $0-88 @@ -26672,6 +54670,342 @@ mulGFNI_9x9_64_loop: mulGFNI_9x9_64_end: RET +// func mulAvxGFNI_9x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x9(SB), $0-88 + // Loading 5 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + 
JNZ mulAvxGFNI_9x9_loop + VZEROUPPER + +mulAvxGFNI_9x9_end: + RET + // func mulGFNI_9x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x9_64Xor(SB), $0-88 @@ -26977,6 +55311,371 @@ mulGFNI_9x9_64Xor_loop: mulGFNI_9x9_64Xor_end: RET +// func mulAvxGFNI_9x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x9Xor(SB), $0-88 + // Loading 5 of 81 tables to registers + // Destination kept on stack + // Full registers estimated 92 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x9Xor_loop: + // Load 9 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y5 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 
152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 
440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x9Xor_loop + VZEROUPPER + +mulAvxGFNI_9x9Xor_end: + RET + // func mulGFNI_9x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x10_64(SB), $0-88 @@ -27271,6 +55970,370 @@ mulGFNI_9x10_64_loop: mulGFNI_9x10_64_end: RET +// func mulAvxGFNI_9x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x10(SB), $0-88 + // Loading 4 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ 
n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x10_loop + VZEROUPPER + +mulAvxGFNI_9x10_end: + RET + // func mulGFNI_9x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_9x10_64Xor(SB), $0-88 @@ -27597,6 +56660,402 @@ mulGFNI_9x10_64Xor_loop: mulGFNI_9x10_64Xor_end: RET +// func mulAvxGFNI_9x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_9x10Xor(SB), $0-88 + // Loading 4 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 102 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_9x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), 
DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + +mulAvxGFNI_9x10Xor_loop: + // Load 10 outputs + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y4 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y10 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y11 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y12 + MOVQ 216(R13), R15 + VMOVDQU (R15)(R14*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y10, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y11, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y12, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y13, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxGFNI_9x10Xor_loop + VZEROUPPER + +mulAvxGFNI_9x10Xor_end: + RET + // func mulGFNI_10x1_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x1_64(SB), $0-88 @@ -27721,6 +57180,130 @@ mulGFNI_10x1_64_loop: mulGFNI_10x1_64_end: RET +// func mulAvxGFNI_10x1(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x1(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ 
mulAvxGFNI_10x1_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulAvxGFNI_10x1_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y10 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VGF2P8AFFINEQB $0x00, Y8, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y9, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Store 1 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x1_loop + VZEROUPPER + +mulAvxGFNI_10x1_end: + RET + // func mulGFNI_10x1_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x1_64Xor(SB), $0-88 @@ -27849,6 +57432,134 @@ mulGFNI_10x1_64Xor_loop: mulGFNI_10x1_64Xor_end: RET +// func mulAvxGFNI_10x1Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x1Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 13 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x1Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), 
R8 + MOVQ 120(CX), R9 + MOVQ 144(CX), R10 + MOVQ 168(CX), R11 + MOVQ 192(CX), R12 + MOVQ 216(CX), CX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, DX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, CX + +mulAvxGFNI_10x1Xor_loop: + // Load 1 outputs + VMOVDQU (R13), Y10 + + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y1, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y3, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y4, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y5, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y6, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VGF2P8AFFINEQB $0x00, Y7, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VGF2P8AFFINEQB $0x00, Y8, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (CX), Y11 + ADDQ $0x20, CX + VGF2P8AFFINEQB $0x00, Y9, Y11, Y11 + VXORPD Y10, Y11, Y10 + + // Store 1 outputs + VMOVDQU Y10, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x1Xor_loop + VZEROUPPER + +mulAvxGFNI_10x1Xor_end: + RET + // func mulGFNI_10x2_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x2_64(SB), $0-88 @@ -28006,6 +57717,163 @@ mulGFNI_10x2_64_loop: mulGFNI_10x2_64_end: RET +// func mulAvxGFNI_10x2(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x2(SB), $8-88 + // Loading 12 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x2_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ 
BP, R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + +mulAvxGFNI_10x2_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x2_loop + VZEROUPPER + +mulAvxGFNI_10x2_end: + RET + // func mulGFNI_10x2_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x2_64Xor(SB), $0-88 @@ -28169,6 +58037,169 @@ mulGFNI_10x2_64Xor_loop: mulGFNI_10x2_64Xor_end: RET +// func mulAvxGFNI_10x2Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x2Xor(SB), $8-88 + // Loading 12 of 20 tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x2Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), 
Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + VBROADCASTSD 88(CX), Y11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + +mulAvxGFNI_10x2Xor_loop: + // Load 2 outputs + VMOVDQU (R15), Y12 + VMOVDQU (R14), Y13 + + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 2 outputs + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxGFNI_10x2Xor_loop + VZEROUPPER + +mulAvxGFNI_10x2Xor_end: + RET + // func mulGFNI_10x3_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, 
AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x3_64(SB), $8-88 @@ -28246,13 +58277,406 @@ TEXT ·mulGFNI_10x3_64(SB), $8-88 MOVQ n+80(FP), BP SHRQ $0x06, BP -mulGFNI_10x3_64_loop: +mulGFNI_10x3_64_loop: + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU64 (DX), Z30 + ADDQ $0x40, DX + VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU64 (BX), Z30 + ADDQ $0x40, BX + VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU64 (SI), Z30 + ADDQ $0x40, SI + VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU64 (DI), Z30 + ADDQ $0x40, DI + VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU64 (R8), Z30 + ADDQ $0x40, R8 + VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU64 (R9), Z30 + ADDQ $0x40, R9 + VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU64 (R10), Z30 + ADDQ $0x40, R10 + VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU64 (R11), Z30 + ADDQ $0x40, R11 + VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU64 (R12), Z30 + ADDQ $0x40, R12 + VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Load and process 64 bytes from input 9 to 3 outputs + VMOVDQU64 (AX), Z30 + ADDQ $0x40, AX + VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 + VXORPD Z29, Z31, Z29 + + // Store 3 outputs + VMOVDQU64 Z27, (R14) + ADDQ $0x40, R14 + VMOVDQU64 Z28, (R15) + ADDQ $0x40, R15 + VMOVDQU64 Z29, (R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ BP + JNZ mulGFNI_10x3_64_loop + VZEROUPPER + +mulGFNI_10x3_64_end: + RET + +// func mulAvxGFNI_10x3(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x3(SB), $8-88 + // Loading 11 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + 
TESTQ AX, AX + JZ mulAvxGFNI_10x3_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxGFNI_10x3_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // 
Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 3 outputs + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ BP + JNZ mulAvxGFNI_10x3_loop + VZEROUPPER + +mulAvxGFNI_10x3_end: + RET + +// func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX512DQ, AVX512F, GFNI +TEXT ·mulGFNI_10x3_64Xor(SB), $8-88 + // Loading 27 of 30 tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulGFNI_10x3_64Xor_end + VBROADCASTF32X2 (CX), Z0 + VBROADCASTF32X2 8(CX), Z1 + VBROADCASTF32X2 16(CX), Z2 + VBROADCASTF32X2 24(CX), Z3 + VBROADCASTF32X2 32(CX), Z4 + VBROADCASTF32X2 40(CX), Z5 + VBROADCASTF32X2 48(CX), Z6 + VBROADCASTF32X2 56(CX), Z7 + VBROADCASTF32X2 64(CX), Z8 + VBROADCASTF32X2 72(CX), Z9 + VBROADCASTF32X2 80(CX), Z10 + VBROADCASTF32X2 88(CX), Z11 + VBROADCASTF32X2 96(CX), Z12 + VBROADCASTF32X2 104(CX), Z13 + VBROADCASTF32X2 112(CX), Z14 + VBROADCASTF32X2 120(CX), Z15 + VBROADCASTF32X2 128(CX), Z16 + VBROADCASTF32X2 136(CX), Z17 + VBROADCASTF32X2 144(CX), Z18 + VBROADCASTF32X2 152(CX), Z19 + VBROADCASTF32X2 160(CX), Z20 + VBROADCASTF32X2 168(CX), Z21 + VBROADCASTF32X2 176(CX), Z22 + VBROADCASTF32X2 184(CX), Z23 + VBROADCASTF32X2 192(CX), Z24 + VBROADCASTF32X2 200(CX), Z25 + VBROADCASTF32X2 208(CX), Z26 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulGFNI_10x3_64Xor_loop: + // Load 3 outputs + VMOVDQU64 (R14), Z27 + VMOVDQU64 (R15), Z28 + VMOVDQU64 (R13), Z29 + // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU64 (DX), Z30 ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, Z30, Z27 - VGF2P8AFFINEQB $0x00, Z1, Z30, Z28 - VGF2P8AFFINEQB $0x00, Z2, Z30, Z29 + VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 + VXORPD Z27, Z31, Z27 + VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 + VXORPD Z28, Z31, Z28 + VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 + VXORPD Z29, Z31, Z29 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU64 (BX), Z30 @@ -28354,67 +58778,51 @@ mulGFNI_10x3_64_loop: // Prepare for next loop DECQ BP - JNZ mulGFNI_10x3_64_loop + JNZ 
mulGFNI_10x3_64Xor_loop VZEROUPPER -mulGFNI_10x3_64_end: +mulGFNI_10x3_64Xor_end: RET -// func mulGFNI_10x3_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX512DQ, AVX512F, GFNI -TEXT ·mulGFNI_10x3_64Xor(SB), $8-88 - // Loading 27 of 30 tables to registers +// func mulAvxGFNI_10x3Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x3Xor(SB), $8-88 + // Loading 11 of 30 tables to registers // Destination kept in GP registers // Full registers estimated 35 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulGFNI_10x3_64Xor_end - VBROADCASTF32X2 (CX), Z0 - VBROADCASTF32X2 8(CX), Z1 - VBROADCASTF32X2 16(CX), Z2 - VBROADCASTF32X2 24(CX), Z3 - VBROADCASTF32X2 32(CX), Z4 - VBROADCASTF32X2 40(CX), Z5 - VBROADCASTF32X2 48(CX), Z6 - VBROADCASTF32X2 56(CX), Z7 - VBROADCASTF32X2 64(CX), Z8 - VBROADCASTF32X2 72(CX), Z9 - VBROADCASTF32X2 80(CX), Z10 - VBROADCASTF32X2 88(CX), Z11 - VBROADCASTF32X2 96(CX), Z12 - VBROADCASTF32X2 104(CX), Z13 - VBROADCASTF32X2 112(CX), Z14 - VBROADCASTF32X2 120(CX), Z15 - VBROADCASTF32X2 128(CX), Z16 - VBROADCASTF32X2 136(CX), Z17 - VBROADCASTF32X2 144(CX), Z18 - VBROADCASTF32X2 152(CX), Z19 - VBROADCASTF32X2 160(CX), Z20 - VBROADCASTF32X2 168(CX), Z21 - VBROADCASTF32X2 176(CX), Z22 - VBROADCASTF32X2 184(CX), Z23 - VBROADCASTF32X2 192(CX), Z24 - VBROADCASTF32X2 200(CX), Z25 - VBROADCASTF32X2 208(CX), Z26 - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), R12 - MOVQ 216(AX), AX - MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 - MOVQ (R13), R14 - MOVQ 24(R13), R15 - MOVQ 48(R13), R13 - MOVQ start+72(FP), BP + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x3Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + VBROADCASTSD 80(CX), Y10 + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP // Add start offset to output ADDQ BP, R14 @@ -28435,128 +58843,147 @@ TEXT ·mulGFNI_10x3_64Xor(SB), $8-88 // Reload length to save a register MOVQ n+80(FP), BP - SHRQ $0x06, BP + SHRQ $0x05, BP -mulGFNI_10x3_64Xor_loop: +mulAvxGFNI_10x3Xor_loop: // Load 3 outputs - VMOVDQU64 (R14), Z27 - VMOVDQU64 (R15), Z28 - VMOVDQU64 (R13), Z29 - - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU64 (DX), Z30 - ADDQ $0x40, DX - VGF2P8AFFINEQB $0x00, Z0, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z1, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z2, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU64 (BX), Z30 - ADDQ $0x40, BX - VGF2P8AFFINEQB $0x00, Z3, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z4, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z5, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU64 (SI), Z30 - 
ADDQ $0x40, SI - VGF2P8AFFINEQB $0x00, Z6, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z7, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z8, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 3 to 3 outputs - VMOVDQU64 (DI), Z30 - ADDQ $0x40, DI - VGF2P8AFFINEQB $0x00, Z9, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z10, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z11, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 4 to 3 outputs - VMOVDQU64 (R8), Z30 - ADDQ $0x40, R8 - VGF2P8AFFINEQB $0x00, Z12, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z13, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z14, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 5 to 3 outputs - VMOVDQU64 (R9), Z30 - ADDQ $0x40, R9 - VGF2P8AFFINEQB $0x00, Z15, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z16, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z17, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 6 to 3 outputs - VMOVDQU64 (R10), Z30 - ADDQ $0x40, R10 - VGF2P8AFFINEQB $0x00, Z18, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z19, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z20, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 7 to 3 outputs - VMOVDQU64 (R11), Z30 - ADDQ $0x40, R11 - VGF2P8AFFINEQB $0x00, Z21, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z22, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z23, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 8 to 3 outputs - VMOVDQU64 (R12), Z30 - ADDQ $0x40, R12 - VGF2P8AFFINEQB $0x00, Z24, Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB $0x00, Z25, Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB $0x00, Z26, Z30, Z31 - VXORPD Z29, Z31, Z29 - - // Load and process 64 bytes from input 9 to 3 outputs - VMOVDQU64 (AX), Z30 - ADDQ $0x40, AX - VGF2P8AFFINEQB.BCST $0x00, 216(CX), Z30, Z31 - VXORPD Z27, Z31, Z27 - VGF2P8AFFINEQB.BCST $0x00, 224(CX), Z30, Z31 - VXORPD Z28, Z31, Z28 - VGF2P8AFFINEQB.BCST $0x00, 232(CX), Z30, Z31 - VXORPD Z29, Z31, Z29 + VMOVDQU (R14), Y11 + VMOVDQU (R15), Y12 + VMOVDQU (R13), Y13 + + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y14 + ADDQ $0x20, AX + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 // Store 3 outputs - VMOVDQU64 Z27, (R14) - ADDQ $0x40, R14 - VMOVDQU64 Z28, (R15) - ADDQ $0x40, R15 - VMOVDQU64 Z29, (R13) - ADDQ $0x40, R13 + VMOVDQU Y11, (R14) + ADDQ $0x20, R14 + VMOVDQU Y12, (R15) + ADDQ $0x20, R15 + VMOVDQU Y13, (R13) + ADDQ $0x20, R13 // Prepare for next loop DECQ BP - JNZ mulGFNI_10x3_64Xor_loop + JNZ mulAvxGFNI_10x3Xor_loop VZEROUPPER -mulGFNI_10x3_64Xor_end: +mulAvxGFNI_10x3Xor_end: RET // func mulGFNI_10x4_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) @@ -28759,6 +59186,220 @@ mulGFNI_10x4_64_loop: mulGFNI_10x4_64_end: RET +// func mulAvxGFNI_10x4(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x4(SB), $8-88 + // Loading 10 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x4_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ 
R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x4_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB 
$0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x4_loop + VZEROUPPER + +mulAvxGFNI_10x4_end: + RET + // func mulGFNI_10x4_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x4_64Xor(SB), $8-88 @@ -28973,6 +59614,234 @@ mulGFNI_10x4_64Xor_loop: mulGFNI_10x4_64Xor_end: RET +// func mulAvxGFNI_10x4Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x4Xor(SB), $8-88 + // Loading 10 of 40 tables to registers + // Destination kept on stack + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x4Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + VBROADCASTSD 72(CX), Y9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x4Xor_loop: + // Load 4 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y15 + 
VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 72(R14), BP + 
VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x4Xor_loop + VZEROUPPER + +mulAvxGFNI_10x4Xor_end: + RET + // func mulGFNI_10x5_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x5_64(SB), $8-88 @@ -29193,6 +60062,251 @@ mulGFNI_10x5_64_loop: mulGFNI_10x5_64_end: RET +// func mulAvxGFNI_10x5(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x5(SB), $8-88 + // Loading 9 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x5_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x5_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + 
VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x5_loop + VZEROUPPER + +mulAvxGFNI_10x5_end: + RET + // func mulGFNI_10x5_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x5_64Xor(SB), $8-88 @@ -29430,6 +60544,268 @@ 
mulGFNI_10x5_64Xor_loop: mulGFNI_10x5_64Xor_end: RET +// func mulAvxGFNI_10x5Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x5Xor(SB), $8-88 + // Loading 9 of 50 tables to registers + // Destination kept on stack + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x5Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + VBROADCASTSD 64(CX), Y8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x5Xor_loop: + // Load 5 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 
+ VXORPD Y10, Y15, Y10 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x5Xor_loop + VZEROUPPER + +mulAvxGFNI_10x5Xor_end: + RET + // func mulGFNI_10x6_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x6_64(SB), $8-88 @@ -29670,6 
+61046,282 @@ mulGFNI_10x6_64_loop: mulGFNI_10x6_64_end: RET +// func mulAvxGFNI_10x6(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x6(SB), $8-88 + // Loading 8 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x6_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x6_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 
208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 
24(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x6_loop + VZEROUPPER + +mulAvxGFNI_10x6_end: + RET + // func mulGFNI_10x6_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x6_64Xor(SB), $8-88 @@ -29930,6 +61582,302 @@ mulGFNI_10x6_64Xor_loop: mulGFNI_10x6_64Xor_end: RET +// func mulAvxGFNI_10x6Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x6Xor(SB), $8-88 + // Loading 8 of 60 tables to registers + // Destination kept on stack + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x6Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + VBROADCASTSD 56(CX), Y7 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x6Xor_loop: + // Load 6 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y7, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), 
Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, 
Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x6Xor_loop + VZEROUPPER + +mulAvxGFNI_10x6Xor_end: + RET + // func mulGFNI_10x7_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x7_64(SB), $8-88 @@ -30190,6 +62138,313 @@ mulGFNI_10x7_64_loop: mulGFNI_10x7_64_end: RET +// func mulAvxGFNI_10x7(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x7(SB), $8-88 + // Loading 7 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x7_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x7_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, 
Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x7_loop + VZEROUPPER + +mulAvxGFNI_10x7_end: + RET + // func mulGFNI_10x7_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x7_64Xor(SB), $8-88 @@ -30473,6 +62728,336 @@ mulGFNI_10x7_64Xor_loop: mulGFNI_10x7_64Xor_end: RET +// func mulAvxGFNI_10x7Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x7Xor(SB), $8-88 + // Loading 7 of 70 tables to registers + // Destination kept on stack + // Full registers estimated 79 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x7Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + VBROADCASTSD 
48(CX), Y6 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x7Xor_loop: + // Load 7 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y11, Y15, Y11 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y12, Y15, Y12 + VGF2P8AFFINEQB $0x00, Y6, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 7 
outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes 
from input 9 to 7 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x7Xor_loop + VZEROUPPER + +mulAvxGFNI_10x7Xor_end: + RET + // func mulGFNI_10x8_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x8_64(SB), $8-88 @@ -30753,6 +63338,344 @@ mulGFNI_10x8_64_loop: mulGFNI_10x8_64_end: RET +// func mulAvxGFNI_10x8(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x8(SB), $8-88 + // Loading 6 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x8_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x8_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y11 + VBROADCASTSD 48(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 56(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, 
Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x8_loop + VZEROUPPER + +mulAvxGFNI_10x8_end: + RET + // func mulGFNI_10x8_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n 
int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x8_64Xor(SB), $8-88 @@ -31059,6 +63982,370 @@ mulGFNI_10x8_64Xor_loop: mulGFNI_10x8_64Xor_end: RET +// func mulAvxGFNI_10x8Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x8Xor(SB), $8-88 + // Loading 6 of 80 tables to registers + // Destination kept on stack + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x8Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + VBROADCASTSD 40(CX), Y5 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x8Xor_loop: + // Load 8 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y9, Y15, Y9 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y10, Y15, Y10 + VGF2P8AFFINEQB $0x00, Y5, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 
168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 
+ VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x8Xor_loop + VZEROUPPER + +mulAvxGFNI_10x8Xor_end: + RET + // func mulGFNI_10x9_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x9_64(SB), $8-88 @@ -31359,6 +64646,375 @@ mulGFNI_10x9_64_loop: mulGFNI_10x9_64_end: RET +// func mulAvxGFNI_10x9(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x9(SB), $8-88 + // Loading 5 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x9_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), 
Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x9_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y9 + VBROADCASTSD 40(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 48(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 56(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 64(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + 
VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x9_loop + VZEROUPPER + +mulAvxGFNI_10x9_end: + RET + // func mulGFNI_10x9_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x9_64Xor(SB), $8-88 @@ -31688,6 +65344,404 @@ mulGFNI_10x9_64Xor_loop: mulGFNI_10x9_64Xor_end: RET +// func mulAvxGFNI_10x9Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x9Xor(SB), $8-88 + // Loading 5 of 90 tables to registers + // Destination kept on stack + // Full registers estimated 101 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x9Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + VBROADCASTSD 32(CX), Y4 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset 
to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x9Xor_loop: + // Load 9 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y5 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y7, Y15, Y7 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y8, Y15, Y8 + VGF2P8AFFINEQB $0x00, Y4, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y8, Y15, Y8 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y8, Y15, Y8 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x9Xor_loop + VZEROUPPER + +mulAvxGFNI_10x9Xor_end: + RET + // func mulGFNI_10x10_64(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x10_64(SB), $8-88 @@ -32008,6 +66062,406 @@ mulGFNI_10x10_64_loop: mulGFNI_10x10_64_end: RET +// func mulAvxGFNI_10x10(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x10(SB), $8-88 + // Loading 4 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x10_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), 
BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x10_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y7 + VBROADCASTSD 32(CX), Y8 + VGF2P8AFFINEQB $0x00, Y8, Y14, Y8 + VBROADCASTSD 40(CX), Y9 + VGF2P8AFFINEQB $0x00, Y9, Y14, Y9 + VBROADCASTSD 48(CX), Y10 + VGF2P8AFFINEQB $0x00, Y10, Y14, Y10 + VBROADCASTSD 56(CX), Y11 + VGF2P8AFFINEQB $0x00, Y11, Y14, Y11 + VBROADCASTSD 64(CX), Y12 + VGF2P8AFFINEQB $0x00, Y12, Y14, Y12 + VBROADCASTSD 72(CX), Y13 + VGF2P8AFFINEQB $0x00, Y13, Y14, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + 
VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, 
Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 720(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 728(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 736(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 744(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 752(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 760(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 768(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 776(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 784(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 792(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x10_loop + VZEROUPPER + +mulAvxGFNI_10x10_end: + RET + // func mulGFNI_10x10_64Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·mulGFNI_10x10_64Xor(SB), $8-88 @@ 
-32360,6 +66814,438 @@ mulGFNI_10x10_64Xor_loop: mulGFNI_10x10_64Xor_end: RET +// func mulAvxGFNI_10x10Xor(matrix []uint64, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, GFNI +TEXT ·mulAvxGFNI_10x10Xor(SB), $8-88 + // Loading 4 of 100 tables to registers + // Destination kept on stack + // Full registers estimated 112 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxGFNI_10x10Xor_end + VBROADCASTSD (CX), Y0 + VBROADCASTSD 8(CX), Y1 + VBROADCASTSD 16(CX), Y2 + VBROADCASTSD 24(CX), Y3 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + +mulAvxGFNI_10x10Xor_loop: + // Load 10 outputs + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y4 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y5 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y6 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y7 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y8 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y9 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y10 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y11 + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y12 + MOVQ 216(R14), BP + VMOVDQU (BP)(R15*1), Y13 + + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VGF2P8AFFINEQB $0x00, Y0, Y14, Y15 + VXORPD Y4, Y15, Y4 + VGF2P8AFFINEQB $0x00, Y1, Y14, Y15 + VXORPD Y5, Y15, Y5 + VGF2P8AFFINEQB $0x00, Y2, Y14, Y15 + VXORPD Y6, Y15, Y6 + VGF2P8AFFINEQB $0x00, Y3, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 32(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 40(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 48(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 56(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 64(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 72(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VBROADCASTSD 80(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 88(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 96(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 104(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 112(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 120(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 128(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 136(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 144(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 152(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VBROADCASTSD 160(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 
+ VBROADCASTSD 168(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 176(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 184(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 192(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 200(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 208(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 216(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 224(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 232(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VBROADCASTSD 240(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 248(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 256(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 264(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 272(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 280(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 288(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 296(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 304(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 312(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y14 + ADDQ $0x20, R9 + VBROADCASTSD 320(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 328(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 336(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 344(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 352(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 360(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 368(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 376(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 384(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 392(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y14 + ADDQ $0x20, R10 + VBROADCASTSD 400(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 408(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 416(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 424(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 432(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 440(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 448(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 456(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 
464(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 472(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y14 + ADDQ $0x20, R11 + VBROADCASTSD 480(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 488(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 496(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 504(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 512(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 520(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 528(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 536(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 544(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 552(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y14 + ADDQ $0x20, R12 + VBROADCASTSD 560(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 568(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 576(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 584(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 592(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 600(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 608(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 616(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 624(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 632(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y14 + ADDQ $0x20, R13 + VBROADCASTSD 640(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 648(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 656(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 664(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 672(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 680(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 688(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 696(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 704(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 712(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VBROADCASTSD 720(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y4, Y15, Y4 + VBROADCASTSD 728(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y5, Y15, Y5 + VBROADCASTSD 736(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y6, Y15, Y6 + VBROADCASTSD 744(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y7, Y15, Y7 + VBROADCASTSD 
752(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y8, Y15, Y8 + VBROADCASTSD 760(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y9, Y15, Y9 + VBROADCASTSD 768(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y10, Y15, Y10 + VBROADCASTSD 776(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y11, Y15, Y11 + VBROADCASTSD 784(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y12, Y15, Y12 + VBROADCASTSD 792(CX), Y15 + VGF2P8AFFINEQB $0x00, Y15, Y14, Y15 + VXORPD Y13, Y15, Y13 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y9, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y10, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y11, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y12, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y13, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxGFNI_10x10Xor_loop + VZEROUPPER + +mulAvxGFNI_10x10Xor_end: + RET + // func ifftDIT48_gfni_0(work [][]byte, dist int, t01 uint64, t23 uint64, t02 uint64) // Requires: AVX, AVX512DQ, AVX512F, GFNI TEXT ·ifftDIT48_gfni_0(SB), NOSPLIT, $0-56 diff --git a/galois_gen_switch_amd64.go b/galois_gen_switch_amd64.go index 28c50658..429e2c20 100644 --- a/galois_gen_switch_amd64.go +++ b/galois_gen_switch_amd64.go @@ -14,687 +14,686 @@ const ( maxAvx2Inputs = 10 maxAvx2Outputs = 10 minAvx2Size = 64 - avxSizeMask = maxInt - (minAvx2Size - 1) ) func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { - n := (stop - start) & avxSizeMask + n := stop - start switch len(in) { case 1: switch len(out) { case 1: mulAvxTwo_1x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_1x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_1x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_1x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_1x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_1x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_1x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_1x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_1x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_1x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 2: switch len(out) { case 1: mulAvxTwo_2x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_2x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_2x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_2x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_2x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_2x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_2x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_2x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_2x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_2x10(matrix, in, out, start, n) - return n + return n & 
(maxInt - 31) } case 3: switch len(out) { case 1: mulAvxTwo_3x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_3x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_3x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_3x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_3x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_3x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_3x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_3x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_3x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_3x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 4: switch len(out) { case 1: mulAvxTwo_4x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_4x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_4x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_4x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_4x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_4x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_4x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_4x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_4x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_4x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 5: switch len(out) { case 1: mulAvxTwo_5x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_5x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_5x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_5x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_5x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_5x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_5x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_5x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_5x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_5x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 6: switch len(out) { case 1: mulAvxTwo_6x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_6x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_6x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_6x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_6x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_6x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_6x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_6x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_6x9(matrix, in, out, start, n) - return n + return n & (maxInt 
- 31) case 10: mulAvxTwo_6x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 7: switch len(out) { case 1: mulAvxTwo_7x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_7x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_7x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_7x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_7x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_7x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_7x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_7x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_7x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_7x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 8: switch len(out) { case 1: mulAvxTwo_8x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_8x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_8x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_8x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_8x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_8x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_8x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_8x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_8x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_8x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 9: switch len(out) { case 1: mulAvxTwo_9x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_9x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_9x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_9x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_9x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_9x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_9x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_9x8(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_9x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_9x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 10: switch len(out) { case 1: mulAvxTwo_10x1_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_10x2_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_10x3_64(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_10x4(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_10x5(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_10x6(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_10x7(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_10x8(matrix, in, out, start, n) - return n + return n & 
(maxInt - 31) case 9: mulAvxTwo_10x9(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_10x10(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } } panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { - n := (stop - start) & avxSizeMask + n := (stop - start) switch len(in) { case 1: switch len(out) { case 1: mulAvxTwo_1x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_1x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_1x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_1x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_1x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_1x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_1x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_1x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_1x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_1x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 2: switch len(out) { case 1: mulAvxTwo_2x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_2x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_2x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_2x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_2x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_2x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_2x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_2x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_2x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_2x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 3: switch len(out) { case 1: mulAvxTwo_3x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_3x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_3x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_3x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_3x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_3x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_3x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_3x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_3x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_3x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 4: switch len(out) { case 1: mulAvxTwo_4x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_4x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_4x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: 
mulAvxTwo_4x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_4x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_4x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_4x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_4x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_4x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_4x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 5: switch len(out) { case 1: mulAvxTwo_5x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_5x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_5x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_5x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_5x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_5x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_5x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_5x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_5x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_5x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 6: switch len(out) { case 1: mulAvxTwo_6x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_6x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_6x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_6x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_6x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_6x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_6x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_6x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_6x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_6x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 7: switch len(out) { case 1: mulAvxTwo_7x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_7x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_7x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_7x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_7x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_7x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_7x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_7x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_7x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_7x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 8: switch len(out) { case 1: mulAvxTwo_8x1_64Xor(matrix, in, out, start, n) - return n + return n & 
(maxInt - 63) case 2: mulAvxTwo_8x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_8x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_8x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_8x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_8x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_8x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_8x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_8x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_8x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 9: switch len(out) { case 1: mulAvxTwo_9x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_9x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_9x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_9x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_9x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_9x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_9x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_9x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_9x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_9x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } case 10: switch len(out) { case 1: mulAvxTwo_10x1_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 2: mulAvxTwo_10x2_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 3: mulAvxTwo_10x3_64Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 63) case 4: mulAvxTwo_10x4Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 5: mulAvxTwo_10x5Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 6: mulAvxTwo_10x6Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 7: mulAvxTwo_10x7Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 8: mulAvxTwo_10x8Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 9: mulAvxTwo_10x9Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) case 10: mulAvxTwo_10x10Xor(matrix, in, out, start, n) - return n + return n & (maxInt - 31) } } panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { - n := (stop - start) & avxSizeMask + n := (stop - start) & (maxInt - (64 - 1)) switch len(in) { case 1: @@ -1032,7 +1031,7 @@ func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { } func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { - n := (stop - start) & avxSizeMask + n := (stop - start) & (maxInt - (64 - 1)) switch len(in) { case 1: @@ -1368,3 +1367,679 @@ func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int } panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } + +func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & 
(maxInt - (32 - 1)) + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvxGFNI_1x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_1x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_1x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_1x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_1x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_1x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_1x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_1x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_1x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_1x10(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvxGFNI_2x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_2x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_2x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_2x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_2x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_2x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_2x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_2x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_2x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_2x10(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvxGFNI_3x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_3x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_3x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_3x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_3x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_3x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_3x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_3x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_3x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_3x10(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvxGFNI_4x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_4x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_4x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_4x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_4x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_4x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_4x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_4x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_4x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_4x10(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvxGFNI_5x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_5x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_5x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_5x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_5x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_5x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_5x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_5x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_5x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_5x10(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvxGFNI_6x1(matrix, in, out, start, n) 
+ return n + case 2: + mulAvxGFNI_6x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_6x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_6x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_6x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_6x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_6x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_6x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_6x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_6x10(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvxGFNI_7x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_7x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_7x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_7x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_7x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_7x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_7x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_7x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_7x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_7x10(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvxGFNI_8x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_8x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_8x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_8x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_8x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_8x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_8x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_8x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_8x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_8x10(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvxGFNI_9x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_9x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_9x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_9x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_9x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_9x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_9x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_9x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_9x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_9x10(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvxGFNI_10x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_10x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_10x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_10x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_10x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_10x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_10x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_10x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_10x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_10x10(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - 
(32 - 1)) + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvxGFNI_1x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_1x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_1x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_1x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_1x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_1x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_1x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_1x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_1x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_1x10Xor(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvxGFNI_2x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_2x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_2x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_2x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_2x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_2x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_2x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_2x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_2x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_2x10Xor(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvxGFNI_3x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_3x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_3x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_3x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_3x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_3x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_3x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_3x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_3x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_3x10Xor(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvxGFNI_4x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_4x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_4x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_4x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_4x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_4x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_4x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_4x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_4x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_4x10Xor(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvxGFNI_5x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_5x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_5x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_5x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_5x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_5x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_5x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_5x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_5x9Xor(matrix, in, out, start, n) + return n + case 10: + 
mulAvxGFNI_5x10Xor(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvxGFNI_6x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_6x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_6x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_6x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_6x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_6x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_6x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_6x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_6x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_6x10Xor(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvxGFNI_7x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_7x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_7x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_7x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_7x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_7x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_7x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_7x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_7x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_7x10Xor(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvxGFNI_8x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_8x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_8x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_8x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_8x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_8x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_8x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_8x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_8x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_8x10Xor(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvxGFNI_9x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_9x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_9x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_9x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_9x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_9x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_9x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_9x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_9x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_9x10Xor(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvxGFNI_10x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_10x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_10x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_10x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_10x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_10x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_10x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_10x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_10x9Xor(matrix, in, 
out, start, n) + return n + case 10: + mulAvxGFNI_10x10Xor(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} diff --git a/galois_gen_switch_nopshufb_amd64.go b/galois_gen_switch_nopshufb_amd64.go index 888df307..1ba08b5e 100644 --- a/galois_gen_switch_nopshufb_amd64.go +++ b/galois_gen_switch_nopshufb_amd64.go @@ -14,14 +14,13 @@ const ( maxAvx2Inputs = 10 maxAvx2Outputs = 10 minAvx2Size = 64 - avxSizeMask = maxInt - (minAvx2Size - 1) ) func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`) } func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { panic(`no pshufb`) } func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { - n := (stop - start) & avxSizeMask + n := (stop - start) & (maxInt - (64 - 1)) switch len(in) { case 1: @@ -359,7 +358,7 @@ func galMulSlicesGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { } func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { - n := (stop - start) & avxSizeMask + n := (stop - start) & (maxInt - (64 - 1)) switch len(in) { case 1: @@ -695,3 +694,679 @@ func galMulSlicesGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int } panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } + +func galMulSlicesAvxGFNI(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (32 - 1)) + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvxGFNI_1x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_1x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_1x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_1x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_1x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_1x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_1x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_1x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_1x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_1x10(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvxGFNI_2x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_2x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_2x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_2x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_2x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_2x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_2x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_2x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_2x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_2x10(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvxGFNI_3x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_3x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_3x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_3x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_3x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_3x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_3x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_3x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_3x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_3x10(matrix, in, out, start, n) + return 
n + } + case 4: + switch len(out) { + case 1: + mulAvxGFNI_4x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_4x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_4x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_4x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_4x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_4x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_4x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_4x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_4x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_4x10(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvxGFNI_5x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_5x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_5x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_5x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_5x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_5x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_5x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_5x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_5x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_5x10(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvxGFNI_6x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_6x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_6x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_6x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_6x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_6x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_6x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_6x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_6x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_6x10(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvxGFNI_7x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_7x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_7x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_7x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_7x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_7x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_7x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_7x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_7x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_7x10(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvxGFNI_8x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_8x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_8x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_8x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_8x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_8x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_8x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_8x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_8x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_8x10(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvxGFNI_9x1(matrix, in, out, start, n) + return n + case 2: + 
mulAvxGFNI_9x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_9x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_9x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_9x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_9x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_9x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_9x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_9x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_9x10(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvxGFNI_10x1(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_10x2(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_10x3(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_10x4(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_10x5(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_10x6(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_10x7(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_10x8(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_10x9(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_10x10(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} + +func galMulSlicesAvxGFNIXor(matrix []uint64, in, out [][]byte, start, stop int) int { + n := (stop - start) & (maxInt - (32 - 1)) + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvxGFNI_1x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_1x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_1x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_1x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_1x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_1x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_1x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_1x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_1x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_1x10Xor(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvxGFNI_2x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_2x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_2x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_2x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_2x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_2x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_2x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_2x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_2x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_2x10Xor(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvxGFNI_3x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_3x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_3x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_3x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_3x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_3x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_3x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_3x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_3x9Xor(matrix, in, out, 
start, n) + return n + case 10: + mulAvxGFNI_3x10Xor(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvxGFNI_4x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_4x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_4x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_4x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_4x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_4x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_4x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_4x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_4x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_4x10Xor(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvxGFNI_5x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_5x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_5x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_5x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_5x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_5x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_5x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_5x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_5x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_5x10Xor(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvxGFNI_6x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_6x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_6x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_6x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_6x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_6x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_6x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_6x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_6x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_6x10Xor(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvxGFNI_7x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_7x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_7x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_7x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_7x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_7x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_7x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_7x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_7x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_7x10Xor(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvxGFNI_8x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_8x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_8x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_8x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_8x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_8x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_8x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_8x8Xor(matrix, in, out, start, n) + return n + case 9: + 
mulAvxGFNI_8x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_8x10Xor(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvxGFNI_9x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_9x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_9x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_9x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_9x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_9x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_9x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_9x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_9x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_9x10Xor(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvxGFNI_10x1Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxGFNI_10x2Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxGFNI_10x3Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxGFNI_10x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxGFNI_10x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxGFNI_10x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxGFNI_10x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxGFNI_10x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxGFNI_10x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxGFNI_10x10Xor(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} diff --git a/options.go b/options.go index f74fe00f..73cc7d6d 100644 --- a/options.go +++ b/options.go @@ -2,6 +2,7 @@ package reedsolomon import ( "runtime" + "strings" "github.com/klauspost/cpuid/v2" ) @@ -15,15 +16,21 @@ type options struct { shardSize int perRound int - useGFNI, useAVX512, useAVX2, useSSSE3, useSSE2 bool - useJerasureMatrix bool - usePAR1Matrix bool - useCauchy bool - fastOneParity bool - inversionCache bool - forcedInversionCache bool - customMatrix [][]byte - withLeopard leopardMode + useAvxGNFI, + useAvx512GFNI, + useAVX512, + useAVX2, + useSSSE3, + useSSE2 bool + + useJerasureMatrix bool + usePAR1Matrix bool + useCauchy bool + fastOneParity bool + inversionCache bool + forcedInversionCache bool + customMatrix [][]byte + withLeopard leopardMode // stream options concReads bool @@ -38,11 +45,12 @@ var defaultOptions = options{ inversionCache: true, // Detect CPU capabilities. - useSSSE3: cpuid.CPU.Supports(cpuid.SSSE3), - useSSE2: cpuid.CPU.Supports(cpuid.SSE2), - useAVX2: cpuid.CPU.Supports(cpuid.AVX2), - useAVX512: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512BW, cpuid.AVX512VL), - useGFNI: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.GFNI, cpuid.AVX512DQ), + useSSSE3: cpuid.CPU.Supports(cpuid.SSSE3), + useSSE2: cpuid.CPU.Supports(cpuid.SSE2), + useAVX2: cpuid.CPU.Supports(cpuid.AVX2), + useAVX512: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512BW, cpuid.AVX512VL), + useAvx512GFNI: cpuid.CPU.Supports(cpuid.AVX512F, cpuid.GFNI, cpuid.AVX512DQ), + useAvxGNFI: cpuid.CPU.Supports(cpuid.AVX, cpuid.GFNI), } // leopardMode controls the use of leopard GF in encoding and decoding. @@ -159,10 +167,14 @@ func WithSSSE3(enabled bool) Option { } // WithAVX2 allows to enable/disable AVX2 instructions. -// If not set, AVX2 will be turned on or off automatically based on CPU ID information. 
+// If not set, AVX will be turned on or off automatically based on CPU ID information. +// This will also disable AVX GFNI instructions. func WithAVX2(enabled bool) Option { return func(o *options) { o.useAVX2 = enabled + if o.useAvxGNFI { + o.useAvxGNFI = enabled + } } } @@ -178,7 +190,7 @@ func WithSSE2(enabled bool) Option { func WithAVX512(enabled bool) Option { return func(o *options) { o.useAVX512 = enabled - o.useGFNI = enabled + o.useAvx512GFNI = enabled } } @@ -186,7 +198,15 @@ func WithAVX512(enabled bool) Option { // If not set, GFNI will be turned on or off automatically based on CPU ID information. func WithGFNI(enabled bool) Option { return func(o *options) { - o.useGFNI = enabled + o.useAvx512GFNI = enabled + } +} + +// WithAVXGFNI allows to enable/disable GFNI with AVX instructions. +// If not set, GFNI will be turned on or off automatically based on CPU ID information. +func WithAVXGFNI(enabled bool) Option { + return func(o *options) { + o.useAvxGNFI = enabled } } @@ -275,3 +295,29 @@ func WithLeopardGF(enabled bool) Option { } } } + +func (o *options) cpuOptions() string { + var res []string + if o.useSSE2 { + res = append(res, "SSE2") + } + if o.useAVX2 { + res = append(res, "AVX2") + } + if o.useSSSE3 { + res = append(res, "SSSE3") + } + if o.useAVX512 { + res = append(res, "AVX512") + } + if o.useAvx512GFNI { + res = append(res, "AVX512+GFNI") + } + if o.useAvxGNFI { + res = append(res, "AVX+GFNI") + } + if len(res) == 0 { + return "pure Go" + } + return strings.Join(res, ",") +} diff --git a/reedsolomon.go b/reedsolomon.go index 55b56650..bebba044 100644 --- a/reedsolomon.go +++ b/reedsolomon.go @@ -653,12 +653,12 @@ func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) erro return ErrShardSize } - if avx2CodeGen && len(dataShard) >= r.o.perRound && len(parity) >= avx2CodeGenMinShards && ((pshufb && r.o.useAVX2) || r.o.useGFNI) { + if avx2CodeGen && len(dataShard) >= r.o.perRound && len(parity) >= avx2CodeGenMinShards && ((pshufb && r.o.useAVX2) || r.o.useAvx512GFNI || r.o.useAvxGNFI) { m := make([][]byte, r.parityShards) for iRow := range m { m[iRow] = r.parity[iRow][idx : idx+1] } - if r.o.useGFNI { + if r.o.useAvx512GFNI || r.o.useAvxGNFI { r.codeSomeShardsGFNI(m, [][]byte{dataShard}, parity, len(dataShard), false) } else { r.codeSomeShardsAVXP(m, [][]byte{dataShard}, parity, len(dataShard), false) @@ -810,7 +810,7 @@ func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool { } func (r *reedSolomon) canGFNI(byteCount int, inputs, outputs int) bool { - return avx2CodeGen && r.o.useGFNI && + return avx2CodeGen && (r.o.useAvx512GFNI || r.o.useAvxGNFI) && byteCount >= avx2CodeGenMinSize && inputs+outputs >= avx2CodeGenMinShards && inputs <= maxAvx2Inputs && outputs <= maxAvx2Outputs } @@ -841,7 +841,11 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteC if r.canGFNI(byteCount, len(inputs), len(outputs)) { var gfni [maxAvx2Inputs * maxAvx2Outputs]uint64 m := genGFNIMatrix(matrixRows, len(inputs), 0, len(outputs), gfni[:]) - start += galMulSlicesGFNI(m, inputs, outputs, 0, byteCount) + if r.o.useAvx512GFNI { + start += galMulSlicesGFNI(m, inputs, outputs, 0, byteCount) + } else { + start += galMulSlicesAvxGFNI(m, inputs, outputs, 0, byteCount) + } end = len(inputs[0]) } else if r.canAVX2C(byteCount, len(inputs), len(outputs)) { m := genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice()) @@ -867,22 +871,28 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, 
byteC if len(outPer) > maxAvx2Outputs { outPer = outPer[:maxAvx2Outputs] } - if r.o.useGFNI { + if r.o.useAvx512GFNI { m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:]) if inIdx == 0 { - galMulSlicesGFNI(m, inPer, outPer, 0, byteCount) + start = galMulSlicesGFNI(m, inPer, outPer, 0, byteCount) } else { - galMulSlicesGFNIXor(m, inPer, outPer, 0, byteCount) + start = galMulSlicesGFNIXor(m, inPer, outPer, 0, byteCount) + } + } else if r.o.useAvxGNFI { + m := genGFNIMatrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), gfni[:]) + if inIdx == 0 { + start = galMulSlicesAvxGFNI(m, inPer, outPer, 0, byteCount) + } else { + start = galMulSlicesAvxGFNIXor(m, inPer, outPer, 0, byteCount) } } else { m = genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m) if inIdx == 0 { - galMulSlicesAvx2(m, inPer, outPer, 0, byteCount) + start = galMulSlicesAvx2(m, inPer, outPer, 0, byteCount) } else { - galMulSlicesAvx2Xor(m, inPer, outPer, 0, byteCount) + start = galMulSlicesAvx2Xor(m, inPer, outPer, 0, byteCount) } } - start = byteCount & avxSizeMask outIdx += len(outPer) outs = outs[len(outPer):] } @@ -928,7 +938,7 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte } else if useAvx2 { avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.getTmpSlice()) defer r.putTmpSlice(avx2Matrix) - } else if r.o.useGFNI && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards && + } else if (r.o.useAvx512GFNI || r.o.useAvxGNFI) && byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards && r.canGFNI(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) { // It appears there is a switchover point at around 10MB where // Regular processing is faster... @@ -950,7 +960,11 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byte exec := func(start, stop int) { if stop-start >= 64 { if useGFNI { - start += galMulSlicesGFNI(gfniMatrix, inputs, outputs, start, stop) + if r.o.useAvx512GFNI { + start += galMulSlicesGFNI(gfniMatrix, inputs, outputs, start, stop) + } else { + start += galMulSlicesAvxGFNI(gfniMatrix, inputs, outputs, start, stop) + } } else if useAvx2 { start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop) } @@ -1099,14 +1113,15 @@ func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, b for lstart < stop { if lstop-lstart >= minAvx2Size { // Execute plan... + var n int for _, p := range plan { if p.first { - galMulSlicesAvx2(p.m, p.input, p.output, lstart, lstop) + n = galMulSlicesAvx2(p.m, p.input, p.output, lstart, lstop) } else { - galMulSlicesAvx2Xor(p.m, p.input, p.output, lstart, lstop) + n = galMulSlicesAvx2Xor(p.m, p.input, p.output, lstart, lstop) } } - lstart += (lstop - lstart) & avxSizeMask + lstart += n if lstart == lstop { lstop += r.o.perRound if lstop > stop { @@ -1248,14 +1263,25 @@ func (r *reedSolomon) codeSomeShardsGFNI(matrixRows, inputs, outputs [][]byte, b for lstart < stop { if lstop-lstart >= minAvx2Size { // Execute plan... 
-			for _, p := range plan {
-				if p.first {
-					galMulSlicesGFNI(p.m, p.input, p.output, lstart, lstop)
-				} else {
-					galMulSlicesGFNIXor(p.m, p.input, p.output, lstart, lstop)
+			var n int
+			if r.o.useAvx512GFNI {
+				for _, p := range plan {
+					if p.first {
+						n = galMulSlicesGFNI(p.m, p.input, p.output, lstart, lstop)
+					} else {
+						n = galMulSlicesGFNIXor(p.m, p.input, p.output, lstart, lstop)
+					}
+				}
+			} else {
+				for _, p := range plan {
+					if p.first {
+						n = galMulSlicesAvxGFNI(p.m, p.input, p.output, lstart, lstop)
+					} else {
+						n = galMulSlicesAvxGFNIXor(p.m, p.input, p.output, lstart, lstop)
+					}
 				}
 			}
-			lstart += (lstop - lstart) & avxSizeMask
+			lstart += n
 			if lstart == lstop {
 				lstop += r.o.perRound
 				if lstop > stop {
diff --git a/reedsolomon_test.go b/reedsolomon_test.go
index a1745a35..b852f002 100644
--- a/reedsolomon_test.go
+++ b/reedsolomon_test.go
@@ -24,10 +24,17 @@ var noSSE2 = flag.Bool("no-sse2", !defaultOptions.useSSE2, "Disable SSE2")
 var noSSSE3 = flag.Bool("no-ssse3", !defaultOptions.useSSSE3, "Disable SSSE3")
 var noAVX2 = flag.Bool("no-avx2", !defaultOptions.useAVX2, "Disable AVX2")
 var noAVX512 = flag.Bool("no-avx512", !defaultOptions.useAVX512, "Disable AVX512")
-var noGNFI = flag.Bool("no-gfni", !defaultOptions.useGFNI, "Disable AVX512+GFNI")
+var noGNFI = flag.Bool("no-gfni", !defaultOptions.useAvx512GFNI, "Disable AVX512+GFNI")
+var noAVX2GNFI = flag.Bool("no-avx-gfni", !defaultOptions.useAvxGNFI, "Disable AVX+GFNI")
 
 func TestMain(m *testing.M) {
 	flag.Parse()
+	rs, _ := New(10, 3, testOptions()...)
+	if rs != nil {
+		if rst, ok := rs.(*reedSolomon); ok {
+			fmt.Println("Using", rst.o.cpuOptions())
+		}
+	}
 	os.Exit(m.Run())
 }
 
@@ -48,6 +55,9 @@ func testOptions(o ...Option) []Option {
 	if *noGNFI {
 		o = append(o, WithGFNI(false))
 	}
+	if *noAVX2GNFI {
+		o = append(o, WithAVXGFNI(false))
+	}
 	return o
 }
 
@@ -204,7 +214,7 @@ func testOpts() [][]Option {
 			n = append(n, WithAVX512(true))
 			opts = append(opts, n)
 		}
-		if defaultOptions.useGFNI {
+		if defaultOptions.useAvx512GFNI {
 			n := make([]Option, len(o), len(o)+1)
 			copy(n, o)
 			n = append(n, WithGFNI(false))
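
Note on the size masks seen throughout this diff: each kernel now reports the byte count it actually handled instead of relying on a single caller-side avxSizeMask. The _64 AVX2 kernels consume 64 bytes per iteration and so return n & (maxInt - 63), while the plain AVX2 kernels and the new mulAvxGFNI_* kernels work on 32-byte YMM blocks, hence the maxInt - 31 masks. The standalone sketch below is not part of the diff; it only illustrates the round-down arithmetic, with the unaligned tail left for the generic fallback, as the updated callers do via start += ... / lstart += n. The maxInt definition here is a plausible stand-in for the package constant.

package main

import "fmt"

// Plausible definition of the package-level maxInt assumed by the masks above.
const maxInt = int(^uint(0) >> 1)

func main() {
	start, stop := 0, 1000
	n32 := (stop - start) & (maxInt - (32 - 1)) // 992: largest multiple of 32, covered by mulAvxGFNI_* kernels
	n64 := (stop - start) & (maxInt - (64 - 1)) // 960: largest multiple of 64, covered by the _64 kernels
	fmt.Println(n32, n64)                       // the remaining 8 / 40 bytes fall through to the generic GF(2^8) code
}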
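
The capability split in options.go is the core of the change: AVX512+GFNI keeps the existing 64-byte mulGFNI_* kernels, while plain AVX+GFNI selects the new 32-byte mulAvxGFNI_* kernels. A minimal, illustrative sketch of the same cpuid checks follows, using the github.com/klauspost/cpuid/v2 dependency the package already imports; the printed strings are mine, not the library's.

package main

import (
	"fmt"

	"github.com/klauspost/cpuid/v2"
)

func main() {
	// Mirrors defaultOptions: AVX512 GFNI requires AVX512F and AVX512DQ in addition to GFNI;
	// the new AVX path only requires AVX and GFNI.
	avx512GFNI := cpuid.CPU.Supports(cpuid.AVX512F, cpuid.GFNI, cpuid.AVX512DQ)
	avxGFNI := cpuid.CPU.Supports(cpuid.AVX, cpuid.GFNI)

	switch {
	case avx512GFNI:
		fmt.Println("64-byte GFNI kernels (mulGFNI_*) available")
	case avxGFNI:
		fmt.Println("32-byte AVX GFNI kernels (mulAvxGFNI_*) available")
	default:
		fmt.Println("no GFNI kernels; AVX2/SSSE3/generic paths apply")
	}
}

On a CPU where every set is detected, the new cpuOptions() helper joins them in declaration order, e.g. "SSE2,AVX2,SSSE3,AVX512,AVX512+GFNI,AVX+GFNI".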
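
For completeness, a hedged usage sketch for the new WithAVXGFNI option from user code; the shard counts are arbitrary and whether disabling the path helps is workload-dependent.

package main

import (
	"fmt"

	"github.com/klauspost/reedsolomon"
)

func main() {
	// Keep the AVX2/AVX512 paths but opt out of the new AVX+GFNI kernels,
	// e.g. to compare the two code paths when benchmarking.
	enc, err := reedsolomon.New(10, 3, reedsolomon.WithAVXGFNI(false))
	if err != nil {
		panic(err)
	}
	fmt.Printf("encoder ready: %T\n", enc)
}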