1 | 1 | /* |
2 | | - * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. |
| 2 | + * Copyright (c) 2025, 2026, Oracle and/or its affiliates. All rights reserved. |
3 | 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
4 | 4 | * |
5 | 5 | * This code is free software; you can redistribute it and/or modify it |
@@ -64,6 +64,39 @@ static address kyberAvx512ConstsAddr(int offset) { |
64 | 64 |
65 | 65 | const Register scratch = r10; |
66 | 66 |
| 67 | +ATTRIBUTE_ALIGNED(64) static const uint8_t kyberAvx512_12To16Dup[] = { |
| 68 | +// 0 - 63 |
| 69 | + 0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16, |
| 70 | + 16, 17, 18, 19, 19, 20, 21, 22, 22, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30, |
| 71 | + 31, 31, 32, 33, 34, 34, 35, 36, 37, 37, 38, 39, 40, 40, 41, 42, 43, 43, 44, |
| 72 | + 45, 46, 46, 47 |
| 73 | + }; |
| 74 | + |
| 75 | +static address kyberAvx512_12To16DupAddr() { |
| 76 | + return (address) kyberAvx512_12To16Dup; |
| 77 | +} |
| 78 | + |
| 79 | +ATTRIBUTE_ALIGNED(64) static const uint16_t kyberAvx512_12To16Shift[] = { |
| 80 | +// 0 - 31 |
| 81 | + 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, |
| 82 | + 4, 0, 4, 0, 4, 0, 4 |
| 83 | + }; |
| 84 | + |
| 85 | +static address kyberAvx512_12To16ShiftAddr() { |
| 86 | + return (address) kyberAvx512_12To16Shift; |
| 87 | +} |
| 88 | + |
| 89 | +ATTRIBUTE_ALIGNED(64) static const uint64_t kyberAvx512_12To16And[] = { |
| 90 | +// 0 - 7 |
| 91 | + 0x0FFF0FFF0FFF0FFF, 0x0FFF0FFF0FFF0FFF, 0x0FFF0FFF0FFF0FFF, |
| 92 | + 0x0FFF0FFF0FFF0FFF, 0x0FFF0FFF0FFF0FFF, 0x0FFF0FFF0FFF0FFF, |
| 93 | + 0x0FFF0FFF0FFF0FFF, 0x0FFF0FFF0FFF0FFF |
| 94 | + }; |
| 95 | + |
| 96 | +static address kyberAvx512_12To16AndAddr() { |
| 97 | + return (address) kyberAvx512_12To16And; |
| 98 | +} |
| 99 | + |
67 | 100 | ATTRIBUTE_ALIGNED(64) static const uint16_t kyberAvx512NttPerms[] = { |
68 | 101 | // 0 |
69 | 102 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, |
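
The three tables added above drive a permute/shift/mask decode of Kyber's packed 12-bit coefficients: each 48-byte chunk of condensed data holds 32 coefficients; the Dup table names, for every 16-bit output lane, the two source bytes that contain that lane's 12 bits; the Shift table moves odd lanes right by 4 so the coefficient starts at bit 0; the And table clears the top 4 bits. A minimal scalar model of one 48-byte chunk, for illustration only (kyber_12to16_scalar and its signature are not part of the patch):

#include <stdint.h>
#include <stddef.h>

// Expand 48 condensed bytes (32 packed 12-bit values) into 32 uint16_t
// coefficients the same way the vector path does: gather two bytes per lane
// (Dup), shift right by 0 or 4 (Shift), then mask with 0x0FFF (And).
static void kyber_12to16_scalar(const uint8_t* condensed, uint16_t* parsed) {
  static const uint8_t dup[64] = {
    0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16,
    16, 17, 18, 19, 19, 20, 21, 22, 22, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30,
    31, 31, 32, 33, 34, 34, 35, 36, 37, 37, 38, 39, 40, 40, 41, 42, 43, 43, 44,
    45, 46, 46, 47
  };
  for (size_t lane = 0; lane < 32; lane++) {
    // evpermb: each 16-bit lane receives the two source bytes named by dup[]
    uint16_t word = (uint16_t)(condensed[dup[2 * lane]]
                             | (condensed[dup[2 * lane + 1]] << 8));
    // evpsrlvw: even lanes shift by 0, odd lanes by 4
    word >>= (lane & 1) ? 4 : 0;
    // evpandq: keep the low 12 bits
    parsed[lane] = (uint16_t)(word & 0x0FFF);
  }
}
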
@@ -822,10 +855,65 @@ address generate_kyber12To16_avx512(StubGenerator *stubgen, |
822 | 855 |
823 | 856 | const Register perms = r11; |
824 | 857 |
825 | | - Label Loop; |
| 858 | + Label Loop, VBMILoop; |
826 | 859 |
827 | 860 | __ addptr(condensed, condensedOffs); |
828 | 861 |
| 862 | + if (VM_Version::supports_avx512_vbmi()) { |
| 863 | + // mask load for the first 48 bytes of each vector |
| 864 | + __ mov64(rax, 0x0000FFFFFFFFFFFF); |
| 865 | + __ kmovql(k1, rax); |
| 866 | + |
| 867 | + __ lea(perms, ExternalAddress(kyberAvx512_12To16DupAddr())); |
| 868 | + __ evmovdqub(xmm20, Address(perms), Assembler::AVX_512bit); |
| 869 | + |
| 870 | + __ lea(perms, ExternalAddress(kyberAvx512_12To16ShiftAddr())); |
| 871 | + __ evmovdquw(xmm21, Address(perms), Assembler::AVX_512bit); |
| 872 | + |
| 873 | + __ lea(perms, ExternalAddress(kyberAvx512_12To16AndAddr())); |
| 874 | + __ evmovdquq(xmm22, Address(perms), Assembler::AVX_512bit); |
| 875 | + |
| 876 | + __ align(OptoLoopAlignment); |
| 877 | + __ BIND(VBMILoop); |
| 878 | + |
| 879 | + __ evmovdqub(xmm0, k1, Address(condensed, 0), false, |
| 880 | + Assembler::AVX_512bit); |
| 881 | + __ evmovdqub(xmm1, k1, Address(condensed, 48), false, |
| 882 | + Assembler::AVX_512bit); |
| 883 | + __ evmovdqub(xmm2, k1, Address(condensed, 96), false, |
| 884 | + Assembler::AVX_512bit); |
| 885 | + __ evmovdqub(xmm3, k1, Address(condensed, 144), false, |
| 886 | + Assembler::AVX_512bit); |
| 887 | + |
| 888 | + __ evpermb(xmm4, k0, xmm20, xmm0, false, Assembler::AVX_512bit); |
| 889 | + __ evpermb(xmm5, k0, xmm20, xmm1, false, Assembler::AVX_512bit); |
| 890 | + __ evpermb(xmm6, k0, xmm20, xmm2, false, Assembler::AVX_512bit); |
| 891 | + __ evpermb(xmm7, k0, xmm20, xmm3, false, Assembler::AVX_512bit); |
| 892 | + |
| 893 | + __ evpsrlvw(xmm4, xmm4, xmm21, Assembler::AVX_512bit); |
| 894 | + __ evpsrlvw(xmm5, xmm5, xmm21, Assembler::AVX_512bit); |
| 895 | + __ evpsrlvw(xmm6, xmm6, xmm21, Assembler::AVX_512bit); |
| 896 | + __ evpsrlvw(xmm7, xmm7, xmm21, Assembler::AVX_512bit); |
| 897 | + |
| 898 | + __ evpandq(xmm0, xmm22, xmm4, Assembler::AVX_512bit); |
| 899 | + __ evpandq(xmm1, xmm22, xmm5, Assembler::AVX_512bit); |
| 900 | + __ evpandq(xmm2, xmm22, xmm6, Assembler::AVX_512bit); |
| 901 | + __ evpandq(xmm3, xmm22, xmm7, Assembler::AVX_512bit); |
| 902 | + |
| 903 | + store4regs(parsed, 0, xmm0_3, _masm); |
| 904 | + |
| 905 | + __ addptr(condensed, 192); |
| 906 | + __ addptr(parsed, 256); |
| 907 | + __ subl(parsedLength, 128); |
| 908 | + __ jcc(Assembler::greater, VBMILoop); |
| 909 | + |
| 910 | + __ leave(); // required for proper stackwalking of RuntimeStub frame |
| 911 | + __ mov64(rax, 0); // return 0 |
| 912 | + __ ret(0); |
| 913 | + |
| 914 | + return start; |
| 915 | + } |
| 916 | + |
829 | 917 | __ lea(perms, ExternalAddress(kyberAvx512_12To16PermsAddr())); |
830 | 918 |
831 | 919 | load4regs(xmm24_27, perms, 0, _masm); |
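
For reference, one 48-byte step of the VBMILoop body, rewritten with compiler intrinsics rather than MacroAssembler calls; this is only a sketch under the same AVX-512BW/VBMI assumptions, not code taken from the stub:

#include <immintrin.h>
#include <stdint.h>

// One 48-byte -> 32-coefficient step of the VBMI path. dup_tbl, shift_tbl and
// and_tbl hold kyberAvx512_12To16Dup/Shift/And; the stub runs four such steps
// per loop iteration.
static inline __m512i kyber_12to16_step(const uint8_t* condensed,
                                        __m512i dup_tbl,
                                        __m512i shift_tbl,
                                        __m512i and_tbl) {
  // k1 = 0x0000FFFFFFFFFFFF selects the low 48 bytes (zeroing the rest here)
  __m512i bytes   = _mm512_maskz_loadu_epi8(0x0000FFFFFFFFFFFFULL, condensed);
  // evpermb: move the two bytes holding each 12-bit value into its 16-bit lane
  __m512i dup     = _mm512_permutexvar_epi8(dup_tbl, bytes);
  // evpsrlvw: per-lane right shift by 0 (even lanes) or 4 (odd lanes)
  __m512i shifted = _mm512_srlv_epi16(dup, shift_tbl);
  // evpandq: keep the low 12 bits of every lane
  return _mm512_and_si512(shifted, and_tbl);
}

Each VBMILoop iteration handles four such vectors: it consumes 4 x 48 = 192 condensed bytes and stores 4 x 32 = 128 parsed 16-bit coefficients (256 bytes), which is exactly what the addptr(condensed, 192), addptr(parsed, 256) and subl(parsedLength, 128) updates account for before the jcc(greater) back edge.
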