From f500408e00aed7b413dc33f02e6cbe1a0942c67a Mon Sep 17 00:00:00 2001 From: Joel Date: Sat, 1 Jul 2023 13:48:43 +0930 Subject: [PATCH] replace the asm implementation for mul/square inner --- src/field_5x52_asm_impl.h | 826 ++++++++++++++++---------------------- 1 file changed, 336 insertions(+), 490 deletions(-) diff --git a/src/field_5x52_asm_impl.h b/src/field_5x52_asm_impl.h index 04a9af2105..08a4890bff 100644 --- a/src/field_5x52_asm_impl.h +++ b/src/field_5x52_asm_impl.h @@ -1,14 +1,5 @@ -/*********************************************************************** - * Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* - ***********************************************************************/ - /** - * Changelog: - * - March 2013, Diederik Huys: original version - * - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm - * - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly + * Generated by CryptOpt (https://github.com/0xADE1A1DE/CryptOpt) */ #ifndef SECP256K1_FIELD_INNER5X52_IMPL_H @@ -16,489 +7,344 @@ #include "util.h" -SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) { -/** - * Registers: rdx:rax = multiplication accumulator - * r9:r8 = c - * r15:rcx = d - * r10-r14 = a0-a4 - * rbx = b - * rdi = r - * rsi = a / t? - */ - uint64_t tmp1, tmp2, tmp3; -__asm__ __volatile__( - "movq 0(%%rsi),%%r10\n" - "movq 8(%%rsi),%%r11\n" - "movq 16(%%rsi),%%r12\n" - "movq 24(%%rsi),%%r13\n" - "movq 32(%%rsi),%%r14\n" +SECP256K1_INLINE static void +secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t *b) { + uint64_t tmp0, tmp1, tmp2, tmp3; + __asm__ __volatile__( + "mov %%rdx,%%rax\n" + "mov 0x10(%%rdx),%%rdx\n" + "mulx 0x8(%%rsi),%%r10,%%r11\n" + "mov 0x20(%%rsi),%%rdx\n" + "mulx 0x20(%%rax),%%rcx,%%r8\n" + "mov (%%rax),%%rdx\n" + "mulx 0x18(%%rsi),%%r9,%%rbx\n" + "mov 0x8(%%rax),%%rdx\n" + "mulx 0x10(%%rsi),%%r12,%%r13\n" + "xor %%rdx,%%rdx\n" + "adox %%r12,%%r9\n" + "adox %%rbx,%%r13\n" + "mov 0x18(%%rax),%%rdx\n" + "mulx (%%rsi),%%rbx,%%r12\n" + "adcx %%r10,%%r9\n" + "adcx %%r13,%%r11\n" + "xor %%rdx,%%rdx\n" + "adox %%rbx,%%r9\n" + "adox %%r11,%%r12\n" + "mov (%%rax),%%rdx\n" + "mulx 0x20(%%rsi),%%r10,%%r13\n" + "mov 0x18(%%rsi),%%rdx\n" + "mulx 0x8(%%rax),%%rbx,%%r11\n" + "adcx %%rbx,%%r10\n" + "adcx %%r13,%%r11\n" + "mov 0x8(%%rsi),%%rdx\n" + "mulx (%%rax),%%r13,%%rbx\n" + "movabs $0x1000003d10,%%rdx\n" + "mulx %%rcx,%%r14,%%r15\n" + "mov 0x18(%%rax),%%rdx\n" + "mov %%rdi,%q0\n" + "mulx 0x8(%%rsi),%%rcx,%%rdi\n" + "xor %%rdx,%%rdx\n" + "adox %%r9,%%r14\n" + "adox %%r15,%%r12\n" + "mov 0x10(%%rax),%%rdx\n" + "mulx 0x10(%%rsi),%%r9,%%r15\n" + "adcx %%r9,%%r10\n" + "adcx %%r11,%%r15\n" + "xor %%rdx,%%rdx\n" + "adox %%rcx,%%r10\n" + "adox %%r15,%%rdi\n" + "mov (%%rsi),%%rdx\n" + "mulx 0x20(%%rax),%%r11,%%rcx\n" + "adcx %%r11,%%r10\n" + "adcx %%rdi,%%rcx\n" + "mov %%r14,%%rdx\n" + "shrd $0x34,%%r12,%%rdx\n" + "add %%r10,%%rdx\n" + "adc $0x0,%%rcx\n" + "mov %%rdx,%%r12\n" + "mov 0x20(%%rsi),%%rdx\n" + "mulx 0x8(%%rax),%%r9,%%r15\n" + "movabs $0x1000003d10000,%%rdx\n" + "mulx %%r8,%%rdi,%%r11\n" + "add %%r12,%%rdi\n" + "adcx %%r11,%%rcx\n" + "mov %%rdi,%%r10\n" + "shrd $0x34,%%rcx,%%r10\n" + "mov 0x20(%%rax),%%rdx\n" + "mulx 0x8(%%rsi),%%r12,%%r8\n" + "mov 0x10(%%rax),%%rdx\n" + "mulx 
0x18(%%rsi),%%r11,%%rcx\n" + "add %%r11,%%r9\n" + "adcx %%r15,%%rcx\n" + "mov 0x10(%%rsi),%%rdx\n" + "mulx 0x18(%%rax),%%r15,%%r11\n" + "add %%r15,%%r9\n" + "adcx %%rcx,%%r11\n" + "xor %%rdx,%%rdx\n" + "adox %%r12,%%r9\n" + "adox %%r11,%%r8\n" + "adcx %%r9,%%r10\n" + "adc $0x0,%%r8\n" + "movabs $0xfffffffffffff,%%r12\n" + "mov %%r10,%%rcx\n" + "and %%r12,%%rcx\n" + "shl $0x4,%%rcx\n" + "mov 0x18(%%rax),%%rdx\n" + "mulx 0x18(%%rsi),%%r15,%%r11\n" + "and %%r12,%%rdi\n" + "mov %%rdi,%%rdx\n" + "shr $0x30,%%rdx\n" + "lea (%%rcx,%%rdx,1),%%rcx\n" + "shrd $0x34,%%r8,%%r10\n" + "mov (%%rsi),%%rdx\n" + "mulx (%%rax),%%r9,%%r8\n" + "movabs $0x1000003d1,%%rdx\n" + "mov %%rbx,%q1\n" + "mulx %%rcx,%%r12,%%rbx\n" + "add %%r9,%%r12\n" + "adcx %%rbx,%%r8\n" + "mov 0x10(%%rax),%%rdx\n" + "mulx 0x20(%%rsi),%%rcx,%%r9\n" + "xor %%rdx,%%rdx\n" + "adox %%r15,%%rcx\n" + "adox %%r9,%%r11\n" + "mov 0x10(%%rsi),%%rdx\n" + "mulx 0x20(%%rax),%%r15,%%rbx\n" + "mov %%r12,%%rdx\n" + "shrd $0x34,%%r8,%%rdx\n" + "xor %%r8,%%r8\n" + "adox %%r15,%%rcx\n" + "adox %%r11,%%rbx\n" + "adcx %%rcx,%%r10\n" + "adc $0x0,%%rbx\n" + "mov %%r10,%%r9\n" + "shrd $0x34,%%rbx,%%r9\n" + "movabs $0xfffffffffffff,%%r11\n" + "and %%r11,%%r10\n" + "mov %%rdx,%%r15\n" + "mov 0x8(%%rax),%%rdx\n" + "mulx (%%rsi),%%rcx,%%rbx\n" + "adox %%rcx,%%r13\n" + "adox %q1,%%rbx\n" + "adcx %%r13,%%r15\n" + "adc $0x0,%%rbx\n" + "mov (%%rax),%%rdx\n" + "mulx 0x10(%%rsi),%%rcx,%%r13\n" + "mov 0x20(%%rsi),%%rdx\n" + "mulx 0x18(%%rax),%%r8,%%r11\n" + "mov 0x20(%%rax),%%rdx\n" + "mov %%r13,%q2\n" + "mov %%rcx,%q3\n" + "mulx 0x18(%%rsi),%%r13,%%rcx\n" + "xor %%rdx,%%rdx\n" + "adox %%r13,%%r8\n" + "adox %%r11,%%rcx\n" + "movabs $0x1000003d10,%%r11\n" + "mov %%r11,%%rdx\n" + "mulx %%r10,%%r11,%%r13\n" + "adcx %%r8,%%r9\n" + "adc $0x0,%%rcx\n" + "add %%r15,%%r11\n" + "adcx %%r13,%%rbx\n" + "mov %%r11,%%r10\n" + "shrd $0x34,%%rbx,%%r10\n" + "mov 0x8(%%rax),%%rdx\n" + "mulx 0x8(%%rsi),%%r15,%%r8\n" + "movabs $0xfffffffffffff,%%rdx\n" + "and %%rdx,%%r11\n" + "mov %q0,%%r13\n" + "mov %%r11,0x8(%%r13)\n" + "movabs $0x1000003d10000,%%rbx\n" + "mov %%rbx,%%rdx\n" + "mulx %%rcx,%%rbx,%%r11\n" + "mov %%r15,%%rcx\n" + "adox %q3,%%rcx\n" + "adox %q2,%%r8\n" + "mov 0x10(%%rax),%%rdx\n" + "mulx (%%rsi),%%r15,%%r13\n" + "adcx %%r15,%%rcx\n" + "adcx %%r8,%%r13\n" + "add %%rcx,%%r10\n" + "adc $0x0,%%r13\n" + "movabs $0x1000003d10,%%rdx\n" + "mulx %%r9,%%r8,%%r15\n" + "xor %%r9,%%r9\n" + "adox %%r10,%%r8\n" + "adox %%r15,%%r13\n" + "movabs $0xfffffffffffff,%%rcx\n" + "mov %%r8,%%r10\n" + "and %%rcx,%%r10\n" + "mov %q0,%%r15\n" + "mov %%r10,0x10(%%r15)\n" + "and %%rcx,%%r14\n" + "shrd $0x34,%%r13,%%r8\n" + "movabs $0xffffffffffff,%%r13\n" + "and %%r13,%%rdi\n" + "lea (%%r14,%%r8,1),%%r14\n" + "adox %%r14,%%rbx\n" + "adox %%r9,%%r11\n" + "mov %%rbx,%%r10\n" + "shrd $0x34,%%r11,%%r10\n" + "and %%rcx,%%rbx\n" + "mov %%rbx,0x18(%%r15)\n" + "and %%rcx,%%r12\n" + "mov %%r12,(%%r15)\n" + "lea (%%rdi,%%r10,1),%%rdi\n" + "mov %%rdi,0x20(%%r15)\n" - /* d += a3 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r13\n" - "movq %%rax,%%rcx\n" - "movq %%rdx,%%r15\n" - /* d += a2 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a1 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d = a0 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c = a4 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r14\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d 
+= (c & M) * R */ - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* t3 (tmp1) = d & M */ - "movq %%rcx,%%rsi\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rsi\n" - "movq %%rsi,%q1\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* d += a4 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a2 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a1 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a0 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += c * R */ - "movq %%r8,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* t4 = d & M (%%rsi) */ - "movq %%rcx,%%rsi\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* tx = t4 >> 48 (tmp3) */ - "movq %%rsi,%%rax\n" - "shrq $48,%%rax\n" - "movq %%rax,%q3\n" - /* t4 &= (M >> 4) (tmp2) */ - "movq $0xffffffffffff,%%rax\n" - "andq %%rax,%%rsi\n" - "movq %%rsi,%q2\n" - /* c = a0 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r10\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += a4 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a2 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a1 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* u0 = d & M (%%rsi) */ - "movq %%rcx,%%rsi\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* u0 = (u0 << 4) | tx (%%rsi) */ - "shlq $4,%%rsi\n" - "movq %q3,%%rax\n" - "orq %%rax,%%rsi\n" - /* c += u0 * (R >> 4) */ - "movq $0x1000003d1,%%rax\n" - "mulq %%rsi\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[0] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,0(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += a1 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* c += a0 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d += a4 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a2 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c += (d & M) * R */ - "movq %%rcx,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* r[1] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,8(%%rdi)\n" 
- /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += a2 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* c += a1 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* c += a0 * b2 (last use of %%r10 = a0) */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */ - "movq %q2,%%rsi\n" - "movq %q1,%%r10\n" - /* d += a4 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c += (d & M) * R */ - "movq %%rcx,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 (%%rcx only) */ - "shrdq $52,%%r15,%%rcx\n" - /* r[2] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,16(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += t3 */ - "addq %%r10,%%r8\n" - /* c += d * R */ - "movq %%rcx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[3] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,24(%%rdi)\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* c += t4 (%%r8 only) */ - "addq %%rsi,%%r8\n" - /* r[4] = c */ - "movq %%r8,32(%%rdi)\n" -: "+S"(a), "=&m"(tmp1), "=&m"(tmp2), "=&m"(tmp3) -: "b"(b), "D"(r) -: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" -); + : "=&m"(tmp0), "=&m"(tmp1), "=&m"(tmp2), "=&m"(tmp3), "+D"(r), "+d"(b) + : "S"(a) + : "rax", "rbx", "rcx", "r8", "r9", "r10", "r11", "r12", "r13", + "r14", "r15", "cc", "memory"); } SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) { -/** - * Registers: rdx:rax = multiplication accumulator - * r9:r8 = c - * rcx:rbx = d - * r10-r14 = a0-a4 - * r15 = M (0xfffffffffffff) - * rdi = r - * rsi = a / t? 
- */ - uint64_t tmp1, tmp2, tmp3; -__asm__ __volatile__( - "movq 0(%%rsi),%%r10\n" - "movq 8(%%rsi),%%r11\n" - "movq 16(%%rsi),%%r12\n" - "movq 24(%%rsi),%%r13\n" - "movq 32(%%rsi),%%r14\n" - "movq $0xfffffffffffff,%%r15\n" + uint64_t tmp0, tmp1, tmp2; - /* d = (a0*2) * a3 */ - "leaq (%%r10,%%r10,1),%%rax\n" - "mulq %%r13\n" - "movq %%rax,%%rbx\n" - "movq %%rdx,%%rcx\n" - /* d += (a1*2) * a2 */ - "leaq (%%r11,%%r11,1),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c = a4 * a4 */ - "movq %%r14,%%rax\n" - "mulq %%r14\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += (c & M) * R */ - "andq %%r15,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* t3 (tmp1) = d & M */ - "movq %%rbx,%%rsi\n" - "andq %%r15,%%rsi\n" - "movq %%rsi,%q1\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* a4 *= 2 */ - "addq %%r14,%%r14\n" - /* d += a0 * a4 */ - "movq %%r10,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d+= (a1*2) * a3 */ - "leaq (%%r11,%%r11,1),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += a2 * a2 */ - "movq %%r12,%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += c * R */ - "movq %%r8,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* t4 = d & M (%%rsi) */ - "movq %%rbx,%%rsi\n" - "andq %%r15,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* tx = t4 >> 48 (tmp3) */ - "movq %%rsi,%%rax\n" - "shrq $48,%%rax\n" - "movq %%rax,%q3\n" - /* t4 &= (M >> 4) (tmp2) */ - "movq $0xffffffffffff,%%rax\n" - "andq %%rax,%%rsi\n" - "movq %%rsi,%q2\n" - /* c = a0 * a0 */ - "movq %%r10,%%rax\n" - "mulq %%r10\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += a1 * a4 */ - "movq %%r11,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += (a2*2) * a3 */ - "leaq (%%r12,%%r12,1),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* u0 = d & M (%%rsi) */ - "movq %%rbx,%%rsi\n" - "andq %%r15,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* u0 = (u0 << 4) | tx (%%rsi) */ - "shlq $4,%%rsi\n" - "movq %q3,%%rax\n" - "orq %%rax,%%rsi\n" - /* c += u0 * (R >> 4) */ - "movq $0x1000003d1,%%rax\n" - "mulq %%rsi\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[0] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,0(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* a0 *= 2 */ - "addq %%r10,%%r10\n" - /* c += a0 * a1 */ - "movq %%r10,%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d += a2 * a4 */ - "movq %%r12,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += a3 * a3 */ - "movq %%r13,%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c += (d & M) * R */ - "movq %%rbx,%%rax\n" - "andq %%r15,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* r[1] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,8(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += a0 * a2 (last use of %%r10) */ - "movq %%r10,%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* fetch t3 (%%r10, overwrites 
a0),t4 (%%rsi) */ - "movq %q2,%%rsi\n" - "movq %q1,%%r10\n" - /* c += a1 * a1 */ - "movq %%r11,%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d += a3 * a4 */ - "movq %%r13,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c += (d & M) * R */ - "movq %%rbx,%%rax\n" - "andq %%r15,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 (%%rbx only) */ - "shrdq $52,%%rcx,%%rbx\n" - /* r[2] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,16(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += t3 */ - "addq %%r10,%%r8\n" - /* c += d * R */ - "movq %%rbx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[3] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,24(%%rdi)\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* c += t4 (%%r8 only) */ - "addq %%rsi,%%r8\n" - /* r[4] = c */ - "movq %%r8,32(%%rdi)\n" -: "+S"(a), "=&m"(tmp1), "=&m"(tmp2), "=&m"(tmp3) -: "D"(r) -: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" -); -} + __asm__ __volatile__( + "mov 0x8(%%rsi),%%rax\n" + "lea (%%rax,%%rax,1),%%r10\n" + "mov 0x20(%%rsi),%%rdx\n" + "mulx %%rdx,%%rax,%%r11\n" + "mov (%%rsi),%%rdx\n" + "lea (%%rdx,%%rdx,1),%%rcx\n" + "mov %%rcx,%%rdx\n" + "mulx 0x18(%%rsi),%%rcx,%%r8\n" + "mov %%rdx,%%r9\n" + "mov 0x10(%%rsi),%%rdx\n" + "mulx %%r10,%%rbx,%%r12\n" + "xor %%rdx,%%rdx\n" + "adox %%rcx,%%rbx\n" + "adox %%r12,%%r8\n" + "movabs $0x1000003d10,%%rcx\n" + "mov %%rax,%%rdx\n" + "mulx %%rcx,%%rax,%%r12\n" + "adcx %%rbx,%%rax\n" + "adcx %%r12,%%r8\n" + "mov %%rax,%%rdx\n" + "shrd $0x34,%%r8,%%rdx\n" + "mov %%rdx,%%rbx\n" + "mov 0x10(%%rsi),%%rdx\n" + "mulx %%rdx,%%r12,%%r8\n" + "mov %%r10,%%rdx\n" + "mulx 0x18(%%rsi),%%r10,%%r13\n" + "xor %%r14,%%r14\n" + "adox %%r10,%%r12\n" + "adox %%r8,%%r13\n" + "mov %%rdx,%%r8\n" + "mov 0x20(%%rsi),%%rdx\n" + "mulx %%r9,%%r10,%%r14\n" + "adcx %%r10,%%r12\n" + "adcx %%r13,%%r14\n" + "add %%r12,%%rbx\n" + "adc $0x0,%%r14\n" + "movabs $0x1000003d10000,%%rdx\n" + "mulx %%r11,%%r13,%%r10\n" + "mov 0x10(%%rsi),%%r11\n" + "mov %%r11,%%r12\n" + "shl %%r12\n" + "mov %%r8,%%rdx\n" + "mulx 0x20(%%rsi),%%r11,%%r8\n" + "mov %%r12,%%rdx\n" + "mulx 0x18(%%rsi),%%r12,%%r15\n" + "add %%rbx,%%r13\n" + "adcx %%r10,%%r14\n" + "mov %%r13,%%rbx\n" + "shrd $0x34,%%r14,%%rbx\n" + "movabs $0xfffffffffffff,%%r10\n" + "and %%r10,%%r13\n" + "adox %%r11,%%r12\n" + "adox %%r15,%%r8\n" + "adcx %%r12,%%rbx\n" + "adc $0x0,%%r8\n" + "mov %%rdx,%%r11\n" + "mov 0x18(%%rsi),%%rdx\n" + "mulx %%rdx,%%r15,%%r14\n" + "mov %%r11,%%rdx\n" + "mulx 0x20(%%rsi),%%r11,%%r12\n" + "mov %%rbx,%%rdx\n" + "and %%r10,%%rdx\n" + "shrd $0x34,%%r8,%%rbx\n" + "xor %%r8,%%r8\n" + "adox %%r11,%%r15\n" + "adox %%r14,%%r12\n" + "adcx %%r15,%%rbx\n" + "adc $0x0,%%r12\n" + "mov %%rbx,%%r14\n" + "and %%r10,%%r14\n" + "shl $0x4,%%rdx\n" + "mov %%r13,%%r11\n" + "shr $0x30,%%r11\n" + "shrd $0x34,%%r12,%%rbx\n" + "lea (%%rdx,%%r11,1),%%rdx\n" + "movabs $0x1000003d1,%%r15\n" + "mulx %%r15,%%r12,%%r11\n" + "imul $0x2,0x18(%%rsi),%%rdx\n" + "mulx 0x20(%%rsi),%%r8,%%r15\n" + "mov (%%rsi),%%rdx\n" + "mulx %%rdx,%%r10,%%rcx\n" + "xor %%rdx,%%rdx\n" + "adox %%r10,%%r12\n" + "adox %%r11,%%rcx\n" + "adcx %%r8,%%rbx\n" + "adc $0x0,%%r15\n" + "mov %%r12,%%r11\n" + "shrd $0x34,%%rcx,%%r11\n" + "mov 0x8(%%rsi),%%rdx\n" + "mulx 
%%r9,%%r8,%%r10\n" + "add %%r8,%%r11\n" + "adc $0x0,%%r10\n" + "mov 0x8(%%rsi),%%rdx\n" + "mulx %%rdx,%%rcx,%%r8\n" + "mov %%r9,%%rdx\n" + "mov %%rdi,%q0\n" + "mulx 0x10(%%rsi),%%r9,%%rdi\n" + "movabs $0x1000003d10,%%rdx\n" + "mov %%rbx,%q1\n" + "mov %%r8,%q2\n" + "mulx %%r14,%%rbx,%%r8\n" + "xor %%r14,%%r14\n" + "adox %%r11,%%rbx\n" + "adox %%r8,%%r10\n" + "adcx %%r9,%%rcx\n" + "adcx %q2,%%rdi\n" + "mov %%rbx,%%r11\n" + "shrd $0x34,%%r10,%%r11\n" + "add %%rcx,%%r11\n" + "adc $0x0,%%rdi\n" + "mulx %q1,%%r9,%%r8\n" + "xor %%r10,%%r10\n" + "adox %%r11,%%r9\n" + "adox %%r8,%%rdi\n" + "movabs $0xfffffffffffff,%%r14\n" + "and %%r14,%%rbx\n" + "mov %%r9,%%rcx\n" + "shrd $0x34,%%rdi,%%rcx\n" + "mov %q0,%%r11\n" + "mov %%rbx,0x8(%%r11)\n" + "movabs $0x1000003d10000,%%r8\n" + "mov %%r15,%%rdx\n" + "mulx %%r8,%%r15,%%rdi\n" + "and %%r14,%%r9\n" + "mov %%r9,0x10(%%r11)\n" + "and %%r14,%%rax\n" + "lea (%%rax,%%rcx,1),%%rax\n" + "adox %%rax,%%r15\n" + "adox %%r10,%%rdi\n" + "mov %%r15,%%rbx\n" + "and %%r14,%%rbx\n" + "shrd $0x34,%%rdi,%%r15\n" + "mov %%rbx,0x18(%%r11)\n" + "movabs $0xffffffffffff,%%rcx\n" + "and %%rcx,%%r13\n" + "lea 0x0(%%r13,%%r15,1),%%r13\n" + "mov %%r13,0x20(%%r11)\n" + "and %%r14,%%r12\n" + "mov %%r12,(%%r11)\n" -#endif /* SECP256K1_FIELD_INNER5X52_IMPL_H */ + : "=&m"(tmp0), "=&m"(tmp1), "=&m"(tmp2), "+D"(r) + : "D"(r), "S"(a) + : "rax", "rbx", "rcx", "rdx", "r8", "r9", "r10", "r11", "r12", + "r13", "r14", "r15", "cc", "memory"); +} +#endif
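
Notes (reviewer aid, not part of the patch itself):

 * The generated code uses the BMI2 mulx and ADX adcx/adox instructions, so
   unlike the hand-written assembly it removes, it only runs on CPUs with the
   BMI2 and ADX extensions.

 * Both routines keep the library's 5x52 field representation
   (x = n[0] + n[1]*2^52 + n[2]*2^104 + n[3]*2^156 + n[4]*2^208) and the same
   constants as the portable C implementation: the limb mask
   0xfffffffffffff = 2^52 - 1, and the folding constants
   0x1000003d10 = 0x1000003d1 << 4 and 0x1000003d10000 = 0x1000003d1 << 16,
   which derive from p = 2^256 - 0x1000003d1.

A minimal smoke-test sketch follows. It assumes the file is compiled inside the
library's src/ directory on an x86_64 machine with BMI2/ADX; the file name, the
build line and the expected limb values are illustrative assumptions (they rely
on the generated code producing the same output limbs as the portable C
implementation), not part of this patch:

    /* sanity_check.c -- hypothetical standalone check, not part of the patch.
     * Build sketch: gcc -O2 -I. -I.. sanity_check.c
     * (extra defines/include paths may be needed depending on the tree). */
    #include <stdint.h>
    #include <stdio.h>

    #include "util.h"                  /* SECP256K1_INLINE etc. */
    #include "field_5x52_asm_impl.h"   /* the functions changed by this patch */

    int main(void) {
        /* 5x52 limbs: value = n[0] + n[1]*2^52 + ... + n[4]*2^208 */
        const uint64_t one[5] = {1, 0, 0, 0, 0};
        const uint64_t two[5] = {2, 0, 0, 0, 0};
        uint64_t r[5] = {0}, s[5] = {0};

        /* Expected limbs assume the same output representation as the C code:
         * for these tiny inputs no reduction triggers and limb 0 holds the
         * full product. */
        secp256k1_fe_mul_inner(r, one, two);  /* 1 * 2 = 2 */
        secp256k1_fe_sqr_inner(s, two);       /* 2 * 2 = 4 */

        printf("mul limb0: %llu (expect 2)\n", (unsigned long long)r[0]);
        printf("sqr limb0: %llu (expect 4)\n", (unsigned long long)s[0]);
        return !(r[0] == 2 && s[0] == 4);
    }

A more thorough check is to compare the output limbs against the portable
field_5x52_int128_impl.h implementation on random inputs.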