From e2684293b1b72a1ab5974a2864549cea2788cf95 Mon Sep 17 00:00:00 2001 From: Joel Date: Tue, 28 Mar 2023 17:05:18 +0900 Subject: [PATCH 1/2] Replace x86_64 field asm by CryptOpt output Co-authored-by: Tim Ruffing --- Makefile.am | 3 + configure.ac | 4 + src/asm/mul.s | 200 +++++++++++++++ src/asm/square.s | 161 ++++++++++++ src/field_5x52_asm_impl.h | 502 -------------------------------------- src/field_5x52_impl.h | 6 +- 6 files changed, 372 insertions(+), 504 deletions(-) create mode 100644 src/asm/mul.s create mode 100644 src/asm/square.s delete mode 100644 src/field_5x52_asm_impl.h diff --git a/Makefile.am b/Makefile.am index e3fdf4da27..b899b5e51b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -89,6 +89,9 @@ pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = libsecp256k1.pc if USE_EXTERNAL_ASM +if USE_ASM_X86_64 +libsecp256k1_common_la_SOURCES = src/asm/mul.s src/asm/square.s +endif if USE_ASM_ARM libsecp256k1_common_la_SOURCES = src/asm/field_10x26_arm.s endif diff --git a/configure.ac b/configure.ac index 46c95e30bc..62dd0273f3 100644 --- a/configure.ac +++ b/configure.ac @@ -261,6 +261,7 @@ else fi if test x"$req_asm" = x"auto"; then + # TODO SECP_64BIT_ASM_CHECK if test x"$has_64bit_asm" = x"yes"; then set_asm=x86_64 @@ -272,6 +273,7 @@ else set_asm=$req_asm case $set_asm in x86_64) + # TODO SECP_64BIT_ASM_CHECK if test x"$has_64bit_asm" != x"yes"; then AC_MSG_ERROR([x86_64 assembly optimization requested but not available]) @@ -293,6 +295,7 @@ enable_external_asm=no case $set_asm in x86_64) SECP_CONFIG_DEFINES="$SECP_CONFIG_DEFINES -DUSE_ASM_X86_64=1" + enable_external_asm=yes ;; arm) enable_external_asm=yes @@ -435,6 +438,7 @@ AM_CONDITIONAL([ENABLE_MODULE_EXTRAKEYS], [test x"$enable_module_extrakeys" = x" AM_CONDITIONAL([ENABLE_MODULE_SCHNORRSIG], [test x"$enable_module_schnorrsig" = x"yes"]) AM_CONDITIONAL([USE_EXTERNAL_ASM], [test x"$enable_external_asm" = x"yes"]) AM_CONDITIONAL([USE_ASM_ARM], [test x"$set_asm" = x"arm"]) +AM_CONDITIONAL([USE_ASM_X86_64], [test x"$set_asm" = x"x86_64"]) AM_CONDITIONAL([BUILD_WINDOWS], [test "$build_windows" = "yes"]) AC_SUBST(LIB_VERSION_CURRENT, _LIB_VERSION_CURRENT) AC_SUBST(LIB_VERSION_REVISION, _LIB_VERSION_REVISION) diff --git a/src/asm/mul.s b/src/asm/mul.s new file mode 100644 index 0000000000..7b0c068ef9 --- /dev/null +++ b/src/asm/mul.s @@ -0,0 +1,200 @@ +.text +.global secp256k1_fe_mul_inner +secp256k1_fe_mul_inner: + mov %rdx,%rax + mov 0x20(%rdx),%rdx + mulx 0x20(%rsi),%r10,%r11 + mov 0x20(%rax),%rdx + mulx 0x18(%rsi),%rcx,%r8 + mov 0x10(%rax),%rdx + mov %rbx,-0x80(%rsp) + mulx 0x8(%rsi),%r9,%rbx + mov 0x18(%rax),%rdx + mov %rbp,-0x78(%rsp) + mov %r12,-0x70(%rsp) + mulx 0x10(%rsi),%rbp,%r12 + mov (%rax),%rdx + mov %r13,-0x68(%rsp) + mov %r14,-0x60(%rsp) + mulx 0x18(%rsi),%r13,%r14 + mov 0x8(%rsi),%rdx + mov %r15,-0x58(%rsp) + mov %rdi,-0x50(%rsp) + mulx 0x18(%rax),%r15,%rdi + mov %r10,%rdx + shrd $0x34,%r11,%rdx + mov %rdx,%r11 + mov 0x10(%rsi),%rdx + mov %r8,-0x48(%rsp) + mov %rcx,-0x40(%rsp) + mulx 0x8(%rax),%r8,%rcx + add %r8,%r13 + adcx %r14,%rcx + movabs $0x1000003d10,%rdx + mulx %r11,%r14,%r8 + mov (%rsi),%rdx + mov %r12,-0x38(%rsp) + mulx 0x18(%rax),%r11,%r12 + test %al,%al + adox %r9,%r13 + adox %rcx,%rbx + adcx %r11,%r13 + adcx %rbx,%r12 + movabs $0xfffffffffffff,%rdx + and %rdx,%r10 + movabs $0x1000003d10,%r9 + mov %r9,%rdx + mulx %r10,%r9,%rcx + mov 0x8(%rax),%rdx + mulx 0x18(%rsi),%r11,%rbx + mov 0x20(%rsi),%rdx + mov %rbp,-0x30(%rsp) + mulx (%rax),%r10,%rbp + adox %r11,%r10 + adox %rbp,%rbx + adcx %r13,%r9 + adcx 
%rcx,%r12 + mov 0x10(%rsi),%rdx + mulx 0x10(%rax),%r13,%rcx + xor %rdx,%rdx + adox %r13,%r10 + adox %rbx,%rcx + adcx %r15,%r10 + adcx %rcx,%rdi + mov $0x34,%r15d + bzhi %r15,%r9,%r11 + mov 0x10(%rax),%rdx + mulx 0x18(%rsi),%rbp,%rbx + mov 0x20(%rax),%rdx + mulx (%rsi),%r13,%rcx + adox %r13,%r10 + adox %rdi,%rcx + shrd $0x34,%r12,%r9 + mov 0x20(%rsi),%rdx + mulx 0x8(%rax),%r12,%rdi + add %rbp,%r12 + adcx %rdi,%rbx + xor %rdx,%rdx + adox %r10,%r9 + adox %rdx,%rcx + adcx %r9,%r14 + adcx %r8,%rcx + xor %r8,%r8 + adox -0x30(%rsp),%r12 + adox -0x38(%rsp),%rbx + bzhi %r15,%r14,%rdx + mov %rdx,%rbp + mov 0x20(%rax),%rdx + mulx 0x8(%rsi),%r13,%r10 + mov $0x30,%edx + bzhi %rdx,%rbp,%rdi + mov 0x10(%rax),%rdx + mulx 0x20(%rsi),%r9,%r8 + adox %r13,%r12 + adox %rbx,%r10 + shrd $0x34,%rcx,%r14 + xor %rdx,%rdx + adox %r12,%r14 + adox %rdx,%r10 + bzhi %r15,%r14,%rcx + shrd $0x34,%r10,%r14 + mov 0x18(%rsi),%rdx + mulx 0x18(%rax),%rbx,%r13 + mov 0x20(%rax),%rdx + mulx 0x10(%rsi),%r12,%r10 + add %rbx,%r9 + adcx %r8,%r13 + shl $0x4,%rcx + add %r12,%r9 + adcx %r13,%r10 + shr $0x30,%rbp + lea (%rcx,%rbp,1),%rcx + movabs $0x1000003d1,%rdx + mulx %rcx,%r8,%rbx + mov (%rax),%rdx + mulx (%rsi),%r12,%r13 + mov 0x18(%rax),%rdx + mulx 0x20(%rsi),%rbp,%rcx + test %al,%al + adox %r12,%r8 + adox %rbx,%r13 + adcx -0x40(%rsp),%rbp + adcx -0x48(%rsp),%rcx + mov (%rsi),%rdx + mulx 0x8(%rax),%rbx,%r12 + xor %rdx,%rdx + adox %r9,%r14 + adox %rdx,%r10 + mov %r8,%r9 + shrd $0x34,%r13,%r9 + bzhi %r15,%r14,%r13 + shrd $0x34,%r10,%r14 + xor %r10,%r10 + adox %rbp,%r14 + adox %r10,%rcx + mov 0x8(%rsi),%rdx + mulx (%rax),%rbp,%r10 + mov 0x8(%rax),%rdx + mov %rdi,-0x28(%rsp) + mulx 0x8(%rsi),%r15,%rdi + mov 0x10(%rsi),%rdx + mov %r11,-0x20(%rsp) + mov %rcx,-0x18(%rsp) + mulx (%rax),%r11,%rcx + adcx %r15,%r11 + adcx %rcx,%rdi + test %al,%al + adox %rbx,%rbp + adox %r10,%r12 + adcx %rbp,%r9 + adc $0x0,%r12 + movabs $0x1000003d10,%rdx + mulx %r13,%rbx,%r10 + add %r9,%rbx + adcx %r10,%r12 + mov 0x10(%rax),%rdx + mulx (%rsi),%r13,%r15 + mov $0x34,%edx + bzhi %rdx,%rbx,%rcx + bzhi %rdx,%r14,%rbp + movabs $0x1000003d10,%r9 + mov %rbp,%rdx + mulx %r9,%rbp,%r10 + shrd $0x34,%r12,%rbx + mov -0x50(%rsp),%r12 + mov %rcx,0x8(%r12) + add %r13,%r11 + adcx %rdi,%r15 + xor %rdi,%rdi + adox %r11,%rbx + adox %rdi,%r15 + adcx %rbx,%rbp + adcx %r10,%r15 + mov %rbp,%r13 + shrd $0x34,%r15,%r13 + add -0x20(%rsp),%r13 + mov -0x18(%rsp),%rcx + shrd $0x34,%rcx,%r14 + mov %r9,%rdx + mulx %r14,%r9,%rcx + xor %r10,%r10 + adox %r13,%r9 + adox %r10,%rcx + movabs $0xfffffffffffff,%rdi + mov %r9,%r11 + and %rdi,%r11 + shrd $0x34,%rcx,%r9 + add -0x28(%rsp),%r9 + and %rdi,%r8 + mov %r9,0x20(%r12) + mov %r8,(%r12) + and %rdi,%rbp + mov %rbp,0x10(%r12) + mov %r11,0x18(%r12) + mov -0x80(%rsp),%rbx + mov -0x78(%rsp),%rbp + mov -0x70(%rsp),%r12 + mov -0x68(%rsp),%r13 + mov -0x60(%rsp),%r14 + mov -0x58(%rsp),%r15 + ret diff --git a/src/asm/square.s b/src/asm/square.s new file mode 100644 index 0000000000..54b0898722 --- /dev/null +++ b/src/asm/square.s @@ -0,0 +1,161 @@ +.text +.global secp256k1_fe_sqr_inner +secp256k1_fe_sqr_inner: + mov $0x1,%eax + shlx %rax,0x10(%rsi),%r10 + mov 0x20(%rsi),%rdx + mulx %rdx,%r11,%rcx + shlx %rax,(%rsi),%rdx + mov $0x34,%r8d + bzhi %r8,%r11,%r9 + mov %rbx,-0x80(%rsp) + mulx 0x20(%rsi),%rax,%rbx + mov %rbp,-0x78(%rsp) + mov $0x1,%ebp + mov %r12,-0x70(%rsp) + shlx %rbp,0x8(%rsi),%r12 + mov %rdx,%rbp + mov 0x18(%rsi),%rdx + mov %r13,-0x68(%rsp) + mov %r14,-0x60(%rsp) + mulx %r12,%r13,%r14 + mov (%rsi),%rdx + mov %r15,-0x58(%rsp) + mulx 
%rdx,%r15,%r8 + mov 0x10(%rsi),%rdx + mov %rdi,-0x50(%rsp) + mov %r8,-0x48(%rsp) + mulx %rdx,%rdi,%r8 + adox %r13,%rdi + adox %r8,%r14 + mov %r12,%rdx + mulx 0x10(%rsi),%r12,%r13 + add %rax,%rdi + adcx %r14,%rbx + xchg %rdx,%rbp + mulx 0x18(%rsi),%rax,%r8 + xor %r14,%r14 + adox %rax,%r12 + adox %r13,%r8 + movabs $0x1000003d10,%r13 + xchg %rdx,%r13 + mulx %r9,%rax,%r14 + shrd $0x34,%rcx,%r11 + xor %rcx,%rcx + adox %r12,%rax + adox %r14,%r8 + mov %rax,%r9 + shrd $0x34,%r8,%r9 + test %al,%al + adox %rdi,%r9 + adox %rcx,%rbx + mulx %r11,%rdi,%r12 + mov 0x20(%rsi),%rdx + mulx %rbp,%r14,%r11 + adcx %r9,%rdi + adcx %r12,%rbx + mov 0x18(%rsi),%rdx + mulx %r10,%rbp,%r8 + mov %rdi,%rdx + shrd $0x34,%rbx,%rdx + xor %r9,%r9 + adox %r14,%rbp + adox %r8,%r11 + adcx %rbp,%rdx + adc $0x0,%r11 + mov %rdx,%rcx + mov 0x20(%rsi),%rdx + mulx %r10,%r12,%r14 + mov 0x18(%rsi),%rdx + mulx %rdx,%r10,%rbx + movabs $0xfffffffffffff,%rdx + and %rdx,%rdi + adox %r12,%r10 + adox %rbx,%r14 + mov %rcx,%r8 + shrd $0x34,%r11,%r8 + add %r10,%r8 + adc $0x0,%r14 + and %rdx,%rcx + shl $0x4,%rcx + mov %r8,%rbp + shrd $0x34,%r14,%rbp + mov %rdi,%r11 + shr $0x30,%r11 + lea (%rcx,%r11,1),%rcx + mov %r13,%rdx + mulx 0x8(%rsi),%r13,%r12 + movabs $0x1000003d1,%rbx + xchg %rdx,%rcx + mulx %rbx,%r10,%r14 + mov 0x18(%rsi),%r11 + mov %r11,%rdx + shl %rdx + xor %r11,%r11 + adox %r15,%r10 + adox -0x48(%rsp),%r14 + mulx 0x20(%rsi),%r9,%r15 + movabs $0xfffffffffffff,%rdx + and %rdx,%r8 + movabs $0x1000003d10,%r11 + mov %r11,%rdx + mulx %r8,%r11,%rbx + mov %r10,%r8 + shrd $0x34,%r14,%r8 + add %r13,%r8 + adc $0x0,%r12 + test %al,%al + adox %r8,%r11 + adox %rbx,%r12 + mov %r11,%r13 + shrd $0x34,%r12,%r13 + test %al,%al + adox %r9,%rbp + mov $0x0,%r14d + adox %r14,%r15 + movabs $0xfffffffffffff,%r9 + and %r9,%r11 + mov %rbp,%rbx + shrd $0x34,%r15,%rbx + mulx %rbx,%r8,%r12 + and %r9,%r10 + mov %rcx,%rdx + mulx 0x10(%rsi),%rcx,%r15 + mov -0x50(%rsp),%rdx + mov %r10,(%rdx) + mov %rdx,%rbx + mov 0x8(%rsi),%rdx + mulx %rdx,%r10,%r14 + adox %rcx,%r10 + adox %r14,%r15 + adcx %r10,%r13 + adc $0x0,%r15 + and %r9,%rax + and %r9,%rbp + movabs $0x1000003d10,%rdx + mulx %rbp,%rcx,%r14 + adox %r13,%rcx + adox %r14,%r15 + mov %rcx,%r10 + and %r9,%r10 + shrd $0x34,%r15,%rcx + lea (%rax,%rcx,1),%rax + mov %r10,0x10(%rbx) + add %rax,%r8 + adc $0x0,%r12 + mov %r8,%r13 + shrd $0x34,%r12,%r13 + mov $0x30,%ebp + bzhi %rbp,%rdi,%r14 + lea (%r14,%r13,1),%r14 + mov %r14,0x20(%rbx) + and %r9,%r8 + mov %r11,0x8(%rbx) + mov %r8,0x18(%rbx) + mov -0x80(%rsp),%rbx + mov -0x78(%rsp),%rbp + mov -0x70(%rsp),%r12 + mov -0x68(%rsp),%r13 + mov -0x60(%rsp),%r14 + mov -0x58(%rsp),%r15 + ret diff --git a/src/field_5x52_asm_impl.h b/src/field_5x52_asm_impl.h deleted file mode 100644 index a2118044ab..0000000000 --- a/src/field_5x52_asm_impl.h +++ /dev/null @@ -1,502 +0,0 @@ -/*********************************************************************** - * Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* - ***********************************************************************/ - -/** - * Changelog: - * - March 2013, Diederik Huys: original version - * - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm - * - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly - */ - -#ifndef SECP256K1_FIELD_INNER5X52_IMPL_H -#define SECP256K1_FIELD_INNER5X52_IMPL_H - -SECP256K1_INLINE static 
void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) { -/** - * Registers: rdx:rax = multiplication accumulator - * r9:r8 = c - * r15:rcx = d - * r10-r14 = a0-a4 - * rbx = b - * rdi = r - * rsi = a / t? - */ - uint64_t tmp1, tmp2, tmp3; -__asm__ __volatile__( - "movq 0(%%rsi),%%r10\n" - "movq 8(%%rsi),%%r11\n" - "movq 16(%%rsi),%%r12\n" - "movq 24(%%rsi),%%r13\n" - "movq 32(%%rsi),%%r14\n" - - /* d += a3 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r13\n" - "movq %%rax,%%rcx\n" - "movq %%rdx,%%r15\n" - /* d += a2 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a1 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d = a0 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c = a4 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r14\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += (c & M) * R */ - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* t3 (tmp1) = d & M */ - "movq %%rcx,%%rsi\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rsi\n" - "movq %%rsi,%q1\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* d += a4 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a2 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a1 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a0 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += c * R */ - "movq %%r8,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* t4 = d & M (%%rsi) */ - "movq %%rcx,%%rsi\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* tx = t4 >> 48 (tmp3) */ - "movq %%rsi,%%rax\n" - "shrq $48,%%rax\n" - "movq %%rax,%q3\n" - /* t4 &= (M >> 4) (tmp2) */ - "movq $0xffffffffffff,%%rax\n" - "andq %%rax,%%rsi\n" - "movq %%rsi,%q2\n" - /* c = a0 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r10\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += a4 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a2 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a1 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* u0 = d & M (%%rsi) */ - "movq %%rcx,%%rsi\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* u0 = (u0 << 4) | tx (%%rsi) */ - "shlq $4,%%rsi\n" - "movq %q3,%%rax\n" - "orq %%rax,%%rsi\n" - /* c += u0 * (R >> 4) */ - "movq $0x1000003d1,%%rax\n" - "mulq %%rsi\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[0] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,0(%%rdi)\n" - /* c 
>>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += a1 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* c += a0 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d += a4 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a2 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c += (d & M) * R */ - "movq %%rcx,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* r[1] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,8(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += a2 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* c += a1 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* c += a0 * b2 (last use of %%r10 = a0) */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */ - "movq %q2,%%rsi\n" - "movq %q1,%%r10\n" - /* d += a4 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c += (d & M) * R */ - "movq %%rcx,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 (%%rcx only) */ - "shrdq $52,%%r15,%%rcx\n" - /* r[2] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,16(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += t3 */ - "addq %%r10,%%r8\n" - /* c += d * R */ - "movq %%rcx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[3] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,24(%%rdi)\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* c += t4 (%%r8 only) */ - "addq %%rsi,%%r8\n" - /* r[4] = c */ - "movq %%r8,32(%%rdi)\n" -: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3) -: "b"(b), "D"(r) -: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" -); -} - -SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) { -/** - * Registers: rdx:rax = multiplication accumulator - * r9:r8 = c - * rcx:rbx = d - * r10-r14 = a0-a4 - * r15 = M (0xfffffffffffff) - * rdi = r - * rsi = a / t? 
- */ - uint64_t tmp1, tmp2, tmp3; -__asm__ __volatile__( - "movq 0(%%rsi),%%r10\n" - "movq 8(%%rsi),%%r11\n" - "movq 16(%%rsi),%%r12\n" - "movq 24(%%rsi),%%r13\n" - "movq 32(%%rsi),%%r14\n" - "movq $0xfffffffffffff,%%r15\n" - - /* d = (a0*2) * a3 */ - "leaq (%%r10,%%r10,1),%%rax\n" - "mulq %%r13\n" - "movq %%rax,%%rbx\n" - "movq %%rdx,%%rcx\n" - /* d += (a1*2) * a2 */ - "leaq (%%r11,%%r11,1),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c = a4 * a4 */ - "movq %%r14,%%rax\n" - "mulq %%r14\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += (c & M) * R */ - "andq %%r15,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* t3 (tmp1) = d & M */ - "movq %%rbx,%%rsi\n" - "andq %%r15,%%rsi\n" - "movq %%rsi,%q1\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* a4 *= 2 */ - "addq %%r14,%%r14\n" - /* d += a0 * a4 */ - "movq %%r10,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d+= (a1*2) * a3 */ - "leaq (%%r11,%%r11,1),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += a2 * a2 */ - "movq %%r12,%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += c * R */ - "movq %%r8,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* t4 = d & M (%%rsi) */ - "movq %%rbx,%%rsi\n" - "andq %%r15,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* tx = t4 >> 48 (tmp3) */ - "movq %%rsi,%%rax\n" - "shrq $48,%%rax\n" - "movq %%rax,%q3\n" - /* t4 &= (M >> 4) (tmp2) */ - "movq $0xffffffffffff,%%rax\n" - "andq %%rax,%%rsi\n" - "movq %%rsi,%q2\n" - /* c = a0 * a0 */ - "movq %%r10,%%rax\n" - "mulq %%r10\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += a1 * a4 */ - "movq %%r11,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += (a2*2) * a3 */ - "leaq (%%r12,%%r12,1),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* u0 = d & M (%%rsi) */ - "movq %%rbx,%%rsi\n" - "andq %%r15,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* u0 = (u0 << 4) | tx (%%rsi) */ - "shlq $4,%%rsi\n" - "movq %q3,%%rax\n" - "orq %%rax,%%rsi\n" - /* c += u0 * (R >> 4) */ - "movq $0x1000003d1,%%rax\n" - "mulq %%rsi\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[0] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,0(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* a0 *= 2 */ - "addq %%r10,%%r10\n" - /* c += a0 * a1 */ - "movq %%r10,%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d += a2 * a4 */ - "movq %%r12,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += a3 * a3 */ - "movq %%r13,%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c += (d & M) * R */ - "movq %%rbx,%%rax\n" - "andq %%r15,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* r[1] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,8(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += a0 * a2 (last use of %%r10) */ - "movq %%r10,%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */ - "movq 
%q2,%%rsi\n" - "movq %q1,%%r10\n" - /* c += a1 * a1 */ - "movq %%r11,%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d += a3 * a4 */ - "movq %%r13,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c += (d & M) * R */ - "movq %%rbx,%%rax\n" - "andq %%r15,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 (%%rbx only) */ - "shrdq $52,%%rcx,%%rbx\n" - /* r[2] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,16(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += t3 */ - "addq %%r10,%%r8\n" - /* c += d * R */ - "movq %%rbx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[3] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,24(%%rdi)\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* c += t4 (%%r8 only) */ - "addq %%rsi,%%r8\n" - /* r[4] = c */ - "movq %%r8,32(%%rdi)\n" -: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3) -: "D"(r) -: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" -); -} - -#endif /* SECP256K1_FIELD_INNER5X52_IMPL_H */ diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h index 4c4466eceb..8d622bdea5 100644 --- a/src/field_5x52_impl.h +++ b/src/field_5x52_impl.h @@ -12,8 +12,10 @@ #include "field.h" #include "modinv64_impl.h" -#if defined(USE_ASM_X86_64) -#include "field_5x52_asm_impl.h" +#if defined(USE_EXTERNAL_ASM) && defined(USE_ASM_X86_64) +extern void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t *b); +extern void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a); +#pragma message "ASM enabled" #else #include "field_5x52_int128_impl.h" #endif From 7e00a589eb21df220df1452a06bea877776737d4 Mon Sep 17 00:00:00 2001 From: Tim Ruffing Date: Fri, 31 Mar 2023 23:22:18 +0900 Subject: [PATCH 2/2] fixup! Replace x86_64 field asm by CryptOpt output --- Makefile.am | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile.am b/Makefile.am index b899b5e51b..7552ec4677 100644 --- a/Makefile.am +++ b/Makefile.am @@ -39,7 +39,6 @@ noinst_HEADERS += src/field_10x26_impl.h noinst_HEADERS += src/field_5x52.h noinst_HEADERS += src/field_5x52_impl.h noinst_HEADERS += src/field_5x52_int128_impl.h -noinst_HEADERS += src/field_5x52_asm_impl.h noinst_HEADERS += src/modinv32.h noinst_HEADERS += src/modinv32_impl.h noinst_HEADERS += src/modinv64.h
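
Background for readers of this series: after the change to field_5x52_impl.h above, the header no longer pulls in inline asm; it declares secp256k1_fe_mul_inner and secp256k1_fe_sqr_inner as external symbols implemented by the CryptOpt-generated src/asm/mul.s and src/asm/square.s. Field elements are 5 uint64_t limbs of 52 bits each, least significant first, reduced modulo p = 2^256 - 0x1000003D1. The magic constants in the generated code match the comments in the deleted file: M = 0xfffffffffffff masks one 52-bit limb, and since 2^256 = 0x1000003D1 (mod p), a carry of weight 2^260 (one bit position above limb 4) folds back in as a multiplication by 0x1000003D1 << 4 = 0x1000003D10, the "R" of the old comments.

A minimal standalone harness along the following lines can exercise the two .s files outside the autotools build. This is a sketch only, not part of the patch: the file name and the tiny test values are made up for illustration, and it assumes an x86-64 System V target (e.g. Linux), since the generated code takes r, a, b in rdi, rsi, rdx and uses unprefixed symbol names.

/* harness.c (hypothetical): cc -o harness harness.c src/asm/mul.s src/asm/square.s */
#include <stdint.h>
#include <stdio.h>

/* The prototypes the patched field_5x52_impl.h now declares. */
extern void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t *b);
extern void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a);

int main(void) {
    /* 5x52 representation: limbs of 52 bits each, least significant first. */
    uint64_t a[5] = {2, 0, 0, 0, 0};   /* the field element 2 */
    uint64_t b[5] = {3, 0, 0, 0, 0};   /* the field element 3 */
    uint64_t r[5];
    int i;

    secp256k1_fe_mul_inner(r, a, b);   /* 2*3 mod p: expect r = {6,0,0,0,0} */
    for (i = 0; i < 5; i++) printf("mul r[%d] = %llu\n", i, (unsigned long long)r[i]);

    secp256k1_fe_sqr_inner(r, b);      /* 3^2 mod p: expect r = {9,0,0,0,0} */
    for (i = 0; i < 5; i++) printf("sqr r[%d] = %llu\n", i, (unsigned long long)r[i]);
    return 0;
}

Under the normal build nothing of the sort is needed: configuring with --with-asm=x86_64 (or letting auto-detection pick x86_64) now sets enable_external_asm=yes, and the USE_ASM_X86_64 conditional added to Makefile.am compiles the two .s files into libsecp256k1_common, exactly as the existing ARM external-asm path does.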