diff --git a/Makefile.am b/Makefile.am
index e3fdf4da27..cc0403cf7a 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -36,6 +36,7 @@ noinst_HEADERS += src/ecmult_gen_compute_table.h
 noinst_HEADERS += src/ecmult_gen_compute_table_impl.h
 noinst_HEADERS += src/field_10x26.h
 noinst_HEADERS += src/field_10x26_impl.h
+noinst_HEADERS += src/dettman.h
 noinst_HEADERS += src/field_5x52.h
 noinst_HEADERS += src/field_5x52_impl.h
 noinst_HEADERS += src/field_5x52_int128_impl.h
@@ -89,12 +90,15 @@ pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = libsecp256k1.pc
 
 if USE_EXTERNAL_ASM
+if USE_ASM_X86_64
+libsecp256k1_common_la_SOURCES = src/asm/mul.s src/asm/square.s
+endif
 if USE_ASM_ARM
 libsecp256k1_common_la_SOURCES = src/asm/field_10x26_arm.s
 endif
 endif
 
-libsecp256k1_la_SOURCES = src/secp256k1.c
+libsecp256k1_la_SOURCES = src/secp256k1.c src/asm/square.s src/asm/mul.s
 libsecp256k1_la_CPPFLAGS = $(SECP_CONFIG_DEFINES)
 libsecp256k1_la_LIBADD = $(COMMON_LIB) $(PRECOMPUTED_LIB)
 libsecp256k1_la_LDFLAGS = -no-undefined -version-info $(LIB_VERSION_CURRENT):$(LIB_VERSION_REVISION):$(LIB_VERSION_AGE)
diff --git a/configure.ac b/configure.ac
index 1976789f1a..6259f92e53 100644
--- a/configure.ac
+++ b/configure.ac
@@ -264,6 +264,7 @@ else
 fi
 
 if test x"$req_asm" = x"auto"; then
+  # TODO
   SECP_64BIT_ASM_CHECK
   if test x"$has_64bit_asm" = x"yes"; then
     set_asm=x86_64
@@ -275,6 +276,7 @@ else
   set_asm=$req_asm
   case $set_asm in
   x86_64)
+    # TODO
    SECP_64BIT_ASM_CHECK
    if test x"$has_64bit_asm" != x"yes"; then
      AC_MSG_ERROR([x86_64 assembly optimization requested but not available])
@@ -296,6 +298,7 @@ enable_external_asm=no
 case $set_asm in
 x86_64)
   SECP_CONFIG_DEFINES="$SECP_CONFIG_DEFINES -DUSE_ASM_X86_64=1"
+  enable_external_asm=yes
   ;;
 arm)
   enable_external_asm=yes
@@ -438,6 +441,7 @@ AM_CONDITIONAL([ENABLE_MODULE_EXTRAKEYS], [test x"$enable_module_extrakeys" = x"
 AM_CONDITIONAL([ENABLE_MODULE_SCHNORRSIG], [test x"$enable_module_schnorrsig" = x"yes"])
 AM_CONDITIONAL([USE_EXTERNAL_ASM], [test x"$enable_external_asm" = x"yes"])
 AM_CONDITIONAL([USE_ASM_ARM], [test x"$set_asm" = x"arm"])
+AM_CONDITIONAL([USE_ASM_X86_64], [test x"$set_asm" = x"x86_64"])
 AM_CONDITIONAL([BUILD_WINDOWS], [test "$build_windows" = "yes"])
 AC_SUBST(LIB_VERSION_CURRENT, _LIB_VERSION_CURRENT)
 AC_SUBST(LIB_VERSION_REVISION, _LIB_VERSION_REVISION)
diff --git a/src/asm/mul.s b/src/asm/mul.s
new file mode 100644
index 0000000000..7b0c068ef9
--- /dev/null
+++ b/src/asm/mul.s
@@ -0,0 +1,200 @@
+.text
+.global secp256k1_fe_mul_inner
+secp256k1_fe_mul_inner:
+    mov %rdx,%rax
+    mov 0x20(%rdx),%rdx
+    mulx 0x20(%rsi),%r10,%r11
+    mov 0x20(%rax),%rdx
+    mulx 0x18(%rsi),%rcx,%r8
+    mov 0x10(%rax),%rdx
+    mov %rbx,-0x80(%rsp)
+    mulx 0x8(%rsi),%r9,%rbx
+    mov 0x18(%rax),%rdx
+    mov %rbp,-0x78(%rsp)
+    mov %r12,-0x70(%rsp)
+    mulx 0x10(%rsi),%rbp,%r12
+    mov (%rax),%rdx
+    mov %r13,-0x68(%rsp)
+    mov %r14,-0x60(%rsp)
+    mulx 0x18(%rsi),%r13,%r14
+    mov 0x8(%rsi),%rdx
+    mov %r15,-0x58(%rsp)
+    mov %rdi,-0x50(%rsp)
+    mulx 0x18(%rax),%r15,%rdi
+    mov %r10,%rdx
+    shrd $0x34,%r11,%rdx
+    mov %rdx,%r11
+    mov 0x10(%rsi),%rdx
+    mov %r8,-0x48(%rsp)
+    mov %rcx,-0x40(%rsp)
+    mulx 0x8(%rax),%r8,%rcx
+    add %r8,%r13
+    adcx %r14,%rcx
+    movabs $0x1000003d10,%rdx
+    mulx %r11,%r14,%r8
+    mov (%rsi),%rdx
+    mov %r12,-0x38(%rsp)
+    mulx 0x18(%rax),%r11,%r12
+    test %al,%al
+    adox %r9,%r13
+    adox %rcx,%rbx
+    adcx %r11,%r13
+    adcx %rbx,%r12
+    movabs $0xfffffffffffff,%rdx
+    and %rdx,%r10
+    movabs $0x1000003d10,%r9
+    mov %r9,%rdx
+    mulx %r10,%r9,%rcx
+    mov 0x8(%rax),%rdx
+    mulx 0x18(%rsi),%r11,%rbx
+    mov 0x20(%rsi),%rdx
+    mov %rbp,-0x30(%rsp)
+    mulx (%rax),%r10,%rbp
+    adox %r11,%r10
+    adox %rbp,%rbx
+    adcx %r13,%r9
+    adcx %rcx,%r12
+    mov 0x10(%rsi),%rdx
+    mulx 0x10(%rax),%r13,%rcx
+    xor %rdx,%rdx
+    adox %r13,%r10
+    adox %rbx,%rcx
+    adcx %r15,%r10
+    adcx %rcx,%rdi
+    mov $0x34,%r15d
+    bzhi %r15,%r9,%r11
+    mov 0x10(%rax),%rdx
+    mulx 0x18(%rsi),%rbp,%rbx
+    mov 0x20(%rax),%rdx
+    mulx (%rsi),%r13,%rcx
+    adox %r13,%r10
+    adox %rdi,%rcx
+    shrd $0x34,%r12,%r9
+    mov 0x20(%rsi),%rdx
+    mulx 0x8(%rax),%r12,%rdi
+    add %rbp,%r12
+    adcx %rdi,%rbx
+    xor %rdx,%rdx
+    adox %r10,%r9
+    adox %rdx,%rcx
+    adcx %r9,%r14
+    adcx %r8,%rcx
+    xor %r8,%r8
+    adox -0x30(%rsp),%r12
+    adox -0x38(%rsp),%rbx
+    bzhi %r15,%r14,%rdx
+    mov %rdx,%rbp
+    mov 0x20(%rax),%rdx
+    mulx 0x8(%rsi),%r13,%r10
+    mov $0x30,%edx
+    bzhi %rdx,%rbp,%rdi
+    mov 0x10(%rax),%rdx
+    mulx 0x20(%rsi),%r9,%r8
+    adox %r13,%r12
+    adox %rbx,%r10
+    shrd $0x34,%rcx,%r14
+    xor %rdx,%rdx
+    adox %r12,%r14
+    adox %rdx,%r10
+    bzhi %r15,%r14,%rcx
+    shrd $0x34,%r10,%r14
+    mov 0x18(%rsi),%rdx
+    mulx 0x18(%rax),%rbx,%r13
+    mov 0x20(%rax),%rdx
+    mulx 0x10(%rsi),%r12,%r10
+    add %rbx,%r9
+    adcx %r8,%r13
+    shl $0x4,%rcx
+    add %r12,%r9
+    adcx %r13,%r10
+    shr $0x30,%rbp
+    lea (%rcx,%rbp,1),%rcx
+    movabs $0x1000003d1,%rdx
+    mulx %rcx,%r8,%rbx
+    mov (%rax),%rdx
+    mulx (%rsi),%r12,%r13
+    mov 0x18(%rax),%rdx
+    mulx 0x20(%rsi),%rbp,%rcx
+    test %al,%al
+    adox %r12,%r8
+    adox %rbx,%r13
+    adcx -0x40(%rsp),%rbp
+    adcx -0x48(%rsp),%rcx
+    mov (%rsi),%rdx
+    mulx 0x8(%rax),%rbx,%r12
+    xor %rdx,%rdx
+    adox %r9,%r14
+    adox %rdx,%r10
+    mov %r8,%r9
+    shrd $0x34,%r13,%r9
+    bzhi %r15,%r14,%r13
+    shrd $0x34,%r10,%r14
+    xor %r10,%r10
+    adox %rbp,%r14
+    adox %r10,%rcx
+    mov 0x8(%rsi),%rdx
+    mulx (%rax),%rbp,%r10
+    mov 0x8(%rax),%rdx
+    mov %rdi,-0x28(%rsp)
+    mulx 0x8(%rsi),%r15,%rdi
+    mov 0x10(%rsi),%rdx
+    mov %r11,-0x20(%rsp)
+    mov %rcx,-0x18(%rsp)
+    mulx (%rax),%r11,%rcx
+    adcx %r15,%r11
+    adcx %rcx,%rdi
+    test %al,%al
+    adox %rbx,%rbp
+    adox %r10,%r12
+    adcx %rbp,%r9
+    adc $0x0,%r12
+    movabs $0x1000003d10,%rdx
+    mulx %r13,%rbx,%r10
+    add %r9,%rbx
+    adcx %r10,%r12
+    mov 0x10(%rax),%rdx
+    mulx (%rsi),%r13,%r15
+    mov $0x34,%edx
+    bzhi %rdx,%rbx,%rcx
+    bzhi %rdx,%r14,%rbp
+    movabs $0x1000003d10,%r9
+    mov %rbp,%rdx
+    mulx %r9,%rbp,%r10
+    shrd $0x34,%r12,%rbx
+    mov -0x50(%rsp),%r12
+    mov %rcx,0x8(%r12)
+    add %r13,%r11
+    adcx %rdi,%r15
+    xor %rdi,%rdi
+    adox %r11,%rbx
+    adox %rdi,%r15
+    adcx %rbx,%rbp
+    adcx %r10,%r15
+    mov %rbp,%r13
+    shrd $0x34,%r15,%r13
+    add -0x20(%rsp),%r13
+    mov -0x18(%rsp),%rcx
+    shrd $0x34,%rcx,%r14
+    mov %r9,%rdx
+    mulx %r14,%r9,%rcx
+    xor %r10,%r10
+    adox %r13,%r9
+    adox %r10,%rcx
+    movabs $0xfffffffffffff,%rdi
+    mov %r9,%r11
+    and %rdi,%r11
+    shrd $0x34,%rcx,%r9
+    add -0x28(%rsp),%r9
+    and %rdi,%r8
+    mov %r9,0x20(%r12)
+    mov %r8,(%r12)
+    and %rdi,%rbp
+    mov %rbp,0x10(%r12)
+    mov %r11,0x18(%r12)
+    mov -0x80(%rsp),%rbx
+    mov -0x78(%rsp),%rbp
+    mov -0x70(%rsp),%r12
+    mov -0x68(%rsp),%r13
+    mov -0x60(%rsp),%r14
+    mov -0x58(%rsp),%r15
+    ret
diff --git a/src/asm/square.s b/src/asm/square.s
new file mode 100644
index 0000000000..54b0898722
--- /dev/null
+++ b/src/asm/square.s
@@ -0,0 +1,161 @@
+.text
+.global secp256k1_fe_sqr_inner
+secp256k1_fe_sqr_inner:
+    mov $0x1,%eax
+    shlx %rax,0x10(%rsi),%r10
+    mov 0x20(%rsi),%rdx
+    mulx %rdx,%r11,%rcx
+    shlx %rax,(%rsi),%rdx
+    mov $0x34,%r8d
+    bzhi %r8,%r11,%r9
+    mov %rbx,-0x80(%rsp)
+    mulx 0x20(%rsi),%rax,%rbx
+    mov %rbp,-0x78(%rsp)
+    mov $0x1,%ebp
+    mov %r12,-0x70(%rsp)
+    shlx %rbp,0x8(%rsi),%r12
+    mov %rdx,%rbp
+    mov 0x18(%rsi),%rdx
+    mov %r13,-0x68(%rsp)
+    mov %r14,-0x60(%rsp)
+    mulx %r12,%r13,%r14
+    mov (%rsi),%rdx
+    mov %r15,-0x58(%rsp)
+    mulx %rdx,%r15,%r8
+    mov 0x10(%rsi),%rdx
+    mov %rdi,-0x50(%rsp)
+    mov %r8,-0x48(%rsp)
+    mulx %rdx,%rdi,%r8
+    adox %r13,%rdi
+    adox %r8,%r14
+    mov %r12,%rdx
+    mulx 0x10(%rsi),%r12,%r13
+    add %rax,%rdi
+    adcx %r14,%rbx
+    xchg %rdx,%rbp
+    mulx 0x18(%rsi),%rax,%r8
+    xor %r14,%r14
+    adox %rax,%r12
+    adox %r13,%r8
+    movabs $0x1000003d10,%r13
+    xchg %rdx,%r13
+    mulx %r9,%rax,%r14
+    shrd $0x34,%rcx,%r11
+    xor %rcx,%rcx
+    adox %r12,%rax
+    adox %r14,%r8
+    mov %rax,%r9
+    shrd $0x34,%r8,%r9
+    test %al,%al
+    adox %rdi,%r9
+    adox %rcx,%rbx
+    mulx %r11,%rdi,%r12
+    mov 0x20(%rsi),%rdx
+    mulx %rbp,%r14,%r11
+    adcx %r9,%rdi
+    adcx %r12,%rbx
+    mov 0x18(%rsi),%rdx
+    mulx %r10,%rbp,%r8
+    mov %rdi,%rdx
+    shrd $0x34,%rbx,%rdx
+    xor %r9,%r9
+    adox %r14,%rbp
+    adox %r8,%r11
+    adcx %rbp,%rdx
+    adc $0x0,%r11
+    mov %rdx,%rcx
+    mov 0x20(%rsi),%rdx
+    mulx %r10,%r12,%r14
+    mov 0x18(%rsi),%rdx
+    mulx %rdx,%r10,%rbx
+    movabs $0xfffffffffffff,%rdx
+    and %rdx,%rdi
+    adox %r12,%r10
+    adox %rbx,%r14
+    mov %rcx,%r8
+    shrd $0x34,%r11,%r8
+    add %r10,%r8
+    adc $0x0,%r14
+    and %rdx,%rcx
+    shl $0x4,%rcx
+    mov %r8,%rbp
+    shrd $0x34,%r14,%rbp
+    mov %rdi,%r11
+    shr $0x30,%r11
+    lea (%rcx,%r11,1),%rcx
+    mov %r13,%rdx
+    mulx 0x8(%rsi),%r13,%r12
+    movabs $0x1000003d1,%rbx
+    xchg %rdx,%rcx
+    mulx %rbx,%r10,%r14
+    mov 0x18(%rsi),%r11
+    mov %r11,%rdx
+    shl %rdx
+    xor %r11,%r11
+    adox %r15,%r10
+    adox -0x48(%rsp),%r14
+    mulx 0x20(%rsi),%r9,%r15
+    movabs $0xfffffffffffff,%rdx
+    and %rdx,%r8
+    movabs $0x1000003d10,%r11
+    mov %r11,%rdx
+    mulx %r8,%r11,%rbx
+    mov %r10,%r8
+    shrd $0x34,%r14,%r8
+    add %r13,%r8
+    adc $0x0,%r12
+    test %al,%al
+    adox %r8,%r11
+    adox %rbx,%r12
+    mov %r11,%r13
+    shrd $0x34,%r12,%r13
+    test %al,%al
+    adox %r9,%rbp
+    mov $0x0,%r14d
+    adox %r14,%r15
+    movabs $0xfffffffffffff,%r9
+    and %r9,%r11
+    mov %rbp,%rbx
+    shrd $0x34,%r15,%rbx
+    mulx %rbx,%r8,%r12
+    and %r9,%r10
+    mov %rcx,%rdx
+    mulx 0x10(%rsi),%rcx,%r15
+    mov -0x50(%rsp),%rdx
+    mov %r10,(%rdx)
+    mov %rdx,%rbx
+    mov 0x8(%rsi),%rdx
+    mulx %rdx,%r10,%r14
+    adox %rcx,%r10
+    adox %r14,%r15
+    adcx %r10,%r13
+    adc $0x0,%r15
+    and %r9,%rax
+    and %r9,%rbp
+    movabs $0x1000003d10,%rdx
+    mulx %rbp,%rcx,%r14
+    adox %r13,%rcx
+    adox %r14,%r15
+    mov %rcx,%r10
+    and %r9,%r10
+    shrd $0x34,%r15,%rcx
+    lea (%rax,%rcx,1),%rax
+    mov %r10,0x10(%rbx)
+    add %rax,%r8
+    adc $0x0,%r12
+    mov %r8,%r13
+    shrd $0x34,%r12,%r13
+    mov $0x30,%ebp
+    bzhi %rbp,%rdi,%r14
+    lea (%r14,%r13,1),%r14
+    mov %r14,0x20(%rbx)
+    and %r9,%r8
+    mov %r11,0x8(%rbx)
+    mov %r8,0x18(%rbx)
+    mov -0x80(%rsp),%rbx
+    mov -0x78(%rsp),%rbp
+    mov -0x70(%rsp),%r12
+    mov -0x68(%rsp),%r13
+    mov -0x60(%rsp),%r14
+    mov -0x58(%rsp),%r15
+    ret
diff --git a/src/dettman.h b/src/dettman.h
new file mode 100644
index 0000000000..c0efb9221b
--- /dev/null
+++ b/src/dettman.h
@@ -0,0 +1,5 @@
+#include <stdint.h>
+extern void fiat_secp256k1_dettman_mul(uint64_t *r, const uint64_t *a,
+                                       const uint64_t *b);
+
+extern void fiat_secp256k1_dettman_square(uint64_t *r, const uint64_t *a);
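Aside (illustrative, not part of the patch): the two prototypes in src/dettman.h operate on the library's existing 5x52 field representation, i.e. five uint64_t limbs where a value x is x = l[0] + l[1]*2^52 + l[2]*2^104 + l[3]*2^156 + l[4]*2^208, with the top limb only 48 bits wide when fully reduced. A minimal caller-side sketch, with hypothetical limb values, assuming the object files from src/asm (or the fiat C fallback) are linked in:

    #include <stdint.h>
    #include "dettman.h"

    int main(void) {
        /* the field element 1 in 5x52 limb form */
        const uint64_t one[5] = {1, 0, 0, 0, 0};
        uint64_t r[5];
        fiat_secp256k1_dettman_mul(r, one, one); /* r = 1*1 mod p = {1,0,0,0,0} */
        fiat_secp256k1_dettman_square(r, one);   /* r = 1^2 mod p = {1,0,0,0,0} */
        return 0;
    }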
diff --git a/src/field_5x52_asm_impl.h b/src/field_5x52_asm_impl.h
index a2118044ab..5e38d0a8bf 100644
--- a/src/field_5x52_asm_impl.h
+++ b/src/field_5x52_asm_impl.h
@@ -7,496 +7,23 @@
 /**
  * Changelog:
  * - March 2013, Diederik Huys: original version
- * - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm
+ * - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel
+ *   multiplication algorithm
  * - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly
  */
 
 #ifndef SECP256K1_FIELD_INNER5X52_IMPL_H
 #define SECP256K1_FIELD_INNER5X52_IMPL_H
 
-SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
-/**
- * Registers: rdx:rax = multiplication accumulator
- *            r9:r8   = c
- *            r15:rcx = d
- *            r10-r14 = a0-a4
- *            rbx = b
- *            rdi = r
- *            rsi = a / t?
- */
-  uint64_t tmp1, tmp2, tmp3;
-__asm__ __volatile__(
-    "movq 0(%%rsi),%%r10\n"
-    "movq 8(%%rsi),%%r11\n"
-    "movq 16(%%rsi),%%r12\n"
-    "movq 24(%%rsi),%%r13\n"
-    "movq 32(%%rsi),%%r14\n"
-
-    /* d += a3 * b0 */
-    "movq 0(%%rbx),%%rax\n"
-    "mulq %%r13\n"
-    "movq %%rax,%%rcx\n"
-    "movq %%rdx,%%r15\n"
-    /* d += a2 * b1 */
-    "movq 8(%%rbx),%%rax\n"
-    "mulq %%r12\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* d += a1 * b2 */
-    "movq 16(%%rbx),%%rax\n"
-    "mulq %%r11\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* d = a0 * b3 */
-    "movq 24(%%rbx),%%rax\n"
-    "mulq %%r10\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* c = a4 * b4 */
-    "movq 32(%%rbx),%%rax\n"
-    "mulq %%r14\n"
-    "movq %%rax,%%r8\n"
-    "movq %%rdx,%%r9\n"
-    /* d += (c & M) * R */
-    "movq $0xfffffffffffff,%%rdx\n"
-    "andq %%rdx,%%rax\n"
-    "movq $0x1000003d10,%%rdx\n"
-    "mulq %%rdx\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* c >>= 52 (%%r8 only) */
-    "shrdq $52,%%r9,%%r8\n"
-    /* t3 (tmp1) = d & M */
-    "movq %%rcx,%%rsi\n"
-    "movq $0xfffffffffffff,%%rdx\n"
-    "andq %%rdx,%%rsi\n"
-    "movq %%rsi,%q1\n"
-    /* d >>= 52 */
-    "shrdq $52,%%r15,%%rcx\n"
-    "xorq %%r15,%%r15\n"
-    /* d += a4 * b0 */
-    "movq 0(%%rbx),%%rax\n"
-    "mulq %%r14\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* d += a3 * b1 */
-    "movq 8(%%rbx),%%rax\n"
-    "mulq %%r13\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* d += a2 * b2 */
-    "movq 16(%%rbx),%%rax\n"
-    "mulq %%r12\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* d += a1 * b3 */
-    "movq 24(%%rbx),%%rax\n"
-    "mulq %%r11\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* d += a0 * b4 */
-    "movq 32(%%rbx),%%rax\n"
-    "mulq %%r10\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* d += c * R */
-    "movq %%r8,%%rax\n"
-    "movq $0x1000003d10,%%rdx\n"
-    "mulq %%rdx\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* t4 = d & M (%%rsi) */
-    "movq %%rcx,%%rsi\n"
-    "movq $0xfffffffffffff,%%rdx\n"
-    "andq %%rdx,%%rsi\n"
-    /* d >>= 52 */
-    "shrdq $52,%%r15,%%rcx\n"
-    "xorq %%r15,%%r15\n"
-    /* tx = t4 >> 48 (tmp3) */
-    "movq %%rsi,%%rax\n"
-    "shrq $48,%%rax\n"
-    "movq %%rax,%q3\n"
-    /* t4 &= (M >> 4) (tmp2) */
-    "movq $0xffffffffffff,%%rax\n"
-    "andq %%rax,%%rsi\n"
-    "movq %%rsi,%q2\n"
-    /* c = a0 * b0 */
-    "movq 0(%%rbx),%%rax\n"
-    "mulq %%r10\n"
-    "movq %%rax,%%r8\n"
-    "movq %%rdx,%%r9\n"
-    /* d += a4 * b1 */
-    "movq 8(%%rbx),%%rax\n"
-    "mulq %%r14\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* d += a3 * b2 */
-    "movq 16(%%rbx),%%rax\n"
-    "mulq %%r13\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* d += a2 * b3 */
-    "movq 24(%%rbx),%%rax\n"
-    "mulq %%r12\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* d += a1 * b4 */
-    "movq 32(%%rbx),%%rax\n"
-    "mulq %%r11\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* u0 = d & M (%%rsi) */
-    "movq %%rcx,%%rsi\n"
-    "movq $0xfffffffffffff,%%rdx\n"
-    "andq %%rdx,%%rsi\n"
-    /* d >>= 52 */
-    "shrdq $52,%%r15,%%rcx\n"
-    "xorq %%r15,%%r15\n"
-    /* u0 = (u0 << 4) | tx (%%rsi) */
-    "shlq $4,%%rsi\n"
-    "movq %q3,%%rax\n"
-    "orq %%rax,%%rsi\n"
-    /* c += u0 * (R >> 4) */
-    "movq $0x1000003d1,%%rax\n"
-    "mulq %%rsi\n"
-    "addq %%rax,%%r8\n"
-    "adcq %%rdx,%%r9\n"
-    /* r[0] = c & M */
-    "movq %%r8,%%rax\n"
-    "movq $0xfffffffffffff,%%rdx\n"
-    "andq %%rdx,%%rax\n"
-    "movq %%rax,0(%%rdi)\n"
-    /* c >>= 52 */
-    "shrdq $52,%%r9,%%r8\n"
-    "xorq %%r9,%%r9\n"
-    /* c += a1 * b0 */
-    "movq 0(%%rbx),%%rax\n"
-    "mulq %%r11\n"
-    "addq %%rax,%%r8\n"
-    "adcq %%rdx,%%r9\n"
-    /* c += a0 * b1 */
-    "movq 8(%%rbx),%%rax\n"
-    "mulq %%r10\n"
-    "addq %%rax,%%r8\n"
-    "adcq %%rdx,%%r9\n"
-    /* d += a4 * b2 */
-    "movq 16(%%rbx),%%rax\n"
-    "mulq %%r14\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* d += a3 * b3 */
-    "movq 24(%%rbx),%%rax\n"
-    "mulq %%r13\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* d += a2 * b4 */
-    "movq 32(%%rbx),%%rax\n"
-    "mulq %%r12\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* c += (d & M) * R */
-    "movq %%rcx,%%rax\n"
-    "movq $0xfffffffffffff,%%rdx\n"
-    "andq %%rdx,%%rax\n"
-    "movq $0x1000003d10,%%rdx\n"
-    "mulq %%rdx\n"
-    "addq %%rax,%%r8\n"
-    "adcq %%rdx,%%r9\n"
-    /* d >>= 52 */
-    "shrdq $52,%%r15,%%rcx\n"
-    "xorq %%r15,%%r15\n"
-    /* r[1] = c & M */
-    "movq %%r8,%%rax\n"
-    "movq $0xfffffffffffff,%%rdx\n"
-    "andq %%rdx,%%rax\n"
-    "movq %%rax,8(%%rdi)\n"
-    /* c >>= 52 */
-    "shrdq $52,%%r9,%%r8\n"
-    "xorq %%r9,%%r9\n"
-    /* c += a2 * b0 */
-    "movq 0(%%rbx),%%rax\n"
-    "mulq %%r12\n"
-    "addq %%rax,%%r8\n"
-    "adcq %%rdx,%%r9\n"
-    /* c += a1 * b1 */
-    "movq 8(%%rbx),%%rax\n"
-    "mulq %%r11\n"
-    "addq %%rax,%%r8\n"
-    "adcq %%rdx,%%r9\n"
-    /* c += a0 * b2 (last use of %%r10 = a0) */
-    "movq 16(%%rbx),%%rax\n"
-    "mulq %%r10\n"
-    "addq %%rax,%%r8\n"
-    "adcq %%rdx,%%r9\n"
-    /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */
-    "movq %q2,%%rsi\n"
-    "movq %q1,%%r10\n"
-    /* d += a4 * b3 */
-    "movq 24(%%rbx),%%rax\n"
-    "mulq %%r14\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* d += a3 * b4 */
-    "movq 32(%%rbx),%%rax\n"
-    "mulq %%r13\n"
-    "addq %%rax,%%rcx\n"
-    "adcq %%rdx,%%r15\n"
-    /* c += (d & M) * R */
-    "movq %%rcx,%%rax\n"
-    "movq $0xfffffffffffff,%%rdx\n"
-    "andq %%rdx,%%rax\n"
-    "movq $0x1000003d10,%%rdx\n"
-    "mulq %%rdx\n"
-    "addq %%rax,%%r8\n"
-    "adcq %%rdx,%%r9\n"
-    /* d >>= 52 (%%rcx only) */
-    "shrdq $52,%%r15,%%rcx\n"
-    /* r[2] = c & M */
-    "movq %%r8,%%rax\n"
-    "movq $0xfffffffffffff,%%rdx\n"
-    "andq %%rdx,%%rax\n"
-    "movq %%rax,16(%%rdi)\n"
-    /* c >>= 52 */
-    "shrdq $52,%%r9,%%r8\n"
-    "xorq %%r9,%%r9\n"
-    /* c += t3 */
-    "addq %%r10,%%r8\n"
-    /* c += d * R */
-    "movq %%rcx,%%rax\n"
-    "movq $0x1000003d10,%%rdx\n"
-    "mulq %%rdx\n"
-    "addq %%rax,%%r8\n"
-    "adcq %%rdx,%%r9\n"
-    /* r[3] = c & M */
-    "movq %%r8,%%rax\n"
-    "movq $0xfffffffffffff,%%rdx\n"
-    "andq %%rdx,%%rax\n"
-    "movq %%rax,24(%%rdi)\n"
-    /* c >>= 52 (%%r8 only) */
-    "shrdq $52,%%r9,%%r8\n"
-    /* c += t4 (%%r8 only) */
-    "addq %%rsi,%%r8\n"
-    /* r[4] = c */
-    "movq %%r8,32(%%rdi)\n"
-: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
-: "b"(b), "D"(r)
-: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
-);
+// #include "dettman.h"
+SECP256K1_INLINE static void
+secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a,
+                       const uint64_t *SECP256K1_RESTRICT b) {
+  fiat_secp256k1_dettman_mul(r, a, b);
 }
-
-SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
-/**
- * Registers: rdx:rax = multiplication accumulator
- *            r9:r8   = c
- *            rcx:rbx = d
- *            r10-r14 = a0-a4
- *            r15 = M (0xfffffffffffff)
- *            rdi = r
- *            rsi = a / t?
- */
-  uint64_t tmp1, tmp2, tmp3;
-__asm__ __volatile__(
-    "movq 0(%%rsi),%%r10\n"
-    "movq 8(%%rsi),%%r11\n"
-    "movq 16(%%rsi),%%r12\n"
-    "movq 24(%%rsi),%%r13\n"
-    "movq 32(%%rsi),%%r14\n"
-    "movq $0xfffffffffffff,%%r15\n"
-
-    /* d = (a0*2) * a3 */
-    "leaq (%%r10,%%r10,1),%%rax\n"
-    "mulq %%r13\n"
-    "movq %%rax,%%rbx\n"
-    "movq %%rdx,%%rcx\n"
-    /* d += (a1*2) * a2 */
-    "leaq (%%r11,%%r11,1),%%rax\n"
-    "mulq %%r12\n"
-    "addq %%rax,%%rbx\n"
-    "adcq %%rdx,%%rcx\n"
-    /* c = a4 * a4 */
-    "movq %%r14,%%rax\n"
-    "mulq %%r14\n"
-    "movq %%rax,%%r8\n"
-    "movq %%rdx,%%r9\n"
-    /* d += (c & M) * R */
-    "andq %%r15,%%rax\n"
-    "movq $0x1000003d10,%%rdx\n"
-    "mulq %%rdx\n"
-    "addq %%rax,%%rbx\n"
-    "adcq %%rdx,%%rcx\n"
-    /* c >>= 52 (%%r8 only) */
-    "shrdq $52,%%r9,%%r8\n"
-    /* t3 (tmp1) = d & M */
-    "movq %%rbx,%%rsi\n"
-    "andq %%r15,%%rsi\n"
-    "movq %%rsi,%q1\n"
-    /* d >>= 52 */
-    "shrdq $52,%%rcx,%%rbx\n"
-    "xorq %%rcx,%%rcx\n"
-    /* a4 *= 2 */
-    "addq %%r14,%%r14\n"
-    /* d += a0 * a4 */
-    "movq %%r10,%%rax\n"
-    "mulq %%r14\n"
-    "addq %%rax,%%rbx\n"
-    "adcq %%rdx,%%rcx\n"
-    /* d+= (a1*2) * a3 */
-    "leaq (%%r11,%%r11,1),%%rax\n"
-    "mulq %%r13\n"
-    "addq %%rax,%%rbx\n"
-    "adcq %%rdx,%%rcx\n"
-    /* d += a2 * a2 */
-    "movq %%r12,%%rax\n"
-    "mulq %%r12\n"
-    "addq %%rax,%%rbx\n"
-    "adcq %%rdx,%%rcx\n"
-    /* d += c * R */
-    "movq %%r8,%%rax\n"
-    "movq $0x1000003d10,%%rdx\n"
-    "mulq %%rdx\n"
-    "addq %%rax,%%rbx\n"
-    "adcq %%rdx,%%rcx\n"
-    /* t4 = d & M (%%rsi) */
-    "movq %%rbx,%%rsi\n"
-    "andq %%r15,%%rsi\n"
-    /* d >>= 52 */
-    "shrdq $52,%%rcx,%%rbx\n"
-    "xorq %%rcx,%%rcx\n"
-    /* tx = t4 >> 48 (tmp3) */
-    "movq %%rsi,%%rax\n"
-    "shrq $48,%%rax\n"
-    "movq %%rax,%q3\n"
-    /* t4 &= (M >> 4) (tmp2) */
-    "movq $0xffffffffffff,%%rax\n"
-    "andq %%rax,%%rsi\n"
-    "movq %%rsi,%q2\n"
-    /* c = a0 * a0 */
-    "movq %%r10,%%rax\n"
-    "mulq %%r10\n"
-    "movq %%rax,%%r8\n"
-    "movq %%rdx,%%r9\n"
-    /* d += a1 * a4 */
-    "movq %%r11,%%rax\n"
-    "mulq %%r14\n"
-    "addq %%rax,%%rbx\n"
-    "adcq %%rdx,%%rcx\n"
-    /* d += (a2*2) * a3 */
-    "leaq (%%r12,%%r12,1),%%rax\n"
-    "mulq %%r13\n"
-    "addq %%rax,%%rbx\n"
-    "adcq %%rdx,%%rcx\n"
-    /* u0 = d & M (%%rsi) */
-    "movq %%rbx,%%rsi\n"
-    "andq %%r15,%%rsi\n"
-    /* d >>= 52 */
-    "shrdq $52,%%rcx,%%rbx\n"
-    "xorq %%rcx,%%rcx\n"
-    /* u0 = (u0 << 4) | tx (%%rsi) */
-    "shlq $4,%%rsi\n"
-    "movq %q3,%%rax\n"
-    "orq %%rax,%%rsi\n"
-    /* c += u0 * (R >> 4) */
-    "movq $0x1000003d1,%%rax\n"
-    "mulq %%rsi\n"
-    "addq %%rax,%%r8\n"
-    "adcq %%rdx,%%r9\n"
-    /* r[0] = c & M */
-    "movq %%r8,%%rax\n"
-    "andq %%r15,%%rax\n"
-    "movq %%rax,0(%%rdi)\n"
-    /* c >>= 52 */
-    "shrdq $52,%%r9,%%r8\n"
-    "xorq %%r9,%%r9\n"
-    /* a0 *= 2 */
-    "addq %%r10,%%r10\n"
-    /* c += a0 * a1 */
-    "movq %%r10,%%rax\n"
-    "mulq %%r11\n"
-    "addq %%rax,%%r8\n"
-    "adcq %%rdx,%%r9\n"
-    /* d += a2 * a4 */
-    "movq %%r12,%%rax\n"
-    "mulq %%r14\n"
-    "addq %%rax,%%rbx\n"
-    "adcq %%rdx,%%rcx\n"
-    /* d += a3 * a3 */
-    "movq %%r13,%%rax\n"
-    "mulq %%r13\n"
-    "addq %%rax,%%rbx\n"
-    "adcq %%rdx,%%rcx\n"
-    /* c += (d & M) * R */
-    "movq %%rbx,%%rax\n"
-    "andq %%r15,%%rax\n"
-    "movq $0x1000003d10,%%rdx\n"
-    "mulq %%rdx\n"
-    "addq %%rax,%%r8\n"
-    "adcq %%rdx,%%r9\n"
-    /* d >>= 52 */
-    "shrdq $52,%%rcx,%%rbx\n"
-    "xorq %%rcx,%%rcx\n"
-    /* r[1] = c & M */
-    "movq %%r8,%%rax\n"
-    "andq %%r15,%%rax\n"
-    "movq %%rax,8(%%rdi)\n"
-    /* c >>= 52 */
-    "shrdq $52,%%r9,%%r8\n"
-    "xorq %%r9,%%r9\n"
-    /* c += a0 * a2 (last use of %%r10) */
-    "movq %%r10,%%rax\n"
-    "mulq %%r12\n"
-    "addq %%rax,%%r8\n"
-    "adcq %%rdx,%%r9\n"
-    /* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */
-    "movq %q2,%%rsi\n"
-    "movq %q1,%%r10\n"
-    /* c += a1 * a1 */
-    "movq %%r11,%%rax\n"
-    "mulq %%r11\n"
-    "addq %%rax,%%r8\n"
-    "adcq %%rdx,%%r9\n"
-    /* d += a3 * a4 */
-    "movq %%r13,%%rax\n"
-    "mulq %%r14\n"
-    "addq %%rax,%%rbx\n"
-    "adcq %%rdx,%%rcx\n"
-    /* c += (d & M) * R */
-    "movq %%rbx,%%rax\n"
-    "andq %%r15,%%rax\n"
-    "movq $0x1000003d10,%%rdx\n"
-    "mulq %%rdx\n"
-    "addq %%rax,%%r8\n"
-    "adcq %%rdx,%%r9\n"
-    /* d >>= 52 (%%rbx only) */
-    "shrdq $52,%%rcx,%%rbx\n"
-    /* r[2] = c & M */
-    "movq %%r8,%%rax\n"
-    "andq %%r15,%%rax\n"
-    "movq %%rax,16(%%rdi)\n"
-    /* c >>= 52 */
-    "shrdq $52,%%r9,%%r8\n"
-    "xorq %%r9,%%r9\n"
-    /* c += t3 */
-    "addq %%r10,%%r8\n"
-    /* c += d * R */
-    "movq %%rbx,%%rax\n"
-    "movq $0x1000003d10,%%rdx\n"
-    "mulq %%rdx\n"
-    "addq %%rax,%%r8\n"
-    "adcq %%rdx,%%r9\n"
-    /* r[3] = c & M */
-    "movq %%r8,%%rax\n"
-    "andq %%r15,%%rax\n"
-    "movq %%rax,24(%%rdi)\n"
-    /* c >>= 52 (%%r8 only) */
-    "shrdq $52,%%r9,%%r8\n"
-    /* c += t4 (%%r8 only) */
-    "addq %%rsi,%%r8\n"
-    /* r[4] = c */
-    "movq %%r8,32(%%rdi)\n"
-: "+S"(a), "=m"(tmp1), "=m"(tmp2), "=m"(tmp3)
-: "D"(r)
-: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory"
-);
+SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r,
+                                                    const uint64_t *a) {
+  fiat_secp256k1_dettman_square(r, a);
 }
 
 #endif /* SECP256K1_FIELD_INNER5X52_IMPL_H */
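Before reading the generated file that follows, it helps to pin down where its magic constants come from: with p = 2^256 - 4294968273, reduction uses 2^256 ≡ 4294968273 = 0x1000003D1 (mod p), and the 52-bit-limb carry chain folds that constant in pre-shifted left by 4 because the top limb holds only 48 bits. A standalone sanity check of this arithmetic (illustrative only, not part of the patch):

    #include <stdint.h>
    #include <assert.h>

    int main(void) {
        /* 2^256 - p = 4294968273 = 2^32 + 977 = 0x1000003D1 */
        assert(UINT64_C(4294968273) == UINT64_C(0x1000003D1));
        /* the 52-bit pipeline uses the constant shifted by the 4 "missing"
           bits of the 48-bit top limb */
        assert((UINT64_C(0x1000003D1) << 4) == UINT64_C(0x1000003D10));
        return 0;
    }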
diff --git a/src/field_5x52_fiat_dettman_impl.h b/src/field_5x52_fiat_dettman_impl.h
new file mode 100644
index 0000000000..93b1682678
--- /dev/null
+++ b/src/field_5x52_fiat_dettman_impl.h
@@ -0,0 +1,271 @@
+/* Autogenerated: 'src/ExtractionOCaml/dettman_multiplication' --inline --static
+ * --use-value-barrier secp256k1_dettman 64 5 48 '2^256 - 4294968273' mul square
+ */
+/* curve description: secp256k1_dettman */
+/* machine_wordsize = 64 (from "64") */
+/* requested operations: mul, square */
+/* n = 5 (from "5") */
+/* last_limb_width = 48 (from "48") */
+/* s-c = 2^256 - [(1, 4294968273)] (from "2^256 - 4294968273") */
+/* inbounds_multiplier: None (from "") */
+/* */
+/* Computed values: */
+/* */
+/* */
+
+#include <stdint.h>
+#if defined(__GNUC__) || defined(__clang__)
+#define FIAT_SECP256K1_DETTMAN_FIAT_EXTENSION __extension__
+#define FIAT_SECP256K1_DETTMAN_FIAT_INLINE __inline__
+#else
+#define FIAT_SECP256K1_DETTMAN_FIAT_EXTENSION
+#define FIAT_SECP256K1_DETTMAN_FIAT_INLINE
+#endif
+
+FIAT_SECP256K1_DETTMAN_FIAT_EXTENSION typedef signed __int128
+    fiat_secp256k1_dettman_int128;
+FIAT_SECP256K1_DETTMAN_FIAT_EXTENSION typedef unsigned __int128
+    fiat_secp256k1_dettman_uint128;
+
+#if (-1 & 3) != 3
+#error "This code only works on a two's complement system"
+#endif
+
+/*
+ * The function fiat_secp256k1_dettman_mul multiplies two field elements.
+ *
+ * Postconditions:
+ *   eval out1 mod
+ * 115792089237316195423570985008687907853269984665640564039457584007908834671663
+ * = (eval arg1 * eval arg2) mod
+ * 115792089237316195423570985008687907853269984665640564039457584007908834671663
+ *
+ * Input Bounds:
+ *   arg1: [[0x0 ~> 0x1ffffffffffffe], [0x0 ~> 0x1ffffffffffffe], [0x0 ~>
+ * 0x1ffffffffffffe], [0x0 ~> 0x1ffffffffffffe], [0x0 ~> 0x1fffffffffffe]] arg2:
+ * [[0x0 ~> 0x1ffffffffffffe], [0x0 ~> 0x1ffffffffffffe], [0x0 ~>
+ * 0x1ffffffffffffe], [0x0 ~> 0x1ffffffffffffe], [0x0 ~> 0x1fffffffffffe]]
+ * Output Bounds:
+ *   out1: [[0x0 ~> 0x1ffffffffffffe], [0x0 ~> 0x1ffffffffffffe], [0x0 ~>
+ * 0x1ffffffffffffe], [0x0 ~> 0x1ffffffffffffe], [0x0 ~> 0x17fffffffffff]]
+ */
+static FIAT_SECP256K1_DETTMAN_FIAT_INLINE void
+fiat_secp256k1_dettman_mul(uint64_t out1[5], const uint64_t arg1[5],
+                           const uint64_t arg2[5]) {
+  fiat_secp256k1_dettman_uint128 x1;
+  uint64_t x2;
+  uint64_t x3;
+  fiat_secp256k1_dettman_uint128 x4;
+  uint64_t x5;
+  uint64_t x6;
+  fiat_secp256k1_dettman_uint128 x7;
+  uint64_t x8;
+  uint64_t x9;
+  uint64_t x10;
+  uint64_t x11;
+  fiat_secp256k1_dettman_uint128 x12;
+  uint64_t x13;
+  uint64_t x14;
+  fiat_secp256k1_dettman_uint128 x15;
+  uint64_t x16;
+  uint64_t x17;
+  fiat_secp256k1_dettman_uint128 x18;
+  uint64_t x19;
+  uint64_t x20;
+  fiat_secp256k1_dettman_uint128 x21;
+  uint64_t x22;
+  uint64_t x23;
+  fiat_secp256k1_dettman_uint128 x24;
+  uint64_t x25;
+  uint64_t x26;
+  fiat_secp256k1_dettman_uint128 x27;
+  uint64_t x28;
+  uint64_t x29;
+  fiat_secp256k1_dettman_uint128 x30;
+  uint64_t x31;
+  uint64_t x32;
+  uint64_t x33;
+  x1 = ((fiat_secp256k1_dettman_uint128)(arg1[4]) * (arg2[4]));
+  x2 = (uint64_t)(x1 >> 52);
+  x3 = (uint64_t)(x1 & UINT64_C(0xfffffffffffff));
+  x4 = ((((fiat_secp256k1_dettman_uint128)(arg1[0]) * (arg2[3])) +
+         (((fiat_secp256k1_dettman_uint128)(arg1[1]) * (arg2[2])) +
+          (((fiat_secp256k1_dettman_uint128)(arg1[2]) * (arg2[1])) +
+           ((fiat_secp256k1_dettman_uint128)(arg1[3]) * (arg2[0]))))) +
+        ((fiat_secp256k1_dettman_uint128)x3 * UINT64_C(0x1000003d10)));
+  x5 = (uint64_t)(x4 >> 52);
+  x6 = (uint64_t)(x4 & UINT64_C(0xfffffffffffff));
+  x7 = (((((fiat_secp256k1_dettman_uint128)(arg1[0]) * (arg2[4])) +
+          (((fiat_secp256k1_dettman_uint128)(arg1[1]) * (arg2[3])) +
+           (((fiat_secp256k1_dettman_uint128)(arg1[2]) * (arg2[2])) +
+            (((fiat_secp256k1_dettman_uint128)(arg1[3]) * (arg2[1])) +
+             ((fiat_secp256k1_dettman_uint128)(arg1[4]) * (arg2[0])))))) +
+         x5) +
+        ((fiat_secp256k1_dettman_uint128)x2 * UINT64_C(0x1000003d10)));
+  x8 = (uint64_t)(x7 >> 52);
+  x9 = (uint64_t)(x7 & UINT64_C(0xfffffffffffff));
+  x10 = (x9 >> 48);
+  x11 = (x9 & UINT64_C(0xffffffffffff));
+  x12 = ((((fiat_secp256k1_dettman_uint128)(arg1[1]) * (arg2[4])) +
+          (((fiat_secp256k1_dettman_uint128)(arg1[2]) * (arg2[3])) +
+           (((fiat_secp256k1_dettman_uint128)(arg1[3]) * (arg2[2])) +
+            ((fiat_secp256k1_dettman_uint128)(arg1[4]) * (arg2[1]))))) +
+         x8);
+  x13 = (uint64_t)(x12 >> 52);
+  x14 = (uint64_t)(x12 & UINT64_C(0xfffffffffffff));
+  x15 = (((fiat_secp256k1_dettman_uint128)(arg1[0]) * (arg2[0])) +
+         ((fiat_secp256k1_dettman_uint128)((x14 << 4) + x10) *
+          UINT64_C(0x1000003d1)));
+  x16 = (uint64_t)(x15 >> 52);
+  x17 = (uint64_t)(x15 & UINT64_C(0xfffffffffffff));
+  x18 = ((((fiat_secp256k1_dettman_uint128)(arg1[2]) * (arg2[4])) +
+          (((fiat_secp256k1_dettman_uint128)(arg1[3]) * (arg2[3])) +
+           ((fiat_secp256k1_dettman_uint128)(arg1[4]) * (arg2[2])))) +
+         x13);
+  x19 = (uint64_t)(x18 >> 52);
+  x20 = (uint64_t)(x18 & UINT64_C(0xfffffffffffff));
+  x21 = (((((fiat_secp256k1_dettman_uint128)(arg1[0]) * (arg2[1])) +
+           ((fiat_secp256k1_dettman_uint128)(arg1[1]) * (arg2[0]))) +
+          x16) +
+         ((fiat_secp256k1_dettman_uint128)x20 * UINT64_C(0x1000003d10)));
+  x22 = (uint64_t)(x21 >> 52);
+  x23 = (uint64_t)(x21 & UINT64_C(0xfffffffffffff));
+  x24 = ((((fiat_secp256k1_dettman_uint128)(arg1[3]) * (arg2[4])) +
+          ((fiat_secp256k1_dettman_uint128)(arg1[4]) * (arg2[3]))) +
+         x19);
+  x25 = (uint64_t)(x24 >> 52);
+  x26 = (uint64_t)(x24 & UINT64_C(0xfffffffffffff));
+  x27 = (((((fiat_secp256k1_dettman_uint128)(arg1[0]) * (arg2[2])) +
+           (((fiat_secp256k1_dettman_uint128)(arg1[1]) * (arg2[1])) +
+            ((fiat_secp256k1_dettman_uint128)(arg1[2]) * (arg2[0])))) +
+          x22) +
+         ((fiat_secp256k1_dettman_uint128)x26 * UINT64_C(0x1000003d10)));
+  x28 = (uint64_t)(x27 >> 52);
+  x29 = (uint64_t)(x27 & UINT64_C(0xfffffffffffff));
+  x30 = ((x6 + x28) +
+         ((fiat_secp256k1_dettman_uint128)x25 * UINT64_C(0x1000003d10)));
+  x31 = (uint64_t)(x30 >> 52);
+  x32 = (uint64_t)(x30 & UINT64_C(0xfffffffffffff));
+  x33 = (x11 + x31);
+  out1[0] = x17;
+  out1[1] = x23;
+  out1[2] = x29;
+  out1[3] = x32;
+  out1[4] = x33;
+}
+
+/*
+ * The function fiat_secp256k1_dettman_square squares a field element.
+ *
+ * Postconditions:
+ *   eval out1 mod
+ * 115792089237316195423570985008687907853269984665640564039457584007908834671663
+ * = (eval arg1 * eval arg1) mod
+ * 115792089237316195423570985008687907853269984665640564039457584007908834671663
+ *
+ * Input Bounds:
+ *   arg1: [[0x0 ~> 0x1ffffffffffffe], [0x0 ~> 0x1ffffffffffffe], [0x0 ~>
+ * 0x1ffffffffffffe], [0x0 ~> 0x1ffffffffffffe], [0x0 ~> 0x1fffffffffffe]]
+ * Output Bounds:
+ *   out1: [[0x0 ~> 0x1ffffffffffffe], [0x0 ~> 0x1ffffffffffffe], [0x0 ~>
+ * 0x1ffffffffffffe], [0x0 ~> 0x1ffffffffffffe], [0x0 ~> 0x17fffffffffff]]
+ */
+static FIAT_SECP256K1_DETTMAN_FIAT_INLINE void
+fiat_secp256k1_dettman_square(uint64_t out1[5], const uint64_t arg1[5]) {
+  uint64_t x1;
+  uint64_t x2;
+  uint64_t x3;
+  uint64_t x4;
+  fiat_secp256k1_dettman_uint128 x5;
+  uint64_t x6;
+  uint64_t x7;
+  fiat_secp256k1_dettman_uint128 x8;
+  uint64_t x9;
+  uint64_t x10;
+  fiat_secp256k1_dettman_uint128 x11;
+  uint64_t x12;
+  uint64_t x13;
+  uint64_t x14;
+  uint64_t x15;
+  fiat_secp256k1_dettman_uint128 x16;
+  uint64_t x17;
+  uint64_t x18;
+  fiat_secp256k1_dettman_uint128 x19;
+  uint64_t x20;
+  uint64_t x21;
+  fiat_secp256k1_dettman_uint128 x22;
+  uint64_t x23;
+  uint64_t x24;
+  fiat_secp256k1_dettman_uint128 x25;
+  uint64_t x26;
+  uint64_t x27;
+  fiat_secp256k1_dettman_uint128 x28;
+  uint64_t x29;
+  uint64_t x30;
+  fiat_secp256k1_dettman_uint128 x31;
+  uint64_t x32;
+  uint64_t x33;
+  fiat_secp256k1_dettman_uint128 x34;
+  uint64_t x35;
+  uint64_t x36;
+  uint64_t x37;
+  x1 = ((arg1[3]) * 0x2);
+  x2 = ((arg1[2]) * 0x2);
+  x3 = ((arg1[1]) * 0x2);
+  x4 = ((arg1[0]) * 0x2);
+  x5 = ((fiat_secp256k1_dettman_uint128)(arg1[4]) * (arg1[4]));
+  x6 = (uint64_t)(x5 >> 52);
+  x7 = (uint64_t)(x5 & UINT64_C(0xfffffffffffff));
+  x8 = ((((fiat_secp256k1_dettman_uint128)x4 * (arg1[3])) +
+         ((fiat_secp256k1_dettman_uint128)x3 * (arg1[2]))) +
+        ((fiat_secp256k1_dettman_uint128)x7 * UINT64_C(0x1000003d10)));
+  x9 = (uint64_t)(x8 >> 52);
+  x10 = (uint64_t)(x8 & UINT64_C(0xfffffffffffff));
+  x11 = (((((fiat_secp256k1_dettman_uint128)x4 * (arg1[4])) +
+           (((fiat_secp256k1_dettman_uint128)x3 * (arg1[3])) +
+            ((fiat_secp256k1_dettman_uint128)(arg1[2]) * (arg1[2])))) +
+          x9) +
+         ((fiat_secp256k1_dettman_uint128)x6 * UINT64_C(0x1000003d10)));
+  x12 = (uint64_t)(x11 >> 52);
+  x13 = (uint64_t)(x11 & UINT64_C(0xfffffffffffff));
+  x14 = (x13 >> 48);
+  x15 = (x13 & UINT64_C(0xffffffffffff));
+  x16 = ((((fiat_secp256k1_dettman_uint128)x3 * (arg1[4])) +
+          ((fiat_secp256k1_dettman_uint128)x2 * (arg1[3]))) +
+         x12);
+  x17 = (uint64_t)(x16 >> 52);
+  x18 = (uint64_t)(x16 & UINT64_C(0xfffffffffffff));
+  x19 = (((fiat_secp256k1_dettman_uint128)(arg1[0]) * (arg1[0])) +
+         ((fiat_secp256k1_dettman_uint128)((x18 << 4) + x14) *
+          UINT64_C(0x1000003d1)));
+  x20 = (uint64_t)(x19 >> 52);
+  x21 = (uint64_t)(x19 & UINT64_C(0xfffffffffffff));
+  x22 = ((((fiat_secp256k1_dettman_uint128)x2 * (arg1[4])) +
+          ((fiat_secp256k1_dettman_uint128)(arg1[3]) * (arg1[3]))) +
+         x17);
+  x23 = (uint64_t)(x22 >> 52);
+  x24 = (uint64_t)(x22 & UINT64_C(0xfffffffffffff));
+  x25 = ((((fiat_secp256k1_dettman_uint128)x4 * (arg1[1])) + x20) +
+         ((fiat_secp256k1_dettman_uint128)x24 * UINT64_C(0x1000003d10)));
+  x26 = (uint64_t)(x25 >> 52);
+  x27 = (uint64_t)(x25 & UINT64_C(0xfffffffffffff));
+  x28 = (((fiat_secp256k1_dettman_uint128)x1 * (arg1[4])) + x23);
+  x29 = (uint64_t)(x28 >> 52);
+  x30 = (uint64_t)(x28 & UINT64_C(0xfffffffffffff));
+  x31 = (((((fiat_secp256k1_dettman_uint128)x4 * (arg1[2])) +
+           ((fiat_secp256k1_dettman_uint128)(arg1[1]) * (arg1[1]))) +
+          x26) +
+         ((fiat_secp256k1_dettman_uint128)x30 * UINT64_C(0x1000003d10)));
+  x32 = (uint64_t)(x31 >> 52);
+  x33 = (uint64_t)(x31 & UINT64_C(0xfffffffffffff));
+  x34 = ((x10 + x32) +
+         ((fiat_secp256k1_dettman_uint128)x29 * UINT64_C(0x1000003d10)));
+  x35 = (uint64_t)(x34 >> 52);
+  x36 = (uint64_t)(x34 & UINT64_C(0xfffffffffffff));
+  x37 = (x15 + x35);
+  out1[0] = x21;
+  out1[1] = x27;
+  out1[2] = x33;
+  out1[3] = x36;
+  out1[4] = x37;
+}
diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h
index 4c4466eceb..8d622bdea5 100644
--- a/src/field_5x52_impl.h
+++ b/src/field_5x52_impl.h
@@ -12,8 +12,10 @@
 #include "field.h"
 #include "modinv64_impl.h"
 
-#if defined(USE_ASM_X86_64)
-#include "field_5x52_asm_impl.h"
+#if defined(USE_EXTERNAL_ASM) && defined(USE_ASM_X86_64)
+extern void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t *b);
+extern void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a);
+#pragma message "ASM enabled"
 #else
 #include "field_5x52_int128_impl.h"
 #endif
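With the extern declarations above, secp256k1_fe_mul_inner and secp256k1_fe_sqr_inner now resolve to the hand-written routines in src/asm, while the fiat functions remain available as a C reference. That invites a differential check along these lines (an illustrative sketch, not part of the patch; the test vector is hypothetical and must respect the input bounds quoted earlier, and linking needs both the .s objects and a translation unit providing the fiat functions non-statically, e.g. via src/dettman.h):

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    /* same signatures as declared in field_5x52_impl.h / dettman.h */
    extern void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t *b);
    extern void fiat_secp256k1_dettman_mul(uint64_t *r, const uint64_t *a, const uint64_t *b);

    int main(void) {
        const uint64_t a[5] = {5, 4, 3, 2, 1}; /* hypothetical in-range limbs */
        const uint64_t b[5] = {9, 8, 7, 6, 5};
        uint64_t r_asm[5], r_c[5];
        secp256k1_fe_mul_inner(r_asm, a, b);   /* assembly path */
        fiat_secp256k1_dettman_mul(r_c, a, b); /* C reference path */
        if (memcmp(r_asm, r_c, sizeof r_asm) != 0) {
            fprintf(stderr, "asm/C mismatch\n");
            return 1;
        }
        return 0;
    }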
diff --git a/src/field_5x52_int128_impl.h b/src/field_5x52_int128_impl.h
index 18567b95f3..753f9db9eb 100644
--- a/src/field_5x52_int128_impl.h
+++ b/src/field_5x52_int128_impl.h
@@ -9,270 +9,17 @@
 
 #include <stdint.h>
 
-#include "int128.h"
+#include "field_5x52_fiat_dettman_impl.h"
 
-#ifdef VERIFY
-#define VERIFY_BITS(x, n) VERIFY_CHECK(((x) >> (n)) == 0)
-#define VERIFY_BITS_128(x, n) VERIFY_CHECK(secp256k1_u128_check_bits((x), (n)))
-#else
-#define VERIFY_BITS(x, n) do { } while(0)
-#define VERIFY_BITS_128(x, n) do { } while(0)
-#endif
-
-SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) {
-    secp256k1_uint128 c, d;
-    uint64_t t3, t4, tx, u0;
-    uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];
-    const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL;
-
-    VERIFY_BITS(a[0], 56);
-    VERIFY_BITS(a[1], 56);
-    VERIFY_BITS(a[2], 56);
-    VERIFY_BITS(a[3], 56);
-    VERIFY_BITS(a[4], 52);
-    VERIFY_BITS(b[0], 56);
-    VERIFY_BITS(b[1], 56);
-    VERIFY_BITS(b[2], 56);
-    VERIFY_BITS(b[3], 56);
-    VERIFY_BITS(b[4], 52);
-    VERIFY_CHECK(r != b);
-    VERIFY_CHECK(a != b);
-
-    /* [... a b c] is a shorthand for ... + a<<104 + b<<52 + c<<0 mod n.
-     * for 0 <= x <= 4, px is a shorthand for sum(a[i]*b[x-i], i=0..x).
-     * for 4 <= x <= 8, px is a shorthand for sum(a[i]*b[x-i], i=(x-4)..4)
-     * Note that [x 0 0 0 0 0] = [x*R].
-     */
-
-    secp256k1_u128_mul(&d, a0, b[3]);
-    secp256k1_u128_accum_mul(&d, a1, b[2]);
-    secp256k1_u128_accum_mul(&d, a2, b[1]);
-    secp256k1_u128_accum_mul(&d, a3, b[0]);
-    VERIFY_BITS_128(&d, 114);
-    /* [d 0 0 0] = [p3 0 0 0] */
-    secp256k1_u128_mul(&c, a4, b[4]);
-    VERIFY_BITS_128(&c, 112);
-    /* [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
-    secp256k1_u128_accum_mul(&d, R, secp256k1_u128_to_u64(&c)); secp256k1_u128_rshift(&c, 64);
-    VERIFY_BITS_128(&d, 115);
-    VERIFY_BITS_128(&c, 48);
-    /* [(c<<12) 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
-    t3 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
-    VERIFY_BITS(t3, 52);
-    VERIFY_BITS_128(&d, 63);
-    /* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
-
-    secp256k1_u128_accum_mul(&d, a0, b[4]);
-    secp256k1_u128_accum_mul(&d, a1, b[3]);
-    secp256k1_u128_accum_mul(&d, a2, b[2]);
-    secp256k1_u128_accum_mul(&d, a3, b[1]);
-    secp256k1_u128_accum_mul(&d, a4, b[0]);
-    VERIFY_BITS_128(&d, 115);
-    /* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
-    secp256k1_u128_accum_mul(&d, R << 12, secp256k1_u128_to_u64(&c));
-    VERIFY_BITS_128(&d, 116);
-    /* [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
-    t4 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
-    VERIFY_BITS(t4, 52);
-    VERIFY_BITS_128(&d, 64);
-    /* [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
-    tx = (t4 >> 48); t4 &= (M >> 4);
-    VERIFY_BITS(tx, 4);
-    VERIFY_BITS(t4, 48);
-    /* [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
-
-    secp256k1_u128_mul(&c, a0, b[0]);
-    VERIFY_BITS_128(&c, 112);
-    /* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0] */
-    secp256k1_u128_accum_mul(&d, a1, b[4]);
-    secp256k1_u128_accum_mul(&d, a2, b[3]);
-    secp256k1_u128_accum_mul(&d, a3, b[2]);
-    secp256k1_u128_accum_mul(&d, a4, b[1]);
-    VERIFY_BITS_128(&d, 115);
-    /* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    u0 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
-    VERIFY_BITS(u0, 52);
-    VERIFY_BITS_128(&d, 63);
-    /* [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    /* [d 0 t4+(tx<<48)+(u0<<52) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    u0 = (u0 << 4) | tx;
-    VERIFY_BITS(u0, 56);
-    /* [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    secp256k1_u128_accum_mul(&c, u0, R >> 4);
-    VERIFY_BITS_128(&c, 115);
-    /* [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    r[0] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
-    VERIFY_BITS(r[0], 52);
-    VERIFY_BITS_128(&c, 61);
-    /* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 0 p0] */
-
-    secp256k1_u128_accum_mul(&c, a0, b[1]);
-    secp256k1_u128_accum_mul(&c, a1, b[0]);
-    VERIFY_BITS_128(&c, 114);
-    /* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 p1 p0] */
-    secp256k1_u128_accum_mul(&d, a2, b[4]);
-    secp256k1_u128_accum_mul(&d, a3, b[3]);
-    secp256k1_u128_accum_mul(&d, a4, b[2]);
-    VERIFY_BITS_128(&d, 114);
-    /* [d 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
-    secp256k1_u128_accum_mul(&c, secp256k1_u128_to_u64(&d) & M, R); secp256k1_u128_rshift(&d, 52);
-    VERIFY_BITS_128(&c, 115);
-    VERIFY_BITS_128(&d, 62);
-    /* [d 0 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
-    r[1] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
-    VERIFY_BITS(r[1], 52);
-    VERIFY_BITS_128(&c, 63);
-    /* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
-
-    secp256k1_u128_accum_mul(&c, a0, b[2]);
-    secp256k1_u128_accum_mul(&c, a1, b[1]);
-    secp256k1_u128_accum_mul(&c, a2, b[0]);
-    VERIFY_BITS_128(&c, 114);
-    /* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 p2 p1 p0] */
-    secp256k1_u128_accum_mul(&d, a3, b[4]);
-    secp256k1_u128_accum_mul(&d, a4, b[3]);
-    VERIFY_BITS_128(&d, 114);
-    /* [d 0 0 t4 t3 c t1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
-    secp256k1_u128_accum_mul(&c, R, secp256k1_u128_to_u64(&d)); secp256k1_u128_rshift(&d, 64);
-    VERIFY_BITS_128(&c, 115);
-    VERIFY_BITS_128(&d, 50);
-    /* [(d<<12) 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
-
-    r[2] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
-    VERIFY_BITS(r[2], 52);
-    VERIFY_BITS_128(&c, 63);
-    /* [(d<<12) 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
-    secp256k1_u128_accum_mul(&c, R << 12, secp256k1_u128_to_u64(&d));
-    secp256k1_u128_accum_u64(&c, t3);
-    VERIFY_BITS_128(&c, 100);
-    /* [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
-    r[3] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
-    VERIFY_BITS(r[3], 52);
-    VERIFY_BITS_128(&c, 48);
-    /* [t4+c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
-    r[4] = secp256k1_u128_to_u64(&c) + t4;
-    VERIFY_BITS(r[4], 49);
-    /* [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
+SECP256K1_INLINE static void
+secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a,
+                       const uint64_t *SECP256K1_RESTRICT b) {
+  fiat_secp256k1_dettman_mul(r, a, b);
 }
 
-SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) {
-    secp256k1_uint128 c, d;
-    uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];
-    int64_t t3, t4, tx, u0;
-    const uint64_t M = 0xFFFFFFFFFFFFFULL, R = 0x1000003D10ULL;
-
-    VERIFY_BITS(a[0], 56);
-    VERIFY_BITS(a[1], 56);
-    VERIFY_BITS(a[2], 56);
-    VERIFY_BITS(a[3], 56);
-    VERIFY_BITS(a[4], 52);
-
-    /** [... a b c] is a shorthand for ... + a<<104 + b<<52 + c<<0 mod n.
-     *  px is a shorthand for sum(a[i]*a[x-i], i=0..x).
-     *  Note that [x 0 0 0 0 0] = [x*R].
-     */
-
-    secp256k1_u128_mul(&d, a0*2, a3);
-    secp256k1_u128_accum_mul(&d, a1*2, a2);
-    VERIFY_BITS_128(&d, 114);
-    /* [d 0 0 0] = [p3 0 0 0] */
-    secp256k1_u128_mul(&c, a4, a4);
-    VERIFY_BITS_128(&c, 112);
-    /* [c 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
-    secp256k1_u128_accum_mul(&d, R, secp256k1_u128_to_u64(&c)); secp256k1_u128_rshift(&c, 64);
-    VERIFY_BITS_128(&d, 115);
-    VERIFY_BITS_128(&c, 48);
-    /* [(c<<12) 0 0 0 0 0 d 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
-    t3 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
-    VERIFY_BITS(t3, 52);
-    VERIFY_BITS_128(&d, 63);
-    /* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 0 p3 0 0 0] */
-
-    a4 *= 2;
-    secp256k1_u128_accum_mul(&d, a0, a4);
-    secp256k1_u128_accum_mul(&d, a1*2, a3);
-    secp256k1_u128_accum_mul(&d, a2, a2);
-    VERIFY_BITS_128(&d, 115);
-    /* [(c<<12) 0 0 0 0 d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
-    secp256k1_u128_accum_mul(&d, R << 12, secp256k1_u128_to_u64(&c));
-    VERIFY_BITS_128(&d, 116);
-    /* [d t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
-    t4 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
-    VERIFY_BITS(t4, 52);
-    VERIFY_BITS_128(&d, 64);
-    /* [d t4 t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
-    tx = (t4 >> 48); t4 &= (M >> 4);
-    VERIFY_BITS(tx, 4);
-    VERIFY_BITS(t4, 48);
-    /* [d t4+(tx<<48) t3 0 0 0] = [p8 0 0 0 p4 p3 0 0 0] */
-
-    secp256k1_u128_mul(&c, a0, a0);
-    VERIFY_BITS_128(&c, 112);
-    /* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 0 p4 p3 0 0 p0] */
-    secp256k1_u128_accum_mul(&d, a1, a4);
-    secp256k1_u128_accum_mul(&d, a2*2, a3);
-    VERIFY_BITS_128(&d, 114);
-    /* [d t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    u0 = secp256k1_u128_to_u64(&d) & M; secp256k1_u128_rshift(&d, 52);
-    VERIFY_BITS(u0, 52);
-    VERIFY_BITS_128(&d, 62);
-    /* [d u0 t4+(tx<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    /* [d 0 t4+(tx<<48)+(u0<<52) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    u0 = (u0 << 4) | tx;
-    VERIFY_BITS(u0, 56);
-    /* [d 0 t4+(u0<<48) t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    secp256k1_u128_accum_mul(&c, u0, R >> 4);
-    VERIFY_BITS_128(&c, 113);
-    /* [d 0 t4 t3 0 0 c] = [p8 0 0 p5 p4 p3 0 0 p0] */
-    r[0] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
-    VERIFY_BITS(r[0], 52);
-    VERIFY_BITS_128(&c, 61);
-    /* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 0 p0] */
-
-    a0 *= 2;
-    secp256k1_u128_accum_mul(&c, a0, a1);
-    VERIFY_BITS_128(&c, 114);
-    /* [d 0 t4 t3 0 c r0] = [p8 0 0 p5 p4 p3 0 p1 p0] */
-    secp256k1_u128_accum_mul(&d, a2, a4);
-    secp256k1_u128_accum_mul(&d, a3, a3);
-    VERIFY_BITS_128(&d, 114);
-    /* [d 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
-    secp256k1_u128_accum_mul(&c, secp256k1_u128_to_u64(&d) & M, R); secp256k1_u128_rshift(&d, 52);
-    VERIFY_BITS_128(&c, 115);
-    VERIFY_BITS_128(&d, 62);
-    /* [d 0 0 t4 t3 0 c r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
-    r[1] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
-    VERIFY_BITS(r[1], 52);
-    VERIFY_BITS_128(&c, 63);
-    /* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 0 p1 p0] */
-
-    secp256k1_u128_accum_mul(&c, a0, a2);
-    secp256k1_u128_accum_mul(&c, a1, a1);
-    VERIFY_BITS_128(&c, 114);
-    /* [d 0 0 t4 t3 c r1 r0] = [p8 0 p6 p5 p4 p3 p2 p1 p0] */
-    secp256k1_u128_accum_mul(&d, a3, a4);
-    VERIFY_BITS_128(&d, 114);
-    /* [d 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
-    secp256k1_u128_accum_mul(&c, R, secp256k1_u128_to_u64(&d)); secp256k1_u128_rshift(&d, 64);
-    VERIFY_BITS_128(&c, 115);
-    VERIFY_BITS_128(&d, 50);
-    /* [(d<<12) 0 0 0 t4 t3 c r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
-    r[2] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
-    VERIFY_BITS(r[2], 52);
-    VERIFY_BITS_128(&c, 63);
-    /* [(d<<12) 0 0 0 t4 t3+c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
-
-    secp256k1_u128_accum_mul(&c, R << 12, secp256k1_u128_to_u64(&d));
-    secp256k1_u128_accum_u64(&c, t3);
-    VERIFY_BITS_128(&c, 100);
-    /* [t4 c r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
-    r[3] = secp256k1_u128_to_u64(&c) & M; secp256k1_u128_rshift(&c, 52);
-    VERIFY_BITS(r[3], 52);
-    VERIFY_BITS_128(&c, 48);
-    /* [t4+c r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
-    r[4] = secp256k1_u128_to_u64(&c) + t4;
-    VERIFY_BITS(r[4], 49);
-    /* [r4 r3 r2 r1 r0] = [p8 p7 p6 p5 p4 p3 p2 p1 p0] */
+SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r,
+                                                    const uint64_t *a) {
+  fiat_secp256k1_dettman_square(r, a);
 }
 
 #endif /* SECP256K1_FIELD_INNER5X52_IMPL_H */
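One more illustrative check follows directly from the postconditions quoted in the generated file: squaring must agree with self-multiplication. A self-contained sketch (not part of the patch; it assumes the generated header is on the include path, which works in a single translation unit since the fiat functions are static, and the limb values are hypothetical but within the stated input bounds):

    #include <stdint.h>
    #include <string.h>
    #include <assert.h>
    #include "field_5x52_fiat_dettman_impl.h"

    int main(void) {
        const uint64_t a[5] = {123456789, 42, 7, 1, 0}; /* hypothetical in-range element */
        uint64_t r_mul[5], r_sqr[5];
        fiat_secp256k1_dettman_mul(r_mul, a, a);  /* a * a mod p */
        fiat_secp256k1_dettman_square(r_sqr, a);  /* a^2 mod p */
        assert(memcmp(r_mul, r_sqr, sizeof r_mul) == 0);
        return 0;
    }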