Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use 64b type for add/mul of field elements on CPU #588

Merged
merged 1 commit into from
Aug 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 156 additions & 37 deletions icicle/include/icicle/fields/host_math.h
Original file line number Diff line number Diff line change
@@ -1,56 +1,75 @@
// Note: this optimization generates invalid code (using gcc) when storage class has a union for both u32 and u64 so disabling it.
#if defined(__GNUC__) && !defined(__NVCC__) && !defined(__clang__)
#pragma GCC optimize("no-strict-aliasing")
#endif

#pragma once

#include <cstdint>
#include "icicle/utils/modifiers.h"
#include "icicle/fields/storage.h"
namespace host_math {

// return x + y with uint32_t operands
static constexpr __host__ uint32_t add(const uint32_t x, const uint32_t y) { return x + y; }
// return x + y with T operands
template <typename T>
static constexpr __host__ T add(const T x, const T y)
{
return x + y;
}

// return x + y + carry with uint32_t operands
static constexpr __host__ uint32_t addc(const uint32_t x, const uint32_t y, const uint32_t carry)
// return x + y + carry with T operands
template <typename T>
static constexpr __host__ T addc(const T x, const T y, const T carry)
{
return x + y + carry;
}

// return x + y and carry out with uint32_t operands
static constexpr __host__ uint32_t add_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
// return x + y and carry out with T operands
template <typename T>
static constexpr __host__ T add_cc(const T x, const T y, T& carry)
{
uint32_t result = x + y;
T result = x + y;
carry = x > result;
return result;
}

// return x + y + carry and carry out with uint32_t operands
static constexpr __host__ uint32_t addc_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
// return x + y + carry and carry out with T operands
template <typename T>
static constexpr __host__ T addc_cc(const T x, const T y, T& carry)
{
const uint32_t result = x + y + carry;
const T result = x + y + carry;
carry = carry && x >= result || !carry && x > result;
return result;
}

// return x - y with uint32_t operands
static constexpr __host__ uint32_t sub(const uint32_t x, const uint32_t y) { return x - y; }
// return x - y with T operands
template <typename T>
static constexpr __host__ T sub(const T x, const T y)
{
return x - y;
}

// return x - y - borrow with uint32_t operands
static constexpr __host__ uint32_t subc(const uint32_t x, const uint32_t y, const uint32_t borrow)
// return x - y - borrow with T operands
template <typename T>
static constexpr __host__ T subc(const T x, const T y, const T borrow)
{
return x - y - borrow;
}

// return x - y and borrow out with uint32_t operands
static constexpr __host__ uint32_t sub_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
// return x - y and borrow out with T operands
template <typename T>
static constexpr __host__ T sub_cc(const T x, const T y, T& borrow)
{
uint32_t result = x - y;
T result = x - y;
borrow = x < result;
return result;
}

// return x - y - borrow and borrow out with uint32_t operands
static constexpr __host__ uint32_t subc_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
// return x - y - borrow and borrow out with T operands
template <typename T>
static constexpr __host__ T subc_cc(const T x, const T y, T& borrow)
{
const uint32_t result = x - y - borrow;
const T result = x - y - borrow;
borrow = borrow && x <= result || !borrow && x < result;
return result;
}
Expand All @@ -64,13 +83,22 @@ namespace host_math {
return result;
}

static constexpr __host__ uint64_t madc_cc_64(const uint64_t x, const uint64_t y, const uint64_t z, uint64_t& carry)
{
__uint128_t r = static_cast<__uint128_t>(x) * y + z + carry;
carry = (uint64_t)(r >> 64);
uint64_t result = r & 0xffffffffffffffff;
return result;
}

template <unsigned OPS_COUNT = UINT32_MAX, bool CARRY_IN = false, bool CARRY_OUT = false>
struct carry_chain {
unsigned index;

constexpr HOST_INLINE carry_chain() : index(0) {}

constexpr HOST_INLINE uint32_t add(const uint32_t x, const uint32_t y, uint32_t& carry)
template <typename T>
constexpr HOST_INLINE T add(const T x, const T y, T& carry)
{
index++;
if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
Expand All @@ -83,7 +111,8 @@ namespace host_math {
return host_math::addc(x, y, carry);
}

constexpr HOST_INLINE uint32_t sub(const uint32_t x, const uint32_t y, uint32_t& carry)
template <typename T>
constexpr HOST_INLINE T sub(const T x, const T y, T& carry)
{
index++;
if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
Expand All @@ -97,9 +126,55 @@ namespace host_math {
}
};

template <unsigned NLIMBS, bool SUBTRACT, bool CARRY_OUT>
static constexpr HOST_INLINE uint32_t add_sub_limbs_32(const uint32_t* x, const uint32_t* y, uint32_t* r)
{
uint32_t carry = 0;
carry_chain<NLIMBS, false, CARRY_OUT> chain;
for (unsigned i = 0; i < NLIMBS; i++)
r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
return CARRY_OUT ? carry : 0;
}

template <unsigned NLIMBS, bool SUBTRACT, bool CARRY_OUT>
static constexpr HOST_INLINE uint64_t add_sub_limbs_64(const uint64_t* x, const uint64_t* y, uint64_t* r)
{
uint64_t carry = 0;
carry_chain<NLIMBS, false, CARRY_OUT> chain;
for (unsigned i = 0; i < NLIMBS / 2; i++)
r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
return CARRY_OUT ? carry : 0;
}

template <
unsigned NLIMBS,
bool SUBTRACT,
bool CARRY_OUT,
bool USE_32 = true> // for now we use only the 32 add/sub because the casting of the carry causes problems when
// compiling in release. to solve this we need to entirely split the field functions between a
// host version and a device version.
static constexpr HOST_INLINE uint32_t // 32 is enough for the carry
add_sub_limbs(const storage<NLIMBS>& xs, const storage<NLIMBS>& ys, storage<NLIMBS>& rs)
{
if constexpr (USE_32 || NLIMBS < 2) {
const uint32_t* x = xs.limbs;
const uint32_t* y = ys.limbs;
uint32_t* r = rs.limbs;
return add_sub_limbs_32<NLIMBS, SUBTRACT, CARRY_OUT>(x, y, r);
} else {
const uint64_t* x = xs.limbs64;
const uint64_t* y = ys.limbs64;
uint64_t* r = rs.limbs64;
// Note: returns uint64 but uint 32 is enough.
uint64_t result = add_sub_limbs_64<NLIMBS, SUBTRACT, CARRY_OUT>(x, y, r);
uint32_t carry = result == 1;
return carry;
}
}

template <unsigned NLIMBS_A, unsigned NLIMBS_B = NLIMBS_A>
static constexpr HOST_INLINE void
multiply_raw(const storage<NLIMBS_A>& as, const storage<NLIMBS_B>& bs, storage<NLIMBS_A + NLIMBS_B>& rs)
multiply_raw_32(const storage<NLIMBS_A>& as, const storage<NLIMBS_B>& bs, storage<NLIMBS_A + NLIMBS_B>& rs)
{
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
Expand All @@ -112,18 +187,54 @@ namespace host_math {
}
}

template <unsigned NLIMBS, bool SUBTRACT, bool CARRY_OUT>
static constexpr HOST_INLINE uint32_t
add_sub_limbs(const storage<NLIMBS>& xs, const storage<NLIMBS>& ys, storage<NLIMBS>& rs)
template <unsigned NLIMBS_A, unsigned NLIMBS_B = NLIMBS_A>
static HOST_INLINE void multiply_raw_64(const uint64_t* a, const uint64_t* b, uint64_t* r)
{
const uint32_t* x = xs.limbs;
const uint32_t* y = ys.limbs;
uint32_t* r = rs.limbs;
uint32_t carry = 0;
carry_chain<NLIMBS, false, CARRY_OUT> chain;
for (unsigned i = 0; i < NLIMBS; i++)
r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
return CARRY_OUT ? carry : 0;
for (unsigned i = 0; i < NLIMBS_B / 2; i++) {
uint64_t carry = 0;
for (unsigned j = 0; j < NLIMBS_A / 2; j++)
r[j + i] = host_math::madc_cc_64(a[j], b[i], r[j + i], carry);
r[NLIMBS_A / 2 + i] = carry;
}
}

template <unsigned NLIMBS_A, unsigned NLIMBS_B = NLIMBS_A>
static HOST_INLINE void
multiply_raw_64(const storage<NLIMBS_A>& as, const storage<NLIMBS_B>& bs, storage<NLIMBS_A + NLIMBS_B>& rs)
{
const uint64_t* a = as.limbs64;
const uint64_t* b = bs.limbs64;
uint64_t* r = rs.limbs64;
multiply_raw_64<NLIMBS_A, NLIMBS_B>(a, b, r);
}

template <unsigned NLIMBS_A, unsigned NLIMBS_B = NLIMBS_A, bool USE_32 = false>
static constexpr HOST_INLINE void
multiply_raw(const storage<NLIMBS_A>& as, const storage<NLIMBS_B>& bs, storage<NLIMBS_A + NLIMBS_B>& rs)
{
static_assert(
(NLIMBS_A % 2 == 0 || NLIMBS_A == 1) && (NLIMBS_B % 2 == 0 || NLIMBS_B == 1),
"odd number of limbs is not supported\n");
if constexpr (USE_32) {
multiply_raw_32<NLIMBS_A, NLIMBS_B>(as, bs, rs);
return;
} else if constexpr ((NLIMBS_A == 1 && NLIMBS_B == 2) || (NLIMBS_A == 2 && NLIMBS_B == 1)) {
multiply_raw_32<NLIMBS_A, NLIMBS_B>(as, bs, rs);
return;
} else if constexpr (NLIMBS_A == 1 && NLIMBS_B == 1) {
rs.limbs[1] = 0;
rs.limbs[0] = host_math::madc_cc(as.limbs[0], bs.limbs[0], 0, rs.limbs[1]);
return;
} else if constexpr (NLIMBS_A == 2 && NLIMBS_B == 2) {
const uint64_t* a = as.limbs64; // nof limbs should be even
const uint64_t* b = bs.limbs64;
uint64_t* r = rs.limbs64;
r[1] = 0;
r[0] = host_math::madc_cc_64(a[0], b[0], 0, r[1]);
return;
} else {
multiply_raw_64<NLIMBS_A, NLIMBS_B>(as, bs, rs);
}
}

template <unsigned NLIMBS, unsigned BITS>
Expand Down Expand Up @@ -162,7 +273,11 @@ namespace host_math {
}
}

template <unsigned NLIMBS_NUM, unsigned NLIMBS_DENOM, unsigned NLIMBS_Q = (NLIMBS_NUM - NLIMBS_DENOM)>
template <
unsigned NLIMBS_NUM,
unsigned NLIMBS_DENOM,
unsigned NLIMBS_Q = (NLIMBS_NUM - NLIMBS_DENOM),
bool USE_32 = false>
static constexpr HOST_INLINE void integer_division(
const storage<NLIMBS_NUM>& num, const storage<NLIMBS_DENOM>& denom, storage<NLIMBS_Q>& q, storage<NLIMBS_DENOM>& r)
{
Expand All @@ -171,12 +286,16 @@ namespace host_math {
for (int bit_idx = 31; bit_idx >= 0; bit_idx--) {
r = left_shift<NLIMBS_DENOM, 1>(r);
r.limbs[0] |= ((num.limbs[limb_idx] >> bit_idx) & 1);
uint32_t c = add_sub_limbs<NLIMBS_DENOM, true, true>(r, denom, temp);
if ((limb_idx < NLIMBS_Q) & !c) {
uint32_t c = add_sub_limbs<NLIMBS_DENOM, true, true, USE_32>(r, denom, temp);
if (limb_idx < NLIMBS_Q & !c) {
r = temp;
q.limbs[limb_idx] |= 1 << bit_idx;
}
}
}
}
} // namespace host_math

#if defined(__GNUC__) && !defined(__NVCC__) && !defined(__clang__)
#pragma GCC reset_options
#endif
20 changes: 10 additions & 10 deletions icicle/include/icicle/fields/params_gen.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ namespace params_gen {
static constexpr HOST_INLINE storage<2 * NLIMBS> get_square(const storage<NLIMBS>& xs)
{
storage<2 * NLIMBS> rs = {};
host_math::template multiply_raw<NLIMBS>(xs, xs, rs);
host_math::template multiply_raw<NLIMBS, NLIMBS, true>(xs, xs, rs);
return host_math::template left_shift<2 * NLIMBS, BIT_SHIFT>(rs);
}

Expand All @@ -17,7 +17,7 @@ namespace params_gen {
get_difference_no_carry(const storage<NLIMBS>& xs, const storage<NLIMBS>& ys)
{
storage<NLIMBS> rs = {};
host_math::template add_sub_limbs<NLIMBS, true, false>(xs, ys, rs);
host_math::template add_sub_limbs<NLIMBS, true, false, true>(xs, ys, rs);
return rs;
}

Expand All @@ -28,7 +28,7 @@ namespace params_gen {
storage<NLIMBS> qs = {};
storage<2 * NLIMBS> wide_one = {1};
storage<2 * NLIMBS> pow_of_2 = host_math::template left_shift<2 * NLIMBS, EXP>(wide_one);
host_math::template integer_division<2 * NLIMBS, NLIMBS>(pow_of_2, modulus, qs, rs);
host_math::template integer_division<2 * NLIMBS, NLIMBS, NLIMBS, true>(pow_of_2, modulus, qs, rs);
return qs;
}

Expand All @@ -38,12 +38,12 @@ namespace params_gen {
storage<NLIMBS> rs = {1};
for (int i = 0; i < 32 * NLIMBS; i++) {
if (INV) {
if (rs.limbs[0] & 1) host_math::template add_sub_limbs<NLIMBS, false, false>(rs, modulus, rs);
if (rs.limbs[0] & 1) host_math::template add_sub_limbs<NLIMBS, false, false, true>(rs, modulus, rs);
rs = host_math::template right_shift<NLIMBS, 1>(rs);
} else {
rs = host_math::template left_shift<NLIMBS, 1>(rs);
storage<NLIMBS> temp = {};
rs = host_math::template add_sub_limbs<NLIMBS, true, true>(rs, modulus, temp) ? rs : temp;
rs = host_math::template add_sub_limbs<NLIMBS, true, true, true>(rs, modulus, temp) ? rs : temp;
}
}
return rs;
Expand All @@ -57,12 +57,12 @@ namespace params_gen {
storage<2 * NLIMBS> x1 = {};
storage<3 * NLIMBS> x2 = {};
storage<3 * NLIMBS> x3 = {};
host_math::template multiply_raw<NLIMBS>(modulus, m, x1);
host_math::template multiply_raw<NLIMBS, 2 * NLIMBS>(modulus, x1, x2);
host_math::template multiply_raw<NLIMBS, NLIMBS, true>(modulus, m, x1);
host_math::template multiply_raw<NLIMBS, 2 * NLIMBS, true>(modulus, x1, x2);
storage<2 * NLIMBS> one = {1};
storage<2 * NLIMBS> pow_of_2 = host_math::template left_shift<2 * NLIMBS, NBITS>(one);
host_math::template multiply_raw<NLIMBS, 2 * NLIMBS>(modulus, pow_of_2, x3);
host_math::template add_sub_limbs<3 * NLIMBS, true, false>(x3, x2, x2);
host_math::template multiply_raw<NLIMBS, 2 * NLIMBS, true>(modulus, pow_of_2, x3);
host_math::template add_sub_limbs<3 * NLIMBS, true, false, true>(x3, x2, x2);
double err = (double)x2.limbs[2 * NLIMBS - 1] / pow_of_2.limbs[2 * NLIMBS - 1];
err += (double)m.limbs[NLIMBS - 1] / 0xffffffff;
err += (double)NLIMBS / 0x80000000;
Expand All @@ -87,7 +87,7 @@ namespace params_gen {
storage_array<TWO_ADICITY, NLIMBS> invs = {};
storage<NLIMBS> rs = {1};
for (int i = 0; i < TWO_ADICITY; i++) {
if (rs.limbs[0] & 1) host_math::template add_sub_limbs<NLIMBS, false, false>(rs, modulus, rs);
if (rs.limbs[0] & 1) host_math::template add_sub_limbs<NLIMBS, false, false, true>(rs, modulus, rs);
rs = host_math::template right_shift<NLIMBS, 1>(rs);
invs.storages[i] = rs;
}
Expand Down
Loading
Loading