performance: (~3x) optimized field multiplier and EC-adder for improved MSM and ECNTT (#693)

- fix: ECNTT with 5 < logn < 14 was single-threaded; it is now multi-threaded.
- This PR uses inlining and loop unrolling to improve CPU performance for scalar multiplication, scalar-by-EC-point multiplication, and EC addition, and consequently ECNTT and MSM (a sketch of the pattern follows below).
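A minimal sketch of the optimization pattern (hypothetical example, not the actual icicle code; assumes GCC/Clang attributes): forcing inlining and fully unrolling a fixed-trip-count limb loop lets the compiler keep limbs in registers and remove loop overhead.

#include <cstdint>

constexpr unsigned TLC = 8; // assumed limb count, for illustration only

__attribute__((always_inline)) static inline void add_limbs_sketch(uint32_t* r, const uint32_t* a, const uint32_t* b)
{
  uint64_t carry = 0;
#pragma unroll
  for (unsigned i = 0; i < TLC; i++) { // fixed bound -> fully unrollable
    carry += uint64_t(a[i]) + b[i];
    r[i] = uint32_t(carry); // low 32 bits
    carry >>= 32;           // ripple the carry
  }
}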
yshekel authored Dec 12, 2024
1 parent da16082 commit bef4371
Showing 5 changed files with 81 additions and 54 deletions.
22 changes: 17 additions & 5 deletions icicle/backend/cpu/include/ntt_cpu.h
@@ -3,10 +3,17 @@
#include "icicle/utils/log.h"
#include "ntt_tasks_manager.h"
#include "ntt_utils.h"
// #include <_types/_uint32_t.h>
#include <cstdint>
#include <deque>

#ifdef CURVE_ID
#include "icicle/curves/curve_config.h"
using namespace curve_config;
#define IS_ECNTT std::is_same_v<E, projective_t>
#else
#define IS_ECNTT false
#endif

using namespace field_config;
using namespace icicle;

@@ -464,10 +471,15 @@ namespace ntt_cpu {
bool NttCpu<S, E>::compute_if_is_parallel(uint32_t logn, const NTTConfig<S>& config)
{
uint32_t log_batch_size = uint32_t(log2(config.batch_size));
uint32_t scalar_size = sizeof(S);
// for small scalars, the threshold for when it is faster to use parallel NTT is higher
if ((scalar_size >= 32 && (logn + log_batch_size) <= 13) || (scalar_size < 32 && (logn + log_batch_size) <= 16)) {
return false;
// For ecntt we want parallelism unless really small case
if constexpr (IS_ECNTT) {
return logn > 5;
} else {
uint32_t scalar_size = sizeof(S);
// for small scalars, the threshold for when it is faster to use parallel NTT is higher
if ((scalar_size >= 32 && (logn + log_batch_size) <= 13) || (scalar_size < 32 && (logn + log_batch_size) <= 16)) {
return false;
}
}
return true;
}
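For intuition, here is the new heuristic condensed into a standalone predicate (an illustrative sketch with simplified names, not the code above verbatim). EC points are much larger than scalars, so ECNTT benefits from threading at far smaller sizes:

#include <cstdint>

bool is_parallel_sketch(bool is_ecntt, uint32_t scalar_size, uint32_t logn, uint32_t log_batch_size)
{
  if (is_ecntt) return logn > 5; // previously single-threaded for 5 < logn < 14
  const uint32_t total = logn + log_batch_size;
  // for small scalars, parallelism pays off only at larger total sizes
  return (scalar_size >= 32) ? total > 13 : total > 16;
}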
2 changes: 1 addition & 1 deletion icicle/include/icicle/curves/projective.h
@@ -191,7 +191,7 @@ class Projective

Projective res = zero();

const int nof_windows = (SCALAR_FF::NBITS + window_size - 1) / window_size;
constexpr int nof_windows = (SCALAR_FF::NBITS + window_size - 1) / window_size;
bool res_is_not_zero = false;
for (int w = nof_windows - 1; w >= 0; w -= 1) {
// Extract the next window_size bits from the scalar
92 changes: 49 additions & 43 deletions icicle/include/icicle/fields/field.h
@@ -160,49 +160,48 @@ class Field
Wide out{};
#ifdef __CUDA_ARCH__
UNROLL
#else
#pragma unroll
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i];
return out;
}

static constexpr Field HOST_DEVICE_INLINE get_lower(const Wide& xs)
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i];
return out;
}

static constexpr Field HOST_DEVICE_INLINE get_higher(const Wide& xs)
// WARNING: taking views is zero copy but unsafe
constexpr const Field& get_lower_view() const { return *reinterpret_cast<const Field*>(limbs_storage.limbs); }
constexpr const Field& get_higher_view() const
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i + TLC];
return out;
return *reinterpret_cast<const Field*>(limbs_storage.limbs + TLC);
}
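// A generic, standalone illustration of the zero-copy view pattern above
// (assumed semantics, simplified types, not part of this diff; assumes <cstdint>):
struct NarrowSketch { uint32_t limbs[4]; };
struct WideSketch {
  uint32_t limbs[8];
  const NarrowSketch& lower_view() const { return *reinterpret_cast<const NarrowSketch*>(limbs); }
  const NarrowSketch& higher_view() const { return *reinterpret_cast<const NarrowSketch*>(limbs + 4); }
};
// "Unsafe" because a view aliases the owner's storage: it dangles once the
// owning object dies, and reinterpret_cast sidesteps strict-aliasing rules;
// the payoff is skipping a limb copy on hot paths such as reduce() below.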

// This is not zero copy
static constexpr Field HOST_DEVICE_INLINE get_higher_with_slack(const Wide& xs)
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++) {
#ifdef __CUDA_ARCH__
out.limbs_storage.limbs[i] =
__funnelshift_lc(xs.limbs_storage.limbs[i + TLC - 1], xs.limbs_storage.limbs[i + TLC], 2 * slack_bits);
}
#else
out.limbs_storage.limbs[i] = (xs.limbs_storage.limbs[i + TLC] << 2 * slack_bits) +
(xs.limbs_storage.limbs[i + TLC - 1] >> (32 - 2 * slack_bits));
#endif
// CPU: for even number of limbs, read and shift 64b limbs, otherwise 32b
if constexpr (TLC % 2 == 0) {
#pragma unroll
for (unsigned i = 0; i < TLC / 2; i++) { // Ensure valid indexing
out.limbs_storage.limbs64[i] = (xs.limbs_storage.limbs64[i + TLC / 2] << 2 * slack_bits) |
(xs.limbs_storage.limbs64[i + TLC / 2 - 1] >> (64 - 2 * slack_bits));
}
} else {
#pragma unroll
for (unsigned i = 0; i < TLC; i++) { // Ensure valid indexing
out.limbs_storage.limbs[i] = (xs.limbs_storage.limbs[i + TLC] << 2 * slack_bits) +
(xs.limbs_storage.limbs[i + TLC - 1] >> (32 - 2 * slack_bits));
}
}
#endif

return out;
}
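// Standalone demo of the 64-bit two-limb funnel shift above (illustrative
// values, not part of this diff): read the high half pre-shifted left by
// 2*slack_bits by combining adjacent limbs.
#include <cstdint>
#include <cstdio>
int main()
{
  const unsigned slack_bits = 1; // assume 2*slack_bits = 2 for this demo
  uint64_t lo = 0xC000000000000000ull, hi = 0x0123456789ABCDEFull;
  uint64_t out = (hi << (2 * slack_bits)) | (lo >> (64 - 2 * slack_bits));
  printf("%016llx\n", (unsigned long long)out); // prints 048d159e26af37bf
  return 0;
}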

@@ -440,7 +439,7 @@
}

static DEVICE_INLINE uint32_t
mul_n_and_add(uint32_t* acc, const uint32_t* a, uint32_t bi, uint32_t* extra, size_t n = (TLC >> 1))
mul_n_and_add(uint32_t* acc, const uint32_t* a, uint32_t bi, const uint32_t* extra, size_t n = (TLC >> 1))
{
acc[0] = ptx::mad_lo_cc(a[0], bi, extra[0]);

@@ -505,12 +504,12 @@
* limb products are included.
*/
static DEVICE_INLINE void
multiply_and_add_lsb_neg_modulus_raw_device(const ff_storage& as, ff_storage& cs, ff_storage& rs)
multiply_and_add_lsb_neg_modulus_raw_device(const ff_storage& as, const ff_storage& cs, ff_storage& rs)
{
ff_storage bs = get_neg_modulus();
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
uint32_t* c = cs.limbs;
const uint32_t* c = cs.limbs;
uint32_t* even = rs.limbs;

if constexpr (TLC > 2) {
@@ -674,15 +673,15 @@
}

static HOST_DEVICE_INLINE void
multiply_and_add_lsb_neg_modulus_raw(const ff_storage& as, ff_storage& cs, ff_storage& rs)
multiply_and_add_lsb_neg_modulus_raw(const ff_storage& as, const ff_storage& cs, ff_storage& rs)
{
#ifdef __CUDA_ARCH__
return multiply_and_add_lsb_neg_modulus_raw_device(as, cs, rs);
#else
Wide r_wide = {};
host_math::template multiply_raw<TLC>(as, get_neg_modulus(), r_wide.limbs_storage);
Field r = Wide::get_lower(r_wide);
add_limbs<TLC, false>(cs, r.limbs_storage, rs);
const Field& r_low_view = r_wide.get_lower_view();
add_limbs<TLC, false>(cs, r_low_view.limbs_storage, rs);
#endif
}

@@ -806,16 +805,17 @@
Field xs_hi = Wide::get_higher_with_slack(xs);
Wide l = {};
multiply_msb_raw(xs_hi.limbs_storage, get_m(), l.limbs_storage); // MSB mult by `m`
Field l_hi = Wide::get_higher(l);
// Note: taking views is zero copy but unsafe
const Field& l_hi = l.get_higher_view();
const Field& xs_lo = xs.get_lower_view();
Field r = {};
Field xs_lo = Wide::get_lower(xs);
// Here we need to compute the lsb of `xs - l \cdot p` and to make use of fused multiply-and-add, we rewrite it as
// `xs + l \cdot (2^{32 \cdot TLC}-p)` which is the same as original (up to higher limbs which we don't care about).
multiply_and_add_lsb_neg_modulus_raw(l_hi.limbs_storage, xs_lo.limbs_storage, r.limbs_storage);
ff_storage r_reduced = {};
uint32_t carry = 0;
// As mentioned, either 2 or 1 reduction can be performed depending on the field in question.
if (num_of_reductions() == 2) {
if constexpr (num_of_reductions() == 2) {
carry = sub_limbs<TLC, true>(r.limbs_storage, get_modulus<2>(), r_reduced);
if (carry == 0) r = Field{r_reduced};
}
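// Why the rewrite in the comment above is sound: only the low TLC limbs of
// the result are kept, i.e. arithmetic is mod 2^(32*TLC). Since
// l * 2^(32*TLC) == 0 (mod 2^(32*TLC)), we have
//   xs - l*p == xs + l*(2^(32*TLC) - p)   (mod 2^(32*TLC)),
// so the precomputed negated modulus turns the subtraction into an addition
// that fuses with the multiplication (multiply_and_add_lsb_neg_modulus_raw).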
@@ -827,6 +827,7 @@

HOST_DEVICE Field& operator=(Field const& other)
{
#pragma unroll
for (int i = 0; i < TLC; i++) {
this->limbs_storage.limbs[i] = other.limbs_storage.limbs[i];
}
@@ -850,6 +851,7 @@
limbs_or |= x[i] ^ y[i];
return limbs_or == 0;
#else
#pragma unroll
for (unsigned i = 0; i < TLC; i++)
if (xs.limbs_storage.limbs[i] != ys.limbs_storage.limbs[i]) return false;
return true;
@@ -861,16 +863,18 @@
template <const Field& multiplier>
static HOST_DEVICE_INLINE Field mul_const(const Field& xs)
{
Field mul = multiplier;
static bool is_u32 = true;
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 1; i < TLC; i++)
is_u32 &= (mul.limbs_storage.limbs[i] == 0);

if (is_u32) return mul_unsigned<multiplier.limbs_storage.limbs[0], Field>(xs);
return mul * xs;
constexpr bool is_u32 = []() {
bool is_u32 = true;
for (unsigned i = 1; i < TLC; i++)
is_u32 &= (multiplier.limbs_storage.limbs[i] == 0);
return is_u32;
}();

if constexpr (is_u32) return mul_unsigned<multiplier.limbs_storage.limbs[0], Field>(xs);

// This is not really a copy but required for CUDA compilation since the template param is not in the device memory
Field mult = multiplier;
return mult * xs;
}

template <uint32_t multiplier, class T, unsigned REDUCTION_SIZE = 1>
@@ -881,6 +885,8 @@
bool is_zero = true;
#ifdef __CUDA_ARCH__
UNROLL
#else
#pragma unroll
#endif
for (unsigned i = 0; i < 32; i++) {
if (multiplier & (1 << i)) {
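The shift-and-add loop at the end of this file is the core of mul_unsigned; a hedged standalone rendering (simplified to plain integers, hypothetical helper name) shows why a compile-time multiplier makes it cheap: every bit test folds away at compile time, leaving only the adds for set bits.

#include <cstdint>

template <uint32_t MULT>
uint64_t mul_unsigned_sketch(uint64_t x)
{
  uint64_t result = 0;
#pragma unroll
  for (unsigned i = 0; i < 32; i++) {
    if (MULT & (1u << i)) result += (x << i); // double-and-add
  }
  return result; // e.g. MULT = 5 compiles down to x + (x << 2)
}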
10 changes: 10 additions & 0 deletions icicle/include/icicle/fields/host_math.h
@@ -132,6 +132,7 @@ namespace host_math {
{
uint32_t carry = 0;
carry_chain<NLIMBS, false, CARRY_OUT> chain;
#pragma unroll
for (unsigned i = 0; i < NLIMBS; i++)
r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
return CARRY_OUT ? carry : 0;
@@ -142,6 +143,7 @@
{
uint64_t carry = 0;
carry_chain<NLIMBS, false, CARRY_OUT> chain;
#pragma unroll
for (unsigned i = 0; i < NLIMBS / 2; i++)
r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
return CARRY_OUT ? carry : 0;
@@ -178,8 +180,10 @@
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
uint32_t* r = rs.limbs;
#pragma unroll
for (unsigned i = 0; i < NLIMBS_B; i++) {
uint32_t carry = 0;
#pragma unroll
for (unsigned j = 0; j < NLIMBS_A; j++)
r[j + i] = host_math::madc_cc(a[j], b[i], r[j + i], carry);
r[NLIMBS_A + i] = carry;
@@ -189,8 +193,10 @@
template <unsigned NLIMBS_A, unsigned NLIMBS_B = NLIMBS_A>
static HOST_INLINE void multiply_raw_64(const uint64_t* a, const uint64_t* b, uint64_t* r)
{
#pragma unroll
for (unsigned i = 0; i < NLIMBS_B / 2; i++) {
uint64_t carry = 0;
#pragma unroll
for (unsigned j = 0; j < NLIMBS_A / 2; j++)
r[j + i] = host_math::madc_cc_64(a[j], b[i], r[j + i], carry);
r[NLIMBS_A / 2 + i] = carry;
@@ -247,6 +253,7 @@
storage<NLIMBS> out{};
if constexpr (LIMBS_GAP < NLIMBS) {
out.limbs[LIMBS_GAP] = xs.limbs[0] << BITS32;
#pragma unroll
for (unsigned i = 1; i < NLIMBS - LIMBS_GAP; i++)
out.limbs[i + LIMBS_GAP] = (xs.limbs[i] << BITS32) + (xs.limbs[i - 1] >> (32 - BITS32));
}
@@ -264,6 +271,7 @@
constexpr unsigned LIMBS_GAP = BITS / 32;
storage<NLIMBS> out{};
if constexpr (LIMBS_GAP < NLIMBS - 1) {
#pragma unroll
for (unsigned i = 0; i < NLIMBS - LIMBS_GAP - 1; i++)
out.limbs[i] = (xs.limbs[i + LIMBS_GAP] >> BITS32) + (xs.limbs[i + LIMBS_GAP + 1] << (32 - BITS32));
}
@@ -281,7 +289,9 @@
const storage<NLIMBS_NUM>& num, const storage<NLIMBS_DENOM>& denom, storage<NLIMBS_Q>& q, storage<NLIMBS_DENOM>& r)
{
storage<NLIMBS_DENOM> temp = {};
#pragma unroll
for (int limb_idx = NLIMBS_NUM - 1; limb_idx >= 0; limb_idx--) {
#pragma unroll
for (int bit_idx = 31; bit_idx >= 0; bit_idx--) {
r = left_shift<NLIMBS_DENOM, 1>(r);
r.limbs[0] |= ((num.limbs[limb_idx] >> bit_idx) & 1);
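The hunks above unroll a schoolbook multiply; condensed into a standalone sketch (plain C++ with 64-bit intermediates standing in for the madc_cc helpers, hypothetical name):

#include <cstdint>

template <unsigned NLIMBS>
void multiply_raw_sketch(const uint32_t* a, const uint32_t* b, uint32_t* r) // r: 2*NLIMBS limbs, zero-initialized
{
  for (unsigned i = 0; i < NLIMBS; i++) {
    uint32_t carry = 0;
    for (unsigned j = 0; j < NLIMBS; j++) {
      const uint64_t t = uint64_t(a[j]) * b[i] + r[j + i] + carry; // multiply-add with carry-in
      r[j + i] = uint32_t(t);    // low 32 bits stay in place
      carry = uint32_t(t >> 32); // high 32 bits ripple forward
    }
    r[NLIMBS + i] = carry; // top limb of this partial product
  }
}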
9 changes: 4 additions & 5 deletions icicle/include/icicle/utils/modifiers.h
@@ -16,13 +16,12 @@
#define DEVICE_INLINE __device__ INLINE_MACRO
#define HOST_DEVICE __host__ __device__
#define HOST_DEVICE_INLINE HOST_DEVICE INLINE_MACRO
#else // not CUDA
#define INLINE_MACRO
#else // not NVCC
#define UNROLL
#define HOST_INLINE
#define DEVICE_INLINE
#define HOST_DEVICE
#define HOST_DEVICE_INLINE
#define HOST_INLINE __attribute__((always_inline))
#define DEVICE_INLINE
#define HOST_DEVICE_INLINE HOST_INLINE
#define __host__
#define __device__
#endif
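On the host side, HOST_INLINE previously expanded to nothing, leaving inlining to the optimizer's heuristics; it now forces inlining of the hot field and EC helpers. A minimal illustration (GCC/Clang attribute, hypothetical function name):

#define HOST_INLINE_SKETCH __attribute__((always_inline)) inline

HOST_INLINE_SKETCH int fma32_sketch(int a, int b, int c) { return a * b + c; } // inlined at every call site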
