performance: (~3x) optimize field multiplier and EC-adder for improved MSM and ECNTT #693

Merged 2 commits on Dec 12, 2024
icicle/backend/cpu/include/ntt_cpu.h (22 changes: 17 additions & 5 deletions)
@@ -3,10 +3,17 @@
#include "icicle/utils/log.h"
#include "ntt_tasks_manager.h"
#include "ntt_utils.h"
// #include <_types/_uint32_t.h>
#include <cstdint>
#include <deque>

#ifdef CURVE_ID
#include "icicle/curves/curve_config.h"
using namespace curve_config;
#define IS_ECNTT std::is_same_v<E, projective_t>
#else
#define IS_ECNTT false
#endif

using namespace field_config;
using namespace icicle;

@@ -464,10 +471,15 @@ namespace ntt_cpu {
bool NttCpu<S, E>::compute_if_is_parallel(uint32_t logn, const NTTConfig<S>& config)
{
uint32_t log_batch_size = uint32_t(log2(config.batch_size));
uint32_t scalar_size = sizeof(S);
// for small scalars, the threshold for when it is faster to use parallel NTT is higher
if ((scalar_size >= 32 && (logn + log_batch_size) <= 13) || (scalar_size < 32 && (logn + log_batch_size) <= 16)) {
return false;
// For ecntt we want parallelism unless really small case
if constexpr (IS_ECNTT) {
return logn > 5;
} else {
uint32_t scalar_size = sizeof(S);
// for small scalars, the threshold for when it is faster to use parallel NTT is higher
if ((scalar_size >= 32 && (logn + log_batch_size) <= 13) || (scalar_size < 32 && (logn + log_batch_size) <= 16)) {
[Review comment from a Contributor] Just a question: does this depend on cpu type?

return false;
}
}
return true;
}
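
For reference, a standalone restatement of the heuristic above (a sketch, not the library API; the thresholds 5, 13 and 16 are taken directly from the diff and treated here as tuning constants, and the reviewer's question about CPU-type dependence applies to them as well):

```cpp
// Sketch: when is a parallel NTT worthwhile?
// ECNTT elements are full projective EC points, so parallelism pays off for
// anything beyond 2^5 points; for plain field NTTs the threshold depends on
// the scalar size and the total amount of work (logn + log_batch_size).
#include <cstdint>

bool is_parallel_ntt(uint32_t logn, uint32_t log_batch_size, uint32_t scalar_size_bytes, bool is_ecntt)
{
  if (is_ecntt) return logn > 5;                                     // ECNTT: parallel unless very small
  if (scalar_size_bytes >= 32) return (logn + log_batch_size) > 13;  // large scalars
  return (logn + log_batch_size) > 16;                               // small scalars: higher threshold
}
```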
icicle/include/icicle/curves/projective.h (2 changes: 1 addition & 1 deletion)
@@ -191,7 +191,7 @@ class Projective

Projective res = zero();

const int nof_windows = (SCALAR_FF::NBITS + window_size - 1) / window_size;
constexpr int nof_windows = (SCALAR_FF::NBITS + window_size - 1) / window_size;
bool res_is_not_zero = false;
for (int w = nof_windows - 1; w >= 0; w -= 1) {
// Extract the next window_size bits from the scalar
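The single change above turns the window count into a compile-time constant, so the loop over windows has a constant bound. A minimal sketch of what is being computed (the NBITS and window_size values are illustrative assumptions, not taken from the PR):

```cpp
// ceil(NBITS / window_size), now evaluated at compile time.
constexpr unsigned NBITS = 254;       // assumed scalar-field bit length
constexpr unsigned window_size = 4;   // assumed window width
constexpr int nof_windows = (NBITS + window_size - 1) / window_size;
static_assert(nof_windows == 64, "known at compile time, so the w-loop can be unrolled");
```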
icicle/include/icicle/fields/field.h (92 changes: 49 additions & 43 deletions)
@@ -160,49 +160,48 @@ class Field
Wide out{};
#ifdef __CUDA_ARCH__
UNROLL
#else
#pragma unroll
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i];
return out;
}

static constexpr Field HOST_DEVICE_INLINE get_lower(const Wide& xs)
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i];
return out;
}

static constexpr Field HOST_DEVICE_INLINE get_higher(const Wide& xs)
// WARNING: taking views is zero copy but unsafe
constexpr const Field& get_lower_view() const { return *reinterpret_cast<const Field*>(limbs_storage.limbs); }
constexpr const Field& get_higher_view() const
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i + TLC];
return out;
return *reinterpret_cast<const Field*>(limbs_storage.limbs + TLC);
}

// This is not zero copy
static constexpr Field HOST_DEVICE_INLINE get_higher_with_slack(const Wide& xs)
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++) {
#ifdef __CUDA_ARCH__
out.limbs_storage.limbs[i] =
__funnelshift_lc(xs.limbs_storage.limbs[i + TLC - 1], xs.limbs_storage.limbs[i + TLC], 2 * slack_bits);
}
#else
out.limbs_storage.limbs[i] = (xs.limbs_storage.limbs[i + TLC] << 2 * slack_bits) +
(xs.limbs_storage.limbs[i + TLC - 1] >> (32 - 2 * slack_bits));
#endif
// CPU: for even number of limbs, read and shift 64b limbs, otherwise 32b
if constexpr (TLC % 2 == 0) {
#pragma unroll
for (unsigned i = 0; i < TLC / 2; i++) { // Ensure valid indexing
out.limbs_storage.limbs64[i] = (xs.limbs_storage.limbs64[i + TLC / 2] << 2 * slack_bits) |
(xs.limbs_storage.limbs64[i + TLC / 2 - 1] >> (64 - 2 * slack_bits));
}
} else {
#pragma unroll
for (unsigned i = 0; i < TLC; i++) { // Ensure valid indexing
out.limbs_storage.limbs[i] = (xs.limbs_storage.limbs[i + TLC] << 2 * slack_bits) +
(xs.limbs_storage.limbs[i + TLC - 1] >> (32 - 2 * slack_bits));
}
}
#endif

return out;
}
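
A small standalone check of the even-TLC fast path above (a sketch, not library code; it assumes a little-endian host, so two adjacent 32-bit limbs alias one 64-bit limb as the limbs64 view implies, and 0 < 2 * slack_bits < 32, which the 32-bit formulation already requires):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

int main()
{
  constexpr unsigned TLC = 4;    // number of 32-bit limbs in a Field (even here)
  constexpr unsigned shift = 6;  // stands in for 2 * slack_bits

  // A Wide value: 2 * TLC little-endian 32-bit limbs.
  uint32_t limbs[2 * TLC] = {0x11111111, 0x22222222, 0x33333333, 0x44444444,
                             0x55555555, 0x66666666, 0x77777777, 0x88888888};

  // 32-bit formulation (the odd-TLC path in the PR).
  uint32_t out32[TLC];
  for (unsigned i = 0; i < TLC; i++)
    out32[i] = (limbs[i + TLC] << shift) | (limbs[i + TLC - 1] >> (32 - shift));

  // 64-bit formulation (the even-TLC path): half as many shifts over aliased 64-bit limbs.
  uint64_t limbs64[TLC];
  std::memcpy(limbs64, limbs, sizeof(limbs)); // little-endian aliasing of limb pairs
  uint64_t out64[TLC / 2];
  for (unsigned i = 0; i < TLC / 2; i++)
    out64[i] = (limbs64[i + TLC / 2] << shift) | (limbs64[i + TLC / 2 - 1] >> (64 - shift));

  // Both paths produce the same limbs.
  uint32_t out64_as_32[TLC];
  std::memcpy(out64_as_32, out64, sizeof(out64));
  for (unsigned i = 0; i < TLC; i++)
    assert(out32[i] == out64_as_32[i]);
  return 0;
}
```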

@@ -440,7 +439,7 @@ class Field
}

static DEVICE_INLINE uint32_t
mul_n_and_add(uint32_t* acc, const uint32_t* a, uint32_t bi, uint32_t* extra, size_t n = (TLC >> 1))
mul_n_and_add(uint32_t* acc, const uint32_t* a, uint32_t bi, const uint32_t* extra, size_t n = (TLC >> 1))
{
acc[0] = ptx::mad_lo_cc(a[0], bi, extra[0]);

@@ -505,12 +504,12 @@ class Field
* limb products are included.
*/
static DEVICE_INLINE void
multiply_and_add_lsb_neg_modulus_raw_device(const ff_storage& as, ff_storage& cs, ff_storage& rs)
multiply_and_add_lsb_neg_modulus_raw_device(const ff_storage& as, const ff_storage& cs, ff_storage& rs)
{
ff_storage bs = get_neg_modulus();
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
uint32_t* c = cs.limbs;
const uint32_t* c = cs.limbs;
uint32_t* even = rs.limbs;

if constexpr (TLC > 2) {
@@ -674,15 +673,15 @@ class Field
}

static HOST_DEVICE_INLINE void
multiply_and_add_lsb_neg_modulus_raw(const ff_storage& as, ff_storage& cs, ff_storage& rs)
multiply_and_add_lsb_neg_modulus_raw(const ff_storage& as, const ff_storage& cs, ff_storage& rs)
{
#ifdef __CUDA_ARCH__
return multiply_and_add_lsb_neg_modulus_raw_device(as, cs, rs);
#else
Wide r_wide = {};
host_math::template multiply_raw<TLC>(as, get_neg_modulus(), r_wide.limbs_storage);
Field r = Wide::get_lower(r_wide);
add_limbs<TLC, false>(cs, r.limbs_storage, rs);
const Field& r_low_view = r_wide.get_lower_view();
add_limbs<TLC, false>(cs, r_low_view.limbs_storage, rs);
#endif
}
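
A minimal sketch of the zero-copy view pattern used here and introduced in `get_lower_view`/`get_higher_view` above (illustrative types, not the library's; it relies on Field and the halves of Wide having identical limb layout, which is why the PR labels the views "zero copy but unsafe"):

```cpp
#include <cstdint>

constexpr unsigned TLC = 4;

struct Field { uint32_t limbs[TLC]; };

struct Wide {
  uint32_t limbs[2 * TLC];
  // Reinterpret the low/high half of the wide value as a Field without copying.
  // Strictly speaking this is an aliasing violation, hence "unsafe".
  const Field& get_lower_view() const { return *reinterpret_cast<const Field*>(limbs); }
  const Field& get_higher_view() const { return *reinterpret_cast<const Field*>(limbs + TLC); }
};

int main()
{
  Wide w{{1, 2, 3, 4, 5, 6, 7, 8}};
  const Field& lo = w.get_lower_view();   // aliases limbs[0..3], no copy
  const Field& hi = w.get_higher_view();  // aliases limbs[4..7], no copy
  return (lo.limbs[0] == 1 && hi.limbs[0] == 5) ? 0 : 1;
}
```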

@@ -806,16 +805,17 @@ class Field
Field xs_hi = Wide::get_higher_with_slack(xs);
Wide l = {};
multiply_msb_raw(xs_hi.limbs_storage, get_m(), l.limbs_storage); // MSB mult by `m`
Field l_hi = Wide::get_higher(l);
// Note: taking views is zero copy but unsafe
const Field& l_hi = l.get_higher_view();
const Field& xs_lo = xs.get_lower_view();
Field r = {};
Field xs_lo = Wide::get_lower(xs);
// Here we need to compute the lsb of `xs - l \cdot p` and to make use of fused multiply-and-add, we rewrite it as
// `xs + l \cdot (2^{32 \cdot TLC}-p)` which is the same as original (up to higher limbs which we don't care about).
multiply_and_add_lsb_neg_modulus_raw(l_hi.limbs_storage, xs_lo.limbs_storage, r.limbs_storage);
ff_storage r_reduced = {};
uint32_t carry = 0;
// As mentioned, either 2 or 1 reduction can be performed depending on the field in question.
if (num_of_reductions() == 2) {
if constexpr (num_of_reductions() == 2) {
carry = sub_limbs<TLC, true>(r.limbs_storage, get_modulus<2>(), r_reduced);
if (carry == 0) r = Field{r_reduced};
}
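
Spelling out the comment above as an equation (a sketch of the reasoning, using the notation from the comment): only the lowest $32 \cdot TLC$ bits of the result are kept, so subtracting $l \cdot p$ can be replaced by adding $l$ times the negated modulus,

$$ xs - l \cdot p \;\equiv\; xs + l \cdot \bigl(2^{32 \cdot TLC} - p\bigr) \pmod{2^{32 \cdot TLC}}, $$

which is why `multiply_and_add_lsb_neg_modulus_raw` multiplies by `get_neg_modulus()` and adds, instead of multiplying by the modulus and subtracting.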
@@ -827,6 +827,7 @@ class Field

HOST_DEVICE Field& operator=(Field const& other)
{
#pragma unroll
for (int i = 0; i < TLC; i++) {
this->limbs_storage.limbs[i] = other.limbs_storage.limbs[i];
}
@@ -850,6 +851,7 @@ class Field
limbs_or |= x[i] ^ y[i];
return limbs_or == 0;
#else
#pragma unroll
for (unsigned i = 0; i < TLC; i++)
if (xs.limbs_storage.limbs[i] != ys.limbs_storage.limbs[i]) return false;
return true;
@@ -861,16 +863,18 @@ class Field
template <const Field& multiplier>
static HOST_DEVICE_INLINE Field mul_const(const Field& xs)
{
Field mul = multiplier;
static bool is_u32 = true;
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 1; i < TLC; i++)
is_u32 &= (mul.limbs_storage.limbs[i] == 0);

if (is_u32) return mul_unsigned<multiplier.limbs_storage.limbs[0], Field>(xs);
return mul * xs;
constexpr bool is_u32 = []() {
bool is_u32 = true;
for (unsigned i = 1; i < TLC; i++)
is_u32 &= (multiplier.limbs_storage.limbs[i] == 0);
return is_u32;
}();

if constexpr (is_u32) return mul_unsigned<multiplier.limbs_storage.limbs[0], Field>(xs);

// This is not really a copy but required for CUDA compilation since the template param is not in the device memory
Field mult = multiplier;
return mult * xs;
}
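
A minimal sketch of the compile-time pattern adopted above (illustrative only; four explicit limb parameters stand in for the `const Field& multiplier` template parameter): an immediately-invoked constexpr lambda folds the "all higher limbs are zero" check into a constexpr bool, so `if constexpr` can select the cheap single-limb path with no runtime check.

```cpp
#include <cstdint>

template <uint32_t L0, uint32_t L1, uint32_t L2, uint32_t L3>
uint64_t mul_const_sketch(uint32_t x)
{
  static constexpr uint32_t limbs[4] = {L0, L1, L2, L3};

  // Immediately-invoked constexpr lambda: evaluated once, at compile time.
  constexpr bool is_u32 = []() {
    bool only_low_limb = true;
    for (unsigned i = 1; i < 4; i++)
      only_low_limb &= (limbs[i] == 0);
    return only_low_limb;
  }();

  if constexpr (is_u32) {
    return uint64_t(x) * limbs[0]; // single-limb product, chosen at compile time
  } else {
    uint64_t acc = 0;              // placeholder for the general multi-limb multiplication
    for (unsigned i = 0; i < 4; i++)
      acc += uint64_t(x) * limbs[i];
    return acc;
  }
}

int main() { return mul_const_sketch<3, 0, 0, 0>(7) == 21 ? 0 : 1; }
```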

template <uint32_t multiplier, class T, unsigned REDUCTION_SIZE = 1>
Expand All @@ -881,6 +885,8 @@ class Field
bool is_zero = true;
#ifdef __CUDA_ARCH__
UNROLL
#else
#pragma unroll
#endif
for (unsigned i = 0; i < 32; i++) {
if (multiplier & (1 << i)) {
icicle/include/icicle/fields/host_math.h (10 changes: 10 additions & 0 deletions)
@@ -132,6 +132,7 @@ namespace host_math {
{
uint32_t carry = 0;
carry_chain<NLIMBS, false, CARRY_OUT> chain;
#pragma unroll
for (unsigned i = 0; i < NLIMBS; i++)
r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
return CARRY_OUT ? carry : 0;
@@ -142,6 +143,7 @@ namespace host_math {
{
uint64_t carry = 0;
carry_chain<NLIMBS, false, CARRY_OUT> chain;
#pragma unroll
for (unsigned i = 0; i < NLIMBS / 2; i++)
r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
return CARRY_OUT ? carry : 0;
@@ -178,8 +180,10 @@ namespace host_math {
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
uint32_t* r = rs.limbs;
#pragma unroll
for (unsigned i = 0; i < NLIMBS_B; i++) {
uint32_t carry = 0;
#pragma unroll
for (unsigned j = 0; j < NLIMBS_A; j++)
r[j + i] = host_math::madc_cc(a[j], b[i], r[j + i], carry);
r[NLIMBS_A + i] = carry;
@@ -189,8 +193,10 @@ namespace host_math {
template <unsigned NLIMBS_A, unsigned NLIMBS_B = NLIMBS_A>
static HOST_INLINE void multiply_raw_64(const uint64_t* a, const uint64_t* b, uint64_t* r)
{
#pragma unroll
for (unsigned i = 0; i < NLIMBS_B / 2; i++) {
uint64_t carry = 0;
#pragma unroll
for (unsigned j = 0; j < NLIMBS_A / 2; j++)
r[j + i] = host_math::madc_cc_64(a[j], b[i], r[j + i], carry);
r[NLIMBS_A / 2 + i] = carry;
@@ -247,6 +253,7 @@ namespace host_math {
storage<NLIMBS> out{};
if constexpr (LIMBS_GAP < NLIMBS) {
out.limbs[LIMBS_GAP] = xs.limbs[0] << BITS32;
#pragma unroll
for (unsigned i = 1; i < NLIMBS - LIMBS_GAP; i++)
out.limbs[i + LIMBS_GAP] = (xs.limbs[i] << BITS32) + (xs.limbs[i - 1] >> (32 - BITS32));
}
@@ -264,6 +271,7 @@ namespace host_math {
constexpr unsigned LIMBS_GAP = BITS / 32;
storage<NLIMBS> out{};
if constexpr (LIMBS_GAP < NLIMBS - 1) {
#pragma unroll
for (unsigned i = 0; i < NLIMBS - LIMBS_GAP - 1; i++)
out.limbs[i] = (xs.limbs[i + LIMBS_GAP] >> BITS32) + (xs.limbs[i + LIMBS_GAP + 1] << (32 - BITS32));
}
@@ -281,7 +289,9 @@ namespace host_math {
const storage<NLIMBS_NUM>& num, const storage<NLIMBS_DENOM>& denom, storage<NLIMBS_Q>& q, storage<NLIMBS_DENOM>& r)
{
storage<NLIMBS_DENOM> temp = {};
#pragma unroll
for (int limb_idx = NLIMBS_NUM - 1; limb_idx >= 0; limb_idx--) {
#pragma unroll
for (int bit_idx = 31; bit_idx >= 0; bit_idx--) {
r = left_shift<NLIMBS_DENOM, 1>(r);
r.limbs[0] |= ((num.limbs[limb_idx] >> bit_idx) & 1);
icicle/include/icicle/utils/modifiers.h (9 changes: 4 additions & 5 deletions)
@@ -16,13 +16,12 @@
#define DEVICE_INLINE __device__ INLINE_MACRO
#define HOST_DEVICE __host__ __device__
#define HOST_DEVICE_INLINE HOST_DEVICE INLINE_MACRO
#else // not CUDA
#define INLINE_MACRO
#else // not NVCC
#define UNROLL
#define HOST_INLINE
#define DEVICE_INLINE
#define HOST_DEVICE
#define HOST_DEVICE_INLINE
#define HOST_INLINE __attribute__((always_inline))
#define DEVICE_INLINE
#define HOST_DEVICE_INLINE HOST_INLINE
#define __host__
#define __device__
#endif