diff --git a/icicle/backend/cpu/include/ntt_cpu.h b/icicle/backend/cpu/include/ntt_cpu.h
index a86c5b6ef..7d7e2d386 100644
--- a/icicle/backend/cpu/include/ntt_cpu.h
+++ b/icicle/backend/cpu/include/ntt_cpu.h
@@ -3,10 +3,17 @@
 #include "icicle/utils/log.h"
 #include "ntt_tasks_manager.h"
 #include "ntt_utils.h"
-// #include <_types/_uint32_t.h>
 #include
 #include
+#ifdef CURVE_ID
+  #include "icicle/curves/curve_config.h"
+using namespace curve_config;
+  #define IS_ECNTT std::is_same_v<E, projective_t>
+#else
+  #define IS_ECNTT false
+#endif
+
 using namespace field_config;
 using namespace icicle;
@@ -464,10 +471,15 @@ namespace ntt_cpu {
   bool NttCpu<S, E>::compute_if_is_parallel(uint32_t logn, const NTTConfig<S>& config)
   {
     uint32_t log_batch_size = uint32_t(log2(config.batch_size));
-    uint32_t scalar_size = sizeof(S);
-    // for small scalars, the threshold for when it is faster to use parallel NTT is higher
-    if ((scalar_size >= 32 && (logn + log_batch_size) <= 13) || (scalar_size < 32 && (logn + log_batch_size) <= 16)) {
-      return false;
+    // For ECNTT we want parallelism unless the case is really small
+    if constexpr (IS_ECNTT) {
+      return logn > 5;
+    } else {
+      uint32_t scalar_size = sizeof(S);
+      // for small scalars, the threshold for when it is faster to use parallel NTT is higher
+      if ((scalar_size >= 32 && (logn + log_batch_size) <= 13) || (scalar_size < 32 && (logn + log_batch_size) <= 16)) {
+        return false;
+      }
     }
     return true;
   }
diff --git a/icicle/include/icicle/curves/projective.h b/icicle/include/icicle/curves/projective.h
index 01d439c64..87fe0ab23 100644
--- a/icicle/include/icicle/curves/projective.h
+++ b/icicle/include/icicle/curves/projective.h
@@ -191,7 +191,7 @@ class Projective
     Projective res = zero();
-    const int nof_windows = (SCALAR_FF::NBITS + window_size - 1) / window_size;
+    constexpr int nof_windows = (SCALAR_FF::NBITS + window_size - 1) / window_size;
     bool res_is_not_zero = false;
     for (int w = nof_windows - 1; w >= 0; w -= 1) {
       // Extract the next window_size bits from the scalar
diff --git a/icicle/include/icicle/fields/field.h b/icicle/include/icicle/fields/field.h
index 123b37baf..ccade81ad 100644
--- a/icicle/include/icicle/fields/field.h
+++ b/icicle/include/icicle/fields/field.h
@@ -160,49 +160,48 @@ class Field
     Wide out{};
 #ifdef __CUDA_ARCH__
     UNROLL
+#else
+  #pragma unroll
 #endif
     for (unsigned i = 0; i < TLC; i++)
       out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i];
     return out;
   }

-  static constexpr Field HOST_DEVICE_INLINE get_lower(const Wide& xs)
-  {
-    Field out{};
-#ifdef __CUDA_ARCH__
-    UNROLL
-#endif
-    for (unsigned i = 0; i < TLC; i++)
-      out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i];
-    return out;
-  }
-
-  static constexpr Field HOST_DEVICE_INLINE get_higher(const Wide& xs)
+  // WARNING: taking views is zero copy but unsafe
+  constexpr const Field& get_lower_view() const { return *reinterpret_cast<const Field*>(limbs_storage.limbs); }
+  constexpr const Field& get_higher_view() const
   {
-    Field out{};
-#ifdef __CUDA_ARCH__
-    UNROLL
-#endif
-    for (unsigned i = 0; i < TLC; i++)
-      out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i + TLC];
-    return out;
+    return *reinterpret_cast<const Field*>(limbs_storage.limbs + TLC);
   }

+  // This is not zero copy
   static constexpr Field HOST_DEVICE_INLINE get_higher_with_slack(const Wide& xs)
   {
     Field out{};
 #ifdef __CUDA_ARCH__
     UNROLL
-#endif
     for (unsigned i = 0; i < TLC; i++) {
-#ifdef __CUDA_ARCH__
       out.limbs_storage.limbs[i] =
         __funnelshift_lc(xs.limbs_storage.limbs[i + TLC - 1], xs.limbs_storage.limbs[i + TLC], 2 * slack_bits);
+    }
 #else
-      out.limbs_storage.limbs[i] = (xs.limbs_storage.limbs[i + TLC] << 2 * slack_bits) +
-                                   (xs.limbs_storage.limbs[i + TLC - 1] >> (32 - 2 * slack_bits));
-#endif
+    // CPU: for even number of limbs, read and shift 64b limbs, otherwise 32b
+    if constexpr (TLC % 2 == 0) {
+  #pragma unroll
+      for (unsigned i = 0; i < TLC / 2; i++) { // Ensure valid indexing
+        out.limbs_storage.limbs64[i] = (xs.limbs_storage.limbs64[i + TLC / 2] << 2 * slack_bits) |
+                                       (xs.limbs_storage.limbs64[i + TLC / 2 - 1] >> (64 - 2 * slack_bits));
+      }
+    } else {
+  #pragma unroll
+      for (unsigned i = 0; i < TLC; i++) { // Ensure valid indexing
+        out.limbs_storage.limbs[i] = (xs.limbs_storage.limbs[i + TLC] << 2 * slack_bits) +
+                                     (xs.limbs_storage.limbs[i + TLC - 1] >> (32 - 2 * slack_bits));
+      }
     }
+#endif
+
     return out;
   }
@@ -440,7 +439,7 @@ class Field
   }

   static DEVICE_INLINE uint32_t
-  mul_n_and_add(uint32_t* acc, const uint32_t* a, uint32_t bi, uint32_t* extra, size_t n = (TLC >> 1))
+  mul_n_and_add(uint32_t* acc, const uint32_t* a, uint32_t bi, const uint32_t* extra, size_t n = (TLC >> 1))
   {
     acc[0] = ptx::mad_lo_cc(a[0], bi, extra[0]);
@@ -505,12 +504,12 @@ class Field
    * limb products are included.
    */
   static DEVICE_INLINE void
-  multiply_and_add_lsb_neg_modulus_raw_device(const ff_storage& as, ff_storage& cs, ff_storage& rs)
+  multiply_and_add_lsb_neg_modulus_raw_device(const ff_storage& as, const ff_storage& cs, ff_storage& rs)
   {
     ff_storage bs = get_neg_modulus();
     const uint32_t* a = as.limbs;
     const uint32_t* b = bs.limbs;
-    uint32_t* c = cs.limbs;
+    const uint32_t* c = cs.limbs;
     uint32_t* even = rs.limbs;

     if constexpr (TLC > 2) {
@@ -674,15 +673,15 @@ class Field
   }

   static HOST_DEVICE_INLINE void
-  multiply_and_add_lsb_neg_modulus_raw(const ff_storage& as, ff_storage& cs, ff_storage& rs)
+  multiply_and_add_lsb_neg_modulus_raw(const ff_storage& as, const ff_storage& cs, ff_storage& rs)
   {
 #ifdef __CUDA_ARCH__
     return multiply_and_add_lsb_neg_modulus_raw_device(as, cs, rs);
 #else
     Wide r_wide = {};
     host_math::template multiply_raw<TLC>(as, get_neg_modulus(), r_wide.limbs_storage);
-    Field r = Wide::get_lower(r_wide);
-    add_limbs(cs, r.limbs_storage, rs);
+    const Field& r_low_view = r_wide.get_lower_view();
+    add_limbs(cs, r_low_view.limbs_storage, rs);
 #endif
   }
@@ -806,16 +805,17 @@ class Field
     Field xs_hi = Wide::get_higher_with_slack(xs);
     Wide l = {};
     multiply_msb_raw(xs_hi.limbs_storage, get_m(), l.limbs_storage); // MSB mult by `m`
-    Field l_hi = Wide::get_higher(l);
+    // Note: taking views is zero copy but unsafe
+    const Field& l_hi = l.get_higher_view();
+    const Field& xs_lo = xs.get_lower_view();
     Field r = {};
-    Field xs_lo = Wide::get_lower(xs);
     // Here we need to compute the lsb of `xs - l \cdot p` and to make use of fused multiply-and-add, we rewrite it as
     // `xs + l \cdot (2^{32 \cdot TLC}-p)` which is the same as original (up to higher limbs which we don't care about).
     multiply_and_add_lsb_neg_modulus_raw(l_hi.limbs_storage, xs_lo.limbs_storage, r.limbs_storage);
     ff_storage r_reduced = {};
     uint32_t carry = 0;
     // As mentioned, either 2 or 1 reduction can be performed depending on the field in question.
-    if (num_of_reductions() == 2) {
+    if constexpr (num_of_reductions() == 2) {
       carry = sub_limbs(r.limbs_storage, get_modulus<2>(), r_reduced);
       if (carry == 0) r = Field{r_reduced};
     }
@@ -827,6 +827,7 @@ class Field
   HOST_DEVICE Field& operator=(Field const& other)
   {
+#pragma unroll
     for (int i = 0; i < TLC; i++) {
       this->limbs_storage.limbs[i] = other.limbs_storage.limbs[i];
     }
@@ -850,6 +851,7 @@ class Field
       limbs_or |= x[i] ^ y[i];
     return limbs_or == 0;
 #else
+  #pragma unroll
     for (unsigned i = 0; i < TLC; i++)
       if (xs.limbs_storage.limbs[i] != ys.limbs_storage.limbs[i]) return false;
     return true;
@@ -861,16 +863,18 @@ class Field
   template
   static HOST_DEVICE_INLINE Field mul_const(const Field& xs)
   {
-    Field mul = multiplier;
-    static bool is_u32 = true;
-#ifdef __CUDA_ARCH__
-    UNROLL
-#endif
-    for (unsigned i = 1; i < TLC; i++)
-      is_u32 &= (mul.limbs_storage.limbs[i] == 0);
-
-    if (is_u32) return mul_unsigned(xs);
-    return mul * xs;
+    constexpr bool is_u32 = []() {
+      bool is_u32 = true;
+      for (unsigned i = 1; i < TLC; i++)
+        is_u32 &= (multiplier.limbs_storage.limbs[i] == 0);
+      return is_u32;
+    }();
+
+    if constexpr (is_u32) return mul_unsigned(xs);
+
+    // This is not really a copy but required for CUDA compilation since the template param is not in the device memory
+    Field mult = multiplier;
+    return mult * xs;
   }

   template
@@ -881,6 +885,8 @@ class Field
     bool is_zero = true;
 #ifdef __CUDA_ARCH__
     UNROLL
+#else
+  #pragma unroll
 #endif
     for (unsigned i = 0; i < 32; i++) {
       if (multiplier & (1 << i)) {
diff --git a/icicle/include/icicle/fields/host_math.h b/icicle/include/icicle/fields/host_math.h
index 90162caec..277a659d8 100644
--- a/icicle/include/icicle/fields/host_math.h
+++ b/icicle/include/icicle/fields/host_math.h
@@ -132,6 +132,7 @@
   {
     uint32_t carry = 0;
     carry_chain chain;
+#pragma unroll
     for (unsigned i = 0; i < NLIMBS; i++)
       r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
     return CARRY_OUT ? carry : 0;
@@ -142,6 +143,7 @@
   {
     uint64_t carry = 0;
     carry_chain chain;
+#pragma unroll
     for (unsigned i = 0; i < NLIMBS / 2; i++)
       r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
     return CARRY_OUT ? carry : 0;
@@ -178,8 +180,10 @@
     const uint32_t* a = as.limbs;
     const uint32_t* b = bs.limbs;
     uint32_t* r = rs.limbs;
+#pragma unroll
     for (unsigned i = 0; i < NLIMBS_B; i++) {
       uint32_t carry = 0;
+#pragma unroll
       for (unsigned j = 0; j < NLIMBS_A; j++)
         r[j + i] = host_math::madc_cc(a[j], b[i], r[j + i], carry);
       r[NLIMBS_A + i] = carry;
@@ -189,8 +193,10 @@
   template
   static HOST_INLINE void multiply_raw_64(const uint64_t* a, const uint64_t* b, uint64_t* r)
   {
+#pragma unroll
     for (unsigned i = 0; i < NLIMBS_B / 2; i++) {
       uint64_t carry = 0;
+#pragma unroll
       for (unsigned j = 0; j < NLIMBS_A / 2; j++)
         r[j + i] = host_math::madc_cc_64(a[j], b[i], r[j + i], carry);
       r[NLIMBS_A / 2 + i] = carry;
@@ -247,6 +253,7 @@
     storage out{};
     if constexpr (LIMBS_GAP < NLIMBS) {
       out.limbs[LIMBS_GAP] = xs.limbs[0] << BITS32;
+#pragma unroll
       for (unsigned i = 1; i < NLIMBS - LIMBS_GAP; i++)
         out.limbs[i + LIMBS_GAP] = (xs.limbs[i] << BITS32) + (xs.limbs[i - 1] >> (32 - BITS32));
     }
@@ -264,6 +271,7 @@
     constexpr unsigned LIMBS_GAP = BITS / 32;
     storage out{};
     if constexpr (LIMBS_GAP < NLIMBS - 1) {
+#pragma unroll
       for (unsigned i = 0; i < NLIMBS - LIMBS_GAP - 1; i++)
         out.limbs[i] = (xs.limbs[i + LIMBS_GAP] >> BITS32) + (xs.limbs[i + LIMBS_GAP + 1] << (32 - BITS32));
     }
@@ -281,7 +289,9 @@
     const storage& num, const storage& denom, storage& q, storage& r)
   {
     storage temp = {};
+#pragma unroll
     for (int limb_idx = NLIMBS_NUM - 1; limb_idx >= 0; limb_idx--) {
+#pragma unroll
       for (int bit_idx = 31; bit_idx >= 0; bit_idx--) {
         r = left_shift(r);
         r.limbs[0] |= ((num.limbs[limb_idx] >> bit_idx) & 1);
diff --git a/icicle/include/icicle/utils/modifiers.h b/icicle/include/icicle/utils/modifiers.h
index b652e9829..5abf6781f 100644
--- a/icicle/include/icicle/utils/modifiers.h
+++ b/icicle/include/icicle/utils/modifiers.h
@@ -16,13 +16,12 @@
   #define DEVICE_INLINE __device__ INLINE_MACRO
   #define HOST_DEVICE __host__ __device__
   #define HOST_DEVICE_INLINE HOST_DEVICE INLINE_MACRO
-#else // not CUDA
-  #define INLINE_MACRO
+#else // not NVCC
   #define UNROLL
-  #define HOST_INLINE
-  #define DEVICE_INLINE
   #define HOST_DEVICE
-  #define HOST_DEVICE_INLINE
+  #define HOST_INLINE __attribute__((always_inline))
+  #define DEVICE_INLINE
+  #define HOST_DEVICE_INLINE HOST_INLINE
   #define __host__
   #define __device__
 #endif
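
For context, the standalone sketch below illustrates the zero-copy "view" technique that get_lower_view()/get_higher_view() introduce in the Wide type above. It is not part of the patch; WideDemo, HalfDemo and LIMBS are hypothetical stand-ins for the library's Wide, Field and TLC, assuming the same plain-array-of-uint32_t limb layout.

// Illustrative sketch only (not library code): a double-width value exposes its
// low and high halves by reinterpreting its own storage instead of copying limbs.
#include <cstdint>
#include <cstdio>

constexpr unsigned LIMBS = 4; // stand-in for TLC (limbs of a single element)

struct HalfDemo {
  uint32_t limbs[LIMBS];
};

struct WideDemo {
  uint32_t limbs[2 * LIMBS]; // double-width value, low half first

  // Reinterpret half of the wide storage as a HalfDemo without copying limbs.
  // The returned reference aliases WideDemo's own storage and relies on both
  // types having identical uint32_t-array layout, which is why the patch marks
  // the views as "zero copy but unsafe".
  const HalfDemo& lower_view() const { return *reinterpret_cast<const HalfDemo*>(limbs); }
  const HalfDemo& higher_view() const { return *reinterpret_cast<const HalfDemo*>(limbs + LIMBS); }
};

int main()
{
  WideDemo w = {{1, 2, 3, 4, 5, 6, 7, 8}};
  const HalfDemo& lo = w.lower_view();  // limbs 0..3, no copy
  const HalfDemo& hi = w.higher_view(); // limbs 4..7, no copy
  std::printf("lo[0]=%u hi[0]=%u\n", (unsigned)lo.limbs[0], (unsigned)hi.limbs[0]);
  return 0;
}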