performance: (~3x) optimized field multiplier and EC-adder for improved MSM and ECNTT (#693)

- fix: ECNTT with 5 < logn < 14 was single-threaded; it is now multi-threaded.
- This PR uses inlining and loop unrolling to improve CPU performance for scalar multiplication, scalar-by-EC-point multiplication, and EC addition, and consequently ECNTT and MSM (a sketch of the pattern follows below).
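A minimal sketch of the optimization pattern (hypothetical example, not the actual icicle code; assumes GCC/Clang attributes): forcing inlining and fully unrolling a fixed-trip-count limb loop lets the compiler keep limbs in registers and remove loop overhead.

#include <cstdint>

constexpr unsigned TLC = 8; // assumed limb count, for illustration only

__attribute__((always_inline)) static inline void add_limbs_sketch(uint32_t* r, const uint32_t* a, const uint32_t* b)
{
  uint64_t carry = 0;
#pragma unroll
  for (unsigned i = 0; i < TLC; i++) { // fixed bound -> fully unrollable
    carry += uint64_t(a[i]) + b[i];
    r[i] = uint32_t(carry); // low 32 bits
    carry >>= 32;           // ripple the carry
  }
}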
yshekel authored Dec 12, 2024
1 parent da16082 commit bef4371
Showing 5 changed files with 81 additions and 54 deletions.
22 changes: 17 additions & 5 deletions icicle/backend/cpu/include/ntt_cpu.h
@@ -3,10 +3,17 @@
#include "icicle/utils/log.h"
#include "ntt_tasks_manager.h"
#include "ntt_utils.h"
// #include <_types/_uint32_t.h>
#include <cstdint>
#include <deque>

#ifdef CURVE_ID
#include "icicle/curves/curve_config.h"
using namespace curve_config;
#define IS_ECNTT std::is_same_v<E, projective_t>
#else
#define IS_ECNTT false
#endif

using namespace field_config;
using namespace icicle;

@@ -464,10 +471,15 @@ namespace ntt_cpu {
bool NttCpu<S, E>::compute_if_is_parallel(uint32_t logn, const NTTConfig<S>& config)
{
uint32_t log_batch_size = uint32_t(log2(config.batch_size));
uint32_t scalar_size = sizeof(S);
// for small scalars, the threshold for when it is faster to use parallel NTT is higher
if ((scalar_size >= 32 && (logn + log_batch_size) <= 13) || (scalar_size < 32 && (logn + log_batch_size) <= 16)) {
return false;
// For ecntt we want parallelism unless really small case
if constexpr (IS_ECNTT) {
return logn > 5;
} else {
uint32_t scalar_size = sizeof(S);
// for small scalars, the threshold for when it is faster to use parallel NTT is higher
if ((scalar_size >= 32 && (logn + log_batch_size) <= 13) || (scalar_size < 32 && (logn + log_batch_size) <= 16)) {
return false;
}
}
return true;
}
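For intuition, here is the new heuristic condensed into a standalone predicate (an illustrative sketch with simplified names, not the code above verbatim). EC points are much larger than scalars, so ECNTT benefits from threading at far smaller sizes:

#include <cstdint>

bool is_parallel_sketch(bool is_ecntt, uint32_t scalar_size, uint32_t logn, uint32_t log_batch_size)
{
  if (is_ecntt) return logn > 5; // previously single-threaded for 5 < logn < 14
  const uint32_t total = logn + log_batch_size;
  // for small scalars, parallelism pays off only at larger total sizes
  return (scalar_size >= 32) ? total > 13 : total > 16;
}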
2 changes: 1 addition & 1 deletion icicle/include/icicle/curves/projective.h
@@ -191,7 +191,7 @@ class Projective

Projective res = zero();

const int nof_windows = (SCALAR_FF::NBITS + window_size - 1) / window_size;
constexpr int nof_windows = (SCALAR_FF::NBITS + window_size - 1) / window_size;
bool res_is_not_zero = false;
for (int w = nof_windows - 1; w >= 0; w -= 1) {
// Extract the next window_size bits from the scalar
92 changes: 49 additions & 43 deletions icicle/include/icicle/fields/field.h
@@ -160,49 +160,48 @@ class Field
Wide out{};
#ifdef __CUDA_ARCH__
UNROLL
#else
#pragma unroll
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i];
return out;
}

static constexpr Field HOST_DEVICE_INLINE get_lower(const Wide& xs)
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i];
return out;
}

static constexpr Field HOST_DEVICE_INLINE get_higher(const Wide& xs)
// WARNING: taking views is zero copy but unsafe
constexpr const Field& get_lower_view() const { return *reinterpret_cast<const Field*>(limbs_storage.limbs); }
constexpr const Field& get_higher_view() const
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++)
out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i + TLC];
return out;
return *reinterpret_cast<const Field*>(limbs_storage.limbs + TLC);
}
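// A generic, standalone illustration of the zero-copy view pattern above
// (assumed semantics, simplified types, not part of this diff; assumes <cstdint>):
struct NarrowSketch { uint32_t limbs[4]; };
struct WideSketch {
  uint32_t limbs[8];
  const NarrowSketch& lower_view() const { return *reinterpret_cast<const NarrowSketch*>(limbs); }
  const NarrowSketch& higher_view() const { return *reinterpret_cast<const NarrowSketch*>(limbs + 4); }
};
// "Unsafe" because a view aliases the owner's storage: it dangles once the
// owning object dies, and reinterpret_cast sidesteps strict-aliasing rules;
// the payoff is skipping a limb copy on hot paths such as reduce() below.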

// This is not zero copy
static constexpr Field HOST_DEVICE_INLINE get_higher_with_slack(const Wide& xs)
{
Field out{};
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 0; i < TLC; i++) {
#ifdef __CUDA_ARCH__
out.limbs_storage.limbs[i] =
__funnelshift_lc(xs.limbs_storage.limbs[i + TLC - 1], xs.limbs_storage.limbs[i + TLC], 2 * slack_bits);
}
#else
out.limbs_storage.limbs[i] = (xs.limbs_storage.limbs[i + TLC] << 2 * slack_bits) +
(xs.limbs_storage.limbs[i + TLC - 1] >> (32 - 2 * slack_bits));
#endif
// CPU: for even number of limbs, read and shift 64b limbs, otherwise 32b
if constexpr (TLC % 2 == 0) {
#pragma unroll
for (unsigned i = 0; i < TLC / 2; i++) { // Ensure valid indexing
out.limbs_storage.limbs64[i] = (xs.limbs_storage.limbs64[i + TLC / 2] << 2 * slack_bits) |
(xs.limbs_storage.limbs64[i + TLC / 2 - 1] >> (64 - 2 * slack_bits));
}
} else {
#pragma unroll
for (unsigned i = 0; i < TLC; i++) { // Ensure valid indexing
out.limbs_storage.limbs[i] = (xs.limbs_storage.limbs[i + TLC] << 2 * slack_bits) +
(xs.limbs_storage.limbs[i + TLC - 1] >> (32 - 2 * slack_bits));
}
}
#endif

return out;
}
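// Standalone demo of the 64-bit two-limb funnel shift above (illustrative
// values, not part of this diff): read the high half pre-shifted left by
// 2*slack_bits by combining adjacent limbs.
#include <cstdint>
#include <cstdio>
int main()
{
  const unsigned slack_bits = 1; // assume 2*slack_bits = 2 for this demo
  uint64_t lo = 0xC000000000000000ull, hi = 0x0123456789ABCDEFull;
  uint64_t out = (hi << (2 * slack_bits)) | (lo >> (64 - 2 * slack_bits));
  printf("%016llx\n", (unsigned long long)out); // prints 048d159e26af37bf
  return 0;
}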

@@ -440,7 +439,7 @@
}

static DEVICE_INLINE uint32_t
mul_n_and_add(uint32_t* acc, const uint32_t* a, uint32_t bi, uint32_t* extra, size_t n = (TLC >> 1))
mul_n_and_add(uint32_t* acc, const uint32_t* a, uint32_t bi, const uint32_t* extra, size_t n = (TLC >> 1))
{
acc[0] = ptx::mad_lo_cc(a[0], bi, extra[0]);

@@ -505,12 +504,12 @@
* limb products are included.
*/
static DEVICE_INLINE void
multiply_and_add_lsb_neg_modulus_raw_device(const ff_storage& as, ff_storage& cs, ff_storage& rs)
multiply_and_add_lsb_neg_modulus_raw_device(const ff_storage& as, const ff_storage& cs, ff_storage& rs)
{
ff_storage bs = get_neg_modulus();
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
uint32_t* c = cs.limbs;
const uint32_t* c = cs.limbs;
uint32_t* even = rs.limbs;

if constexpr (TLC > 2) {
@@ -674,15 +673,15 @@
}

static HOST_DEVICE_INLINE void
multiply_and_add_lsb_neg_modulus_raw(const ff_storage& as, ff_storage& cs, ff_storage& rs)
multiply_and_add_lsb_neg_modulus_raw(const ff_storage& as, const ff_storage& cs, ff_storage& rs)
{
#ifdef __CUDA_ARCH__
return multiply_and_add_lsb_neg_modulus_raw_device(as, cs, rs);
#else
Wide r_wide = {};
host_math::template multiply_raw<TLC>(as, get_neg_modulus(), r_wide.limbs_storage);
Field r = Wide::get_lower(r_wide);
add_limbs<TLC, false>(cs, r.limbs_storage, rs);
const Field& r_low_view = r_wide.get_lower_view();
add_limbs<TLC, false>(cs, r_low_view.limbs_storage, rs);
#endif
}

@@ -806,16 +805,17 @@
Field xs_hi = Wide::get_higher_with_slack(xs);
Wide l = {};
multiply_msb_raw(xs_hi.limbs_storage, get_m(), l.limbs_storage); // MSB mult by `m`
Field l_hi = Wide::get_higher(l);
// Note: taking views is zero copy but unsafe
const Field& l_hi = l.get_higher_view();
const Field& xs_lo = xs.get_lower_view();
Field r = {};
Field xs_lo = Wide::get_lower(xs);
// Here we need to compute the lsb of `xs - l \cdot p` and to make use of fused multiply-and-add, we rewrite it as
// `xs + l \cdot (2^{32 \cdot TLC}-p)` which is the same as original (up to higher limbs which we don't care about).
multiply_and_add_lsb_neg_modulus_raw(l_hi.limbs_storage, xs_lo.limbs_storage, r.limbs_storage);
ff_storage r_reduced = {};
uint32_t carry = 0;
// As mentioned, either 2 or 1 reduction can be performed depending on the field in question.
if (num_of_reductions() == 2) {
if constexpr (num_of_reductions() == 2) {
carry = sub_limbs<TLC, true>(r.limbs_storage, get_modulus<2>(), r_reduced);
if (carry == 0) r = Field{r_reduced};
}
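// Why the rewrite in the comment above is sound: only the low TLC limbs of
// the result are kept, i.e. arithmetic is mod 2^(32*TLC). Since
// l * 2^(32*TLC) == 0 (mod 2^(32*TLC)), we have
//   xs - l*p == xs + l*(2^(32*TLC) - p)   (mod 2^(32*TLC)),
// so the precomputed negated modulus turns the subtraction into an addition
// that fuses with the multiplication (multiply_and_add_lsb_neg_modulus_raw).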
@@ -827,6 +827,7 @@

HOST_DEVICE Field& operator=(Field const& other)
{
#pragma unroll
for (int i = 0; i < TLC; i++) {
this->limbs_storage.limbs[i] = other.limbs_storage.limbs[i];
}
@@ -850,6 +851,7 @@
limbs_or |= x[i] ^ y[i];
return limbs_or == 0;
#else
#pragma unroll
for (unsigned i = 0; i < TLC; i++)
if (xs.limbs_storage.limbs[i] != ys.limbs_storage.limbs[i]) return false;
return true;
@@ -861,16 +863,18 @@
template <const Field& multiplier>
static HOST_DEVICE_INLINE Field mul_const(const Field& xs)
{
Field mul = multiplier;
static bool is_u32 = true;
#ifdef __CUDA_ARCH__
UNROLL
#endif
for (unsigned i = 1; i < TLC; i++)
is_u32 &= (mul.limbs_storage.limbs[i] == 0);

if (is_u32) return mul_unsigned<multiplier.limbs_storage.limbs[0], Field>(xs);
return mul * xs;
constexpr bool is_u32 = []() {
bool is_u32 = true;
for (unsigned i = 1; i < TLC; i++)
is_u32 &= (multiplier.limbs_storage.limbs[i] == 0);
return is_u32;
}();

if constexpr (is_u32) return mul_unsigned<multiplier.limbs_storage.limbs[0], Field>(xs);

// This is not really a copy but required for CUDA compilation since the template param is not in the device memory
Field mult = multiplier;
return mult * xs;
}

template <uint32_t multiplier, class T, unsigned REDUCTION_SIZE = 1>
@@ -881,6 +885,8 @@
bool is_zero = true;
#ifdef __CUDA_ARCH__
UNROLL
#else
#pragma unroll
#endif
for (unsigned i = 0; i < 32; i++) {
if (multiplier & (1 << i)) {
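The shift-and-add loop at the end of this file is the core of mul_unsigned; a hedged standalone rendering (simplified to plain integers, hypothetical helper name) shows why a compile-time multiplier makes it cheap: every bit test folds away at compile time, leaving only the adds for set bits.

#include <cstdint>

template <uint32_t MULT>
uint64_t mul_unsigned_sketch(uint64_t x)
{
  uint64_t result = 0;
#pragma unroll
  for (unsigned i = 0; i < 32; i++) {
    if (MULT & (1u << i)) result += (x << i); // double-and-add
  }
  return result; // e.g. MULT = 5 compiles down to x + (x << 2)
}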
10 changes: 10 additions & 0 deletions icicle/include/icicle/fields/host_math.h
@@ -132,6 +132,7 @@ namespace host_math {
{
uint32_t carry = 0;
carry_chain<NLIMBS, false, CARRY_OUT> chain;
#pragma unroll
for (unsigned i = 0; i < NLIMBS; i++)
r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
return CARRY_OUT ? carry : 0;
@@ -142,6 +143,7 @@
{
uint64_t carry = 0;
carry_chain<NLIMBS, false, CARRY_OUT> chain;
#pragma unroll
for (unsigned i = 0; i < NLIMBS / 2; i++)
r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
return CARRY_OUT ? carry : 0;
@@ -178,8 +180,10 @@
const uint32_t* a = as.limbs;
const uint32_t* b = bs.limbs;
uint32_t* r = rs.limbs;
#pragma unroll
for (unsigned i = 0; i < NLIMBS_B; i++) {
uint32_t carry = 0;
#pragma unroll
for (unsigned j = 0; j < NLIMBS_A; j++)
r[j + i] = host_math::madc_cc(a[j], b[i], r[j + i], carry);
r[NLIMBS_A + i] = carry;
@@ -189,8 +193,10 @@
template <unsigned NLIMBS_A, unsigned NLIMBS_B = NLIMBS_A>
static HOST_INLINE void multiply_raw_64(const uint64_t* a, const uint64_t* b, uint64_t* r)
{
#pragma unroll
for (unsigned i = 0; i < NLIMBS_B / 2; i++) {
uint64_t carry = 0;
#pragma unroll
for (unsigned j = 0; j < NLIMBS_A / 2; j++)
r[j + i] = host_math::madc_cc_64(a[j], b[i], r[j + i], carry);
r[NLIMBS_A / 2 + i] = carry;
@@ -247,6 +253,7 @@
storage<NLIMBS> out{};
if constexpr (LIMBS_GAP < NLIMBS) {
out.limbs[LIMBS_GAP] = xs.limbs[0] << BITS32;
#pragma unroll
for (unsigned i = 1; i < NLIMBS - LIMBS_GAP; i++)
out.limbs[i + LIMBS_GAP] = (xs.limbs[i] << BITS32) + (xs.limbs[i - 1] >> (32 - BITS32));
}
@@ -264,6 +271,7 @@
constexpr unsigned LIMBS_GAP = BITS / 32;
storage<NLIMBS> out{};
if constexpr (LIMBS_GAP < NLIMBS - 1) {
#pragma unroll
for (unsigned i = 0; i < NLIMBS - LIMBS_GAP - 1; i++)
out.limbs[i] = (xs.limbs[i + LIMBS_GAP] >> BITS32) + (xs.limbs[i + LIMBS_GAP + 1] << (32 - BITS32));
}
@@ -281,7 +289,9 @@
const storage<NLIMBS_NUM>& num, const storage<NLIMBS_DENOM>& denom, storage<NLIMBS_Q>& q, storage<NLIMBS_DENOM>& r)
{
storage<NLIMBS_DENOM> temp = {};
#pragma unroll
for (int limb_idx = NLIMBS_NUM - 1; limb_idx >= 0; limb_idx--) {
#pragma unroll
for (int bit_idx = 31; bit_idx >= 0; bit_idx--) {
r = left_shift<NLIMBS_DENOM, 1>(r);
r.limbs[0] |= ((num.limbs[limb_idx] >> bit_idx) & 1);
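The hunks above unroll a schoolbook multiply; condensed into a standalone sketch (plain C++ with 64-bit intermediates standing in for the madc_cc helpers, hypothetical name):

#include <cstdint>

template <unsigned NLIMBS>
void multiply_raw_sketch(const uint32_t* a, const uint32_t* b, uint32_t* r) // r: 2*NLIMBS limbs, zero-initialized
{
  for (unsigned i = 0; i < NLIMBS; i++) {
    uint32_t carry = 0;
    for (unsigned j = 0; j < NLIMBS; j++) {
      const uint64_t t = uint64_t(a[j]) * b[i] + r[j + i] + carry; // multiply-add with carry-in
      r[j + i] = uint32_t(t);    // low 32 bits stay in place
      carry = uint32_t(t >> 32); // high 32 bits ripple forward
    }
    r[NLIMBS + i] = carry; // top limb of this partial product
  }
}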
9 changes: 4 additions & 5 deletions icicle/include/icicle/utils/modifiers.h
@@ -16,13 +16,12 @@
#define DEVICE_INLINE __device__ INLINE_MACRO
#define HOST_DEVICE __host__ __device__
#define HOST_DEVICE_INLINE HOST_DEVICE INLINE_MACRO
#else // not CUDA
#define INLINE_MACRO
#else // not NVCC
#define UNROLL
#define HOST_INLINE
#define DEVICE_INLINE
#define HOST_DEVICE
#define HOST_DEVICE_INLINE
#define HOST_INLINE __attribute__((always_inline))
#define DEVICE_INLINE
#define HOST_DEVICE_INLINE HOST_INLINE
#define __host__
#define __device__
#endif
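On the host side, HOST_INLINE previously expanded to nothing, leaving inlining to the optimizer's heuristics; it now forces inlining of the hot field and EC helpers. A minimal illustration (GCC/Clang attribute, hypothetical function name):

#define HOST_INLINE_SKETCH __attribute__((always_inline)) inline

HOST_INLINE_SKETCH int fma32_sketch(int a, int b, int c) { return a * b + c; } // inlined at every call site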
