ingonyama-zk · yshekel · Aug 26, 2024 · Aug 26, 2024
diff --git a/icicle/include/icicle/fields/host_math.h b/icicle/include/icicle/fields/host_math.h
@@ -1,56 +1,75 @@
+// Note: this optimization generates invalid code (using gcc) when storage class has a union for both u32 and u64 so disabling it.
+#if defined(__GNUC__) && !defined(__NVCC__) && !defined(__clang__)
+  #pragma GCC optimize("no-strict-aliasing")
+#endif
+
 #pragma once
 
 #include <cstdint>
 #include "icicle/utils/modifiers.h"
 #include "icicle/fields/storage.h"
 namespace host_math {
 
-  // return x + y with uint32_t operands
-  static constexpr __host__ uint32_t add(const uint32_t x, const uint32_t y) { return x + y; }
+  // return x + y with T operands
+  template <typename T>
+  static constexpr __host__ T add(const T x, const T y)
+  {
+    return x + y;
+  }
 
-  // return x + y + carry with uint32_t operands
-  static constexpr __host__ uint32_t addc(const uint32_t x, const uint32_t y, const uint32_t carry)
+  // return x + y + carry with T operands
+  template <typename T>
+  static constexpr __host__ T addc(const T x, const T y, const T carry)
   {
     return x + y + carry;
   }
 
-  // return x + y and carry out with uint32_t operands
-  static constexpr __host__ uint32_t add_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
+  // return x + y and carry out with T operands
+  template <typename T>
+  static constexpr __host__ T add_cc(const T x, const T y, T& carry)
   {
-    uint32_t result = x + y;
+    T result = x + y;
     carry = x > result;
     return result;
   }
 
-  // return x + y + carry and carry out  with uint32_t operands
-  static constexpr __host__ uint32_t addc_cc(const uint32_t x, const uint32_t y, uint32_t& carry)
+  // return x + y + carry and carry out  with T operands
+  template <typename T>
+  static constexpr __host__ T addc_cc(const T x, const T y, T& carry)
   {
-    const uint32_t result = x + y + carry;
+    const T result = x + y + carry;
     carry = carry && x >= result || !carry && x > result;
     return result;
   }
 
-  // return x - y with uint32_t operands
-  static constexpr __host__ uint32_t sub(const uint32_t x, const uint32_t y) { return x - y; }
+  // return x - y with T operands
+  template <typename T>
+  static constexpr __host__ T sub(const T x, const T y)
+  {
+    return x - y;
+  }
 
-  //    return x - y - borrow with uint32_t operands
-  static constexpr __host__ uint32_t subc(const uint32_t x, const uint32_t y, const uint32_t borrow)
+  //    return x - y - borrow with T operands
+  template <typename T>
+  static constexpr __host__ T subc(const T x, const T y, const T borrow)
   {
     return x - y - borrow;
   }
 
-  //    return x - y and borrow out with uint32_t operands
-  static constexpr __host__ uint32_t sub_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
+  //    return x - y and borrow out with T operands
+  template <typename T>
+  static constexpr __host__ T sub_cc(const T x, const T y, T& borrow)
   {
-    uint32_t result = x - y;
+    T result = x - y;
     borrow = x < result;
     return result;
   }
 
-  //    return x - y - borrow and borrow out with uint32_t operands
-  static constexpr __host__ uint32_t subc_cc(const uint32_t x, const uint32_t y, uint32_t& borrow)
+  //    return x - y - borrow and borrow out with T operands
+  template <typename T>
+  static constexpr __host__ T subc_cc(const T x, const T y, T& borrow)
   {
-    const uint32_t result = x - y - borrow;
+    const T result = x - y - borrow;
     borrow = borrow && x <= result || !borrow && x < result;
     return result;
   }
@@ -64,13 +83,22 @@ namespace host_math {
     return result;
   }
 
+  static constexpr __host__ uint64_t madc_cc_64(const uint64_t x, const uint64_t y, const uint64_t z, uint64_t& carry)
+  {
+    __uint128_t r = static_cast<__uint128_t>(x) * y + z + carry;
+    carry = (uint64_t)(r >> 64);
+    uint64_t result = r & 0xffffffffffffffff;
+    return result;
+  }
+
   template <unsigned OPS_COUNT = UINT32_MAX, bool CARRY_IN = false, bool CARRY_OUT = false>
   struct carry_chain {
     unsigned index;
 
     constexpr HOST_INLINE carry_chain() : index(0) {}
 
-    constexpr HOST_INLINE uint32_t add(const uint32_t x, const uint32_t y, uint32_t& carry)
+    template <typename T>
+    constexpr HOST_INLINE T add(const T x, const T y, T& carry)
     {
       index++;
       if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
@@ -83,7 +111,8 @@ namespace host_math {
         return host_math::addc(x, y, carry);
     }
 
-    constexpr HOST_INLINE uint32_t sub(const uint32_t x, const uint32_t y, uint32_t& carry)
+    template <typename T>
+    constexpr HOST_INLINE T sub(const T x, const T y, T& carry)
     {
       index++;
       if (index == 1 && OPS_COUNT == 1 && !CARRY_IN && !CARRY_OUT)
@@ -97,9 +126,55 @@ namespace host_math {
     }
   };
 
+  template <unsigned NLIMBS, bool SUBTRACT, bool CARRY_OUT>
+  static constexpr HOST_INLINE uint32_t add_sub_limbs_32(const uint32_t* x, const uint32_t* y, uint32_t* r)
+  {
+    uint32_t carry = 0;
+    carry_chain<NLIMBS, false, CARRY_OUT> chain;
+    for (unsigned i = 0; i < NLIMBS; i++)
+      r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
+    return CARRY_OUT ? carry : 0;
+  }
+
+  template <unsigned NLIMBS, bool SUBTRACT, bool CARRY_OUT>
+  static constexpr HOST_INLINE uint64_t add_sub_limbs_64(const uint64_t* x, const uint64_t* y, uint64_t* r)
+  {
+    uint64_t carry = 0;
+    carry_chain<NLIMBS, false, CARRY_OUT> chain;
+    for (unsigned i = 0; i < NLIMBS / 2; i++)
+      r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
+    return CARRY_OUT ? carry : 0;
+  }
+
+  template <
+    unsigned NLIMBS,
+    bool SUBTRACT,
+    bool CARRY_OUT,
+    bool USE_32 = true> // for now we use only the 32 add/sub because the casting of the carry causes problems when
+                        // compiling in release. to solve this we need to entirely split the field functions between a
+                        // host version and a device version.
+  static constexpr HOST_INLINE uint32_t // 32 is enough for the carry
+  add_sub_limbs(const storage<NLIMBS>& xs, const storage<NLIMBS>& ys, storage<NLIMBS>& rs)
+  {
+    if constexpr (USE_32 || NLIMBS < 2) {
+      const uint32_t* x = xs.limbs;
+      const uint32_t* y = ys.limbs;
+      uint32_t* r = rs.limbs;
+      return add_sub_limbs_32<NLIMBS, SUBTRACT, CARRY_OUT>(x, y, r);
+    } else {
+      const uint64_t* x = xs.limbs64;
+      const uint64_t* y = ys.limbs64;
+      uint64_t* r = rs.limbs64;
+      // Note: returns uint64 but uint 32 is enough.
+      uint64_t result = add_sub_limbs_64<NLIMBS, SUBTRACT, CARRY_OUT>(x, y, r);
+      uint32_t carry = result == 1;
+      return carry;
+    }
+  }
+
   template <unsigned NLIMBS_A, unsigned NLIMBS_B = NLIMBS_A>
   static constexpr HOST_INLINE void
-  multiply_raw(const storage<NLIMBS_A>& as, const storage<NLIMBS_B>& bs, storage<NLIMBS_A + NLIMBS_B>& rs)
+  multiply_raw_32(const storage<NLIMBS_A>& as, const storage<NLIMBS_B>& bs, storage<NLIMBS_A + NLIMBS_B>& rs)
   {
     const uint32_t* a = as.limbs;
     const uint32_t* b = bs.limbs;
@@ -112,18 +187,54 @@ namespace host_math {
     }
   }
 
-  template <unsigned NLIMBS, bool SUBTRACT, bool CARRY_OUT>
-  static constexpr HOST_INLINE uint32_t
-  add_sub_limbs(const storage<NLIMBS>& xs, const storage<NLIMBS>& ys, storage<NLIMBS>& rs)
+  template <unsigned NLIMBS_A, unsigned NLIMBS_B = NLIMBS_A>
+  static HOST_INLINE void multiply_raw_64(const uint64_t* a, const uint64_t* b, uint64_t* r)
   {
-    const uint32_t* x = xs.limbs;
-    const uint32_t* y = ys.limbs;
-    uint32_t* r = rs.limbs;
-    uint32_t carry = 0;
-    carry_chain<NLIMBS, false, CARRY_OUT> chain;
-    for (unsigned i = 0; i < NLIMBS; i++)
-      r[i] = SUBTRACT ? chain.sub(x[i], y[i], carry) : chain.add(x[i], y[i], carry);
-    return CARRY_OUT ? carry : 0;
+    for (unsigned i = 0; i < NLIMBS_B / 2; i++) {
+      uint64_t carry = 0;
+      for (unsigned j = 0; j < NLIMBS_A / 2; j++)
+        r[j + i] = host_math::madc_cc_64(a[j], b[i], r[j + i], carry);
+      r[NLIMBS_A / 2 + i] = carry;
+    }
+  }
+
+  template <unsigned NLIMBS_A, unsigned NLIMBS_B = NLIMBS_A>
+  static HOST_INLINE void
+  multiply_raw_64(const storage<NLIMBS_A>& as, const storage<NLIMBS_B>& bs, storage<NLIMBS_A + NLIMBS_B>& rs)
+  {
+    const uint64_t* a = as.limbs64;
+    const uint64_t* b = bs.limbs64;
+    uint64_t* r = rs.limbs64;
+    multiply_raw_64<NLIMBS_A, NLIMBS_B>(a, b, r);
+  }
+
+  template <unsigned NLIMBS_A, unsigned NLIMBS_B = NLIMBS_A, bool USE_32 = false>
+  static constexpr HOST_INLINE void
+  multiply_raw(const storage<NLIMBS_A>& as, const storage<NLIMBS_B>& bs, storage<NLIMBS_A + NLIMBS_B>& rs)
+  {
+    static_assert(
+      (NLIMBS_A % 2 == 0 || NLIMBS_A == 1) && (NLIMBS_B % 2 == 0 || NLIMBS_B == 1),
+      "odd number of limbs is not supported\n");
+    if constexpr (USE_32) {
+      multiply_raw_32<NLIMBS_A, NLIMBS_B>(as, bs, rs);
+      return;
+    } else if constexpr ((NLIMBS_A == 1 && NLIMBS_B == 2) || (NLIMBS_A == 2 && NLIMBS_B == 1)) {
+      multiply_raw_32<NLIMBS_A, NLIMBS_B>(as, bs, rs);
+      return;
+    } else if constexpr (NLIMBS_A == 1 && NLIMBS_B == 1) {
+      rs.limbs[1] = 0;
+      rs.limbs[0] = host_math::madc_cc(as.limbs[0], bs.limbs[0], 0, rs.limbs[1]);
+      return;
+    } else if constexpr (NLIMBS_A == 2 && NLIMBS_B == 2) {
+      const uint64_t* a = as.limbs64; // nof limbs should be even
+      const uint64_t* b = bs.limbs64;
+      uint64_t* r = rs.limbs64;
+      r[1] = 0;
+      r[0] = host_math::madc_cc_64(a[0], b[0], 0, r[1]);
+      return;
+    } else {
+      multiply_raw_64<NLIMBS_A, NLIMBS_B>(as, bs, rs);
+    }
   }
 
   template <unsigned NLIMBS, unsigned BITS>
@@ -162,7 +273,11 @@ namespace host_math {
     }
   }
 
-  template <unsigned NLIMBS_NUM, unsigned NLIMBS_DENOM, unsigned NLIMBS_Q = (NLIMBS_NUM - NLIMBS_DENOM)>
+  template <
+    unsigned NLIMBS_NUM,
+    unsigned NLIMBS_DENOM,
+    unsigned NLIMBS_Q = (NLIMBS_NUM - NLIMBS_DENOM),
+    bool USE_32 = false>
   static constexpr HOST_INLINE void integer_division(
     const storage<NLIMBS_NUM>& num, const storage<NLIMBS_DENOM>& denom, storage<NLIMBS_Q>& q, storage<NLIMBS_DENOM>& r)
   {
@@ -171,12 +286,16 @@ namespace host_math {
       for (int bit_idx = 31; bit_idx >= 0; bit_idx--) {
         r = left_shift<NLIMBS_DENOM, 1>(r);
         r.limbs[0] |= ((num.limbs[limb_idx] >> bit_idx) & 1);
-        uint32_t c = add_sub_limbs<NLIMBS_DENOM, true, true>(r, denom, temp);
-        if ((limb_idx < NLIMBS_Q) & !c) {
+        uint32_t c = add_sub_limbs<NLIMBS_DENOM, true, true, USE_32>(r, denom, temp);
+        if (limb_idx < NLIMBS_Q & !c) {
           r = temp;
           q.limbs[limb_idx] |= 1 << bit_idx;
         }
       }
     }
   }
 } // namespace host_math
+
+#if defined(__GNUC__) && !defined(__NVCC__) && !defined(__clang__)
+  #pragma GCC reset_options
+#endif
diff --git a/icicle/include/icicle/fields/params_gen.h b/icicle/include/icicle/fields/params_gen.h
@@ -8,7 +8,7 @@ namespace params_gen {
   static constexpr HOST_INLINE storage<2 * NLIMBS> get_square(const storage<NLIMBS>& xs)
   {
     storage<2 * NLIMBS> rs = {};
-    host_math::template multiply_raw<NLIMBS>(xs, xs, rs);
+    host_math::template multiply_raw<NLIMBS, NLIMBS, true>(xs, xs, rs);
     return host_math::template left_shift<2 * NLIMBS, BIT_SHIFT>(rs);
   }
 
@@ -17,7 +17,7 @@ namespace params_gen {
   get_difference_no_carry(const storage<NLIMBS>& xs, const storage<NLIMBS>& ys)
   {
     storage<NLIMBS> rs = {};
-    host_math::template add_sub_limbs<NLIMBS, true, false>(xs, ys, rs);
+    host_math::template add_sub_limbs<NLIMBS, true, false, true>(xs, ys, rs);
     return rs;
   }
 
@@ -28,7 +28,7 @@ namespace params_gen {
     storage<NLIMBS> qs = {};
     storage<2 * NLIMBS> wide_one = {1};
     storage<2 * NLIMBS> pow_of_2 = host_math::template left_shift<2 * NLIMBS, EXP>(wide_one);
-    host_math::template integer_division<2 * NLIMBS, NLIMBS>(pow_of_2, modulus, qs, rs);
+    host_math::template integer_division<2 * NLIMBS, NLIMBS, NLIMBS, true>(pow_of_2, modulus, qs, rs);
     return qs;
   }
 
@@ -38,12 +38,12 @@ namespace params_gen {
     storage<NLIMBS> rs = {1};
     for (int i = 0; i < 32 * NLIMBS; i++) {
       if (INV) {
-        if (rs.limbs[0] & 1) host_math::template add_sub_limbs<NLIMBS, false, false>(rs, modulus, rs);
+        if (rs.limbs[0] & 1) host_math::template add_sub_limbs<NLIMBS, false, false, true>(rs, modulus, rs);
         rs = host_math::template right_shift<NLIMBS, 1>(rs);
       } else {
         rs = host_math::template left_shift<NLIMBS, 1>(rs);
         storage<NLIMBS> temp = {};
-        rs = host_math::template add_sub_limbs<NLIMBS, true, true>(rs, modulus, temp) ? rs : temp;
+        rs = host_math::template add_sub_limbs<NLIMBS, true, true, true>(rs, modulus, temp) ? rs : temp;
       }
     }
     return rs;
@@ -57,12 +57,12 @@ namespace params_gen {
     storage<2 * NLIMBS> x1 = {};
     storage<3 * NLIMBS> x2 = {};
     storage<3 * NLIMBS> x3 = {};
-    host_math::template multiply_raw<NLIMBS>(modulus, m, x1);
-    host_math::template multiply_raw<NLIMBS, 2 * NLIMBS>(modulus, x1, x2);
+    host_math::template multiply_raw<NLIMBS, NLIMBS, true>(modulus, m, x1);
+    host_math::template multiply_raw<NLIMBS, 2 * NLIMBS, true>(modulus, x1, x2);
     storage<2 * NLIMBS> one = {1};
     storage<2 * NLIMBS> pow_of_2 = host_math::template left_shift<2 * NLIMBS, NBITS>(one);
-    host_math::template multiply_raw<NLIMBS, 2 * NLIMBS>(modulus, pow_of_2, x3);
-    host_math::template add_sub_limbs<3 * NLIMBS, true, false>(x3, x2, x2);
+    host_math::template multiply_raw<NLIMBS, 2 * NLIMBS, true>(modulus, pow_of_2, x3);
+    host_math::template add_sub_limbs<3 * NLIMBS, true, false, true>(x3, x2, x2);
     double err = (double)x2.limbs[2 * NLIMBS - 1] / pow_of_2.limbs[2 * NLIMBS - 1];
     err += (double)m.limbs[NLIMBS - 1] / 0xffffffff;
     err += (double)NLIMBS / 0x80000000;
@@ -87,7 +87,7 @@ namespace params_gen {
     storage_array<TWO_ADICITY, NLIMBS> invs = {};
     storage<NLIMBS> rs = {1};
     for (int i = 0; i < TWO_ADICITY; i++) {
-      if (rs.limbs[0] & 1) host_math::template add_sub_limbs<NLIMBS, false, false>(rs, modulus, rs);
+      if (rs.limbs[0] & 1) host_math::template add_sub_limbs<NLIMBS, false, false, true>(rs, modulus, rs);
       rs = host_math::template right_shift<NLIMBS, 1>(rs);
       invs.storages[i] = rs;
     }