From 3aa67acb7c4787cd212b03cc44146b177347a578 Mon Sep 17 00:00:00 2001
From: Hang Su
Date: Sat, 9 Nov 2024 11:37:31 -0500
Subject: [PATCH 1/6] move out peripheral changes from orion impl branch

---
 arith/gf2/src/gf2x64.rs                       | 301 ++++++++++++++++++
 arith/gf2/src/gf2x8.rs                        |  13 +-
 arith/gf2/src/lib.rs                          |   3 +
 arith/gf2/src/tests.rs                        |  22 +-
 arith/gf2_128/src/gf2_ext128/avx.rs           |  56 +++-
 arith/gf2_128/src/gf2_ext128/neon.rs          |  51 ++-
 arith/gf2_128/src/gf2_ext128x8/avx256.rs      |  14 +-
 arith/gf2_128/src/gf2_ext128x8/avx512.rs      |  16 +-
 arith/gf2_128/src/gf2_ext128x8/neon.rs        |  17 +-
 arith/gf2_128/src/tests.rs                    |   3 +-
 arith/mersenne31/src/m31_ext.rs               |   9 +
 arith/mersenne31/src/m31_ext3x16.rs           |  16 +-
 arith/mersenne31/src/m31x16/m31_avx256.rs     |   5 +-
 arith/mersenne31/src/m31x16/m31_avx512.rs     |   5 +-
 arith/mersenne31/src/m31x16/m31_neon.rs       |   5 +-
 arith/polynomials/src/mle.rs                  |   8 +-
 arith/src/bn254.rs                            |   5 +-
 arith/src/extension_field.rs                  |   6 +-
 arith/src/simd_field.rs                       |   5 +-
 config/src/gkr_config.rs                      |   2 +-
 transcript/src/fiat_shamir_hash.rs            |   6 +-
 transcript/src/fiat_shamir_hash/keccak_256.rs |   2 +-
 transcript/src/transcript.rs                  |  16 +-
 23 files changed, 520 insertions(+), 66 deletions(-)
 create mode 100644 arith/gf2/src/gf2x64.rs

diff --git a/arith/gf2/src/gf2x64.rs b/arith/gf2/src/gf2x64.rs
new file mode 100644
index 00000000..a03b594c
--- /dev/null
+++ b/arith/gf2/src/gf2x64.rs
@@ -0,0 +1,301 @@
+use std::ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign};
+
+use arith::{Field, FieldSerde, FieldSerdeResult, SimdField};
+
+use super::GF2;
+
+#[derive(Debug, Clone, Copy, Default, PartialEq)]
+pub struct GF2x64 {
+    pub v: u64,
+}
+
+impl FieldSerde for GF2x64 {
+    const SERIALIZED_SIZE: usize = 8;
+
+    #[inline(always)]
+    fn serialize_into<W: std::io::Write>(&self, mut writer: W) -> FieldSerdeResult<()> {
+        writer.write_all(self.v.to_le_bytes().as_ref())?;
+        Ok(())
+    }
+
+    #[inline(always)]
+    fn deserialize_from<R: std::io::Read>(mut reader: R) -> FieldSerdeResult<Self> {
+        let mut u = [0u8; Self::SERIALIZED_SIZE];
+        reader.read_exact(&mut u)?;
+        Ok(GF2x64 {
+            v: u64::from_le_bytes(u),
+        })
+    }
+
+    #[inline]
+    fn try_deserialize_from_ecc_format<R: std::io::Read>(_reader: R) -> FieldSerdeResult<Self> {
+        unimplemented!("We don't have serialization in ecc for gf2x64")
+    }
+}
+
+impl Field for GF2x64 {
+    const NAME: &'static str = "Galois Field 2 SIMD 64";
+
+    const SIZE: usize = 8;
+
+    const FIELD_SIZE: usize = 1;
+
+    const ZERO: Self = GF2x64 { v: 0 };
+
+    const ONE: Self = GF2x64 { v: !0u64 };
+
+    const INV_2: Self = GF2x64 { v: 0 }; // NOTE: should not be used
+
+    #[inline(always)]
+    fn zero() -> Self {
+        GF2x64::ZERO
+    }
+
+    #[inline(always)]
+    fn one() -> Self {
+        GF2x64::ONE
+    }
+
+    #[inline(always)]
+    fn is_zero(&self) -> bool {
+        self.v == 0
+    }
+
+    #[inline(always)]
+    fn random_unsafe(mut rng: impl rand::RngCore) -> Self {
+        GF2x64 { v: rng.next_u64() }
+    }
+
+    #[inline(always)]
+    fn random_bool(mut rng: impl rand::RngCore) -> Self {
+        GF2x64 { v: rng.next_u64() }
+    }
+
+    #[inline(always)]
+    fn exp(&self, exponent: u128) -> Self {
+        if exponent % 2 == 0 {
+            Self::one()
+        } else {
+            *self
+        }
+    }
+
+    #[inline(always)]
+    fn inv(&self) -> Option<Self> {
+        unimplemented!()
+    }
+
+    #[inline(always)]
+    fn as_u32_unchecked(&self) -> u32 {
+        self.v as u32
+    }
+
+    #[inline(always)]
+    fn from_uniform_bytes(bytes: &[u8; 32]) -> Self {
+        let mut buf = [0u8; 8];
+        buf[..].copy_from_slice(&bytes[..8]);
+        GF2x64 {
+            v: u64::from_le_bytes(buf),
+        }
+    }
+
+    #[inline(always)]
+    fn mul_by_5(&self) -> Self {
+        *self
+    }
+
+    #[inline(always)]
+    fn mul_by_6(&self) -> Self {
+        Self::ZERO
+    }
+}
+
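+// Lane-wise GF(2) arithmetic on the packed bits: multiplication is bitwise
+// AND and addition/subtraction are bitwise XOR (every element is its own
+// additive inverse), which is exactly what the operator impls below compute.
+// For example, with a.v = 0b1100 and b.v = 0b1010:
+//   (a * b).v == 0b1000 and (a + b).v == 0b0110.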
+impl Mul<&GF2x64> for GF2x64 {
+    type Output = GF2x64;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn mul(self, rhs: &GF2x64) -> Self::Output {
+        GF2x64 { v: self.v & rhs.v }
+    }
+}
+
+impl Mul for GF2x64 {
+    type Output = GF2x64;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn mul(self, rhs: GF2x64) -> GF2x64 {
+        GF2x64 { v: self.v & rhs.v }
+    }
+}
+
+impl MulAssign<&GF2x64> for GF2x64 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn mul_assign(&mut self, rhs: &GF2x64) {
+        self.v &= rhs.v;
+    }
+}
+
+impl MulAssign for GF2x64 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn mul_assign(&mut self, rhs: GF2x64) {
+        self.v &= rhs.v;
+    }
+}
+
+impl Sub for GF2x64 {
+    type Output = GF2x64;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn sub(self, rhs: GF2x64) -> GF2x64 {
+        GF2x64 { v: self.v ^ rhs.v }
+    }
+}
+
+impl SubAssign for GF2x64 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn sub_assign(&mut self, rhs: GF2x64) {
+        self.v ^= rhs.v;
+    }
+}
+
+impl Add for GF2x64 {
+    type Output = GF2x64;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn add(self, rhs: GF2x64) -> GF2x64 {
+        GF2x64 { v: self.v ^ rhs.v }
+    }
+}
+
+impl AddAssign for GF2x64 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn add_assign(&mut self, rhs: GF2x64) {
+        self.v ^= rhs.v;
+    }
+}
+
+impl Add<&GF2x64> for GF2x64 {
+    type Output = GF2x64;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn add(self, rhs: &GF2x64) -> GF2x64 {
+        GF2x64 { v: self.v ^ rhs.v }
+    }
+}
+
+impl Sub<&GF2x64> for GF2x64 {
+    type Output = GF2x64;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn sub(self, rhs: &GF2x64) -> GF2x64 {
+        GF2x64 { v: self.v ^ rhs.v }
+    }
+}
+
+impl<T: std::borrow::Borrow<GF2x64>> std::iter::Sum<T> for GF2x64 {
+    fn sum<I: Iterator<Item = T>>(iter: I) -> Self {
+        iter.fold(Self::zero(), |acc, item| acc + item.borrow())
+    }
+}
+
+impl<T: std::borrow::Borrow<GF2x64>> std::iter::Product<T> for GF2x64 {
+    fn product<I: Iterator<Item = T>>(iter: I) -> Self {
+        iter.fold(Self::one(), |acc, item| acc * item.borrow())
+    }
+}
+
+impl Neg for GF2x64 {
+    type Output = GF2x64;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn neg(self) -> GF2x64 {
+        GF2x64 { v: self.v }
+    }
+}
+
+impl AddAssign<&GF2x64> for GF2x64 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn add_assign(&mut self, rhs: &GF2x64) {
+        self.v ^= rhs.v;
+    }
+}
+
+impl SubAssign<&GF2x64> for GF2x64 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn sub_assign(&mut self, rhs: &GF2x64) {
+        self.v ^= rhs.v;
+    }
+}
+
+impl From<u32> for GF2x64 {
+    #[inline(always)]
+    fn from(v: u32) -> Self {
+        assert!(v < 2);
+        if v == 0 {
+            GF2x64 { v: 0 }
+        } else {
+            GF2x64 { v: !0u64 }
+        }
+    }
+}
+
+impl From<GF2> for GF2x64 {
+    #[inline(always)]
+    fn from(v: GF2) -> Self {
+        assert!(v.v < 2);
+        if v.v == 0 {
+            GF2x64 { v: 0 }
+        } else {
+            GF2x64 { v: !0u64 }
+        }
+    }
+}
+
+impl SimdField for GF2x64 {
+    #[inline(always)]
+    fn scale(&self, challenge: &Self::Scalar) -> Self {
+        if challenge.v == 0 {
+            Self::zero()
+        } else {
+            *self
+        }
+    }
+
+    #[inline(always)]
+    fn pack(base_vec: &[Self::Scalar]) -> Self {
+        assert!(base_vec.len() == Self::PACK_SIZE);
+        let mut ret = 0u64;
+        for (i, scalar) in base_vec.iter().enumerate() {
+            ret |= (scalar.v as u64) << (Self::PACK_SIZE - 1 - i);
+        }
+        Self { v: ret }
+    }
+
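+    // Bit order is MSB-first: base_vec[0] lands in bit 63 and base_vec[63]
+    // in bit 0. unpack() below reads the bits back in the same order, so
+    // unpack(pack(bits)) == bits for any 64 GF2 values.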
+    #[inline(always)]
+    fn unpack(&self) -> Vec<Self::Scalar> {
+        let mut ret = vec![];
+        for i in 0..Self::PACK_SIZE {
+            ret.push(Self::Scalar {
+                v: ((self.v >> (Self::PACK_SIZE - 1 - i)) & 1u64) as u8,
+            });
+        }
+        ret
+    }
+
+    type Scalar = crate::GF2;
+
+    const PACK_SIZE: usize = 64;
+}
diff --git a/arith/gf2/src/gf2x8.rs b/arith/gf2/src/gf2x8.rs
index 8bb9e057..56ad8064 100644
--- a/arith/gf2/src/gf2x8.rs
+++ b/arith/gf2/src/gf2x8.rs
@@ -38,7 +38,7 @@ impl FieldSerde for GF2x8 {
 
 impl Field for GF2x8 {
     // still will pack 8 bits into a u8
-    const NAME: &'static str = "Galios Field 2 SIMD";
+    const NAME: &'static str = "Galois Field 2 SIMD 8";
 
     const SIZE: usize = 1;
 
@@ -278,14 +278,9 @@ impl SimdField for GF2x8 {
         }
     }
 
-    #[inline(always)]
-    fn pack_size() -> usize {
-        8
-    }
-
     #[inline(always)]
     fn pack(base_vec: &[Self::Scalar]) -> Self {
-        assert!(base_vec.len() == Self::pack_size());
+        assert!(base_vec.len() == Self::PACK_SIZE);
         let mut ret = 0u8;
         for (i, scalar) in base_vec.iter().enumerate() {
             ret |= scalar.v << (7 - i);
@@ -296,7 +291,7 @@ impl SimdField for GF2x8 {
     #[inline(always)]
     fn unpack(&self) -> Vec<Self::Scalar> {
         let mut ret = vec![];
-        for i in 0..Self::pack_size() {
+        for i in 0..Self::PACK_SIZE {
             ret.push(Self::Scalar {
                 v: (self.v >> (7 - i)) & 1u8,
             });
@@ -305,4 +300,6 @@ impl SimdField for GF2x8 {
     }
 
     type Scalar = crate::GF2;
+
+    const PACK_SIZE: usize = 8;
 }
diff --git a/arith/gf2/src/lib.rs b/arith/gf2/src/lib.rs
index 5a4dcaf0..94edd4a7 100644
--- a/arith/gf2/src/lib.rs
+++ b/arith/gf2/src/lib.rs
@@ -6,5 +6,8 @@ pub use gf2::GF2;
 mod gf2x8;
 pub use gf2x8::GF2x8;
 
+mod gf2x64;
+pub use gf2x64::GF2x64;
+
 #[cfg(test)]
 mod tests;
diff --git a/arith/gf2/src/tests.rs b/arith/gf2/src/tests.rs
index aec115b3..f3364700 100644
--- a/arith/gf2/src/tests.rs
+++ b/arith/gf2/src/tests.rs
@@ -1,9 +1,11 @@
 use ark_std::test_rng;
 use std::io::Cursor;
 
-use arith::{random_field_tests, random_inversion_tests, random_simd_field_tests, FieldSerde};
+use arith::{
+    random_field_tests, random_inversion_tests, random_simd_field_tests, Field, FieldSerde,
+};
 
-use crate::{GF2x8, GF2};
+use crate::{GF2x64, GF2x8, GF2};
 
 #[test]
 fn test_field() {
@@ -17,16 +19,24 @@ fn test_field() {
 fn test_simd_field() {
     random_field_tests::<GF2x8>("Vectorized GF2".to_string());
     random_simd_field_tests::<GF2x8>("Vectorized GF2".to_string());
+
+    random_field_tests::<GF2x64>("Vectorized GF2 len 64".to_string());
+    random_simd_field_tests::<GF2x64>("Vectorized GF2 len 64".to_string());
 }
 
-#[test]
-fn test_custom_serde_vectorize_gf2() {
-    let a = GF2x8::from(0);
+fn custom_serde_vectorize_gf2<F: Field + FieldSerde>() {
+    let a = F::from(0);
     let mut buffer = vec![];
     assert!(a.serialize_into(&mut buffer).is_ok());
     let mut cursor = Cursor::new(buffer);
-    let b = GF2x8::deserialize_from(&mut cursor);
+    let b = F::deserialize_from(&mut cursor);
     assert!(b.is_ok());
     let b = b.unwrap();
     assert_eq!(a, b);
 }
+
+#[test]
+fn test_custom_serde_vectorize_gf2() {
+    custom_serde_vectorize_gf2::<GF2x8>();
+    custom_serde_vectorize_gf2::<GF2x64>()
+}
diff --git a/arith/gf2_128/src/gf2_ext128/avx.rs b/arith/gf2_128/src/gf2_ext128/avx.rs
index 10089cb2..97403998 100644
--- a/arith/gf2_128/src/gf2_ext128/avx.rs
+++ b/arith/gf2_128/src/gf2_ext128/avx.rs
@@ -5,9 +5,8 @@ use std::{
     ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign},
 };
 
-use arith::{field_common, ExtensionField, Field, FieldSerde, FieldSerdeResult};
-
-use gf2::GF2;
+use arith::{field_common, ExtensionField, Field, FieldSerde, FieldSerdeResult, SimdField};
+use gf2::{GF2x64, GF2};
 
 #[derive(Debug, Clone, Copy)]
 pub struct AVXGF2_128 {
@@ -21,7 +20,9 @@ impl FieldSerde for AVXGF2_128 {
 
     #[inline(always)]
     fn serialize_into<W: std::io::Write>(&self, mut writer: W) -> FieldSerdeResult<()> {
-        unsafe { writer.write_all(transmute::<__m128i, [u8; 16]>(self.v).as_ref())? };
+        unsafe {
+            writer.write_all(transmute::<__m128i, [u8; Self::SERIALIZED_SIZE]>(self.v).as_ref())?
+        };
         Ok(())
     }
 
@@ -208,6 +209,15 @@ impl ExtensionField for AVXGF2_128 {
     }
 }
 
+impl Mul<GF2> for AVXGF2_128 {
+    type Output = AVXGF2_128;
+
+    #[inline(always)]
+    fn mul(self, rhs: GF2) -> Self::Output {
+        self.mul_by_base_field(&rhs)
+    }
+}
+
 impl From<GF2> for AVXGF2_128 {
     #[inline(always)]
     fn from(v: GF2) -> Self {
@@ -318,3 +328,41 @@ fn mul_internal(a: &AVXGF2_128, b: &AVXGF2_128) -> AVXGF2_128 {
         v: unsafe { gfmul(a.v, b.v) },
     }
 }
+
+impl SimdField for AVXGF2_128 {
+    type Scalar = GF2;
+
+    const PACK_SIZE: usize = 128;
+
+    #[inline(always)]
+    fn scale(&self, challenge: &Self::Scalar) -> Self {
+        if challenge.v == 0 {
+            Self::ZERO
+        } else {
+            *self
+        }
+    }
+
+    #[inline(always)]
+    fn pack(base_vec: &[Self::Scalar]) -> Self {
+        assert_eq!(base_vec.len(), Self::PACK_SIZE);
+        let mut packed_to_gf2x64 = [GF2x64::ZERO; Self::PACK_SIZE / GF2x64::PACK_SIZE];
+        packed_to_gf2x64
+            .iter_mut()
+            .zip(base_vec.chunks(GF2x64::PACK_SIZE))
+            .for_each(|(gf2x64, pack)| *gf2x64 = GF2x64::pack(pack));
+
+        unsafe { transmute(packed_to_gf2x64) }
+    }
+
+    #[inline(always)]
+    fn unpack(&self) -> Vec<Self::Scalar> {
+        let packed_to_gf2x64: [GF2x64; Self::PACK_SIZE / GF2x64::PACK_SIZE] =
+            unsafe { transmute(*self) };
+
+        packed_to_gf2x64
+            .iter()
+            .flat_map(|packed| packed.unpack())
+            .collect()
+    }
+}
diff --git a/arith/gf2_128/src/gf2_ext128/neon.rs b/arith/gf2_128/src/gf2_ext128/neon.rs
index 9e1f97db..792e7ad3 100644
--- a/arith/gf2_128/src/gf2_ext128/neon.rs
+++ b/arith/gf2_128/src/gf2_ext128/neon.rs
@@ -2,8 +2,8 @@ use std::iter::{Product, Sum};
 use std::ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign};
 use std::{arch::aarch64::*, mem::transmute};
 
-use arith::{field_common, ExtensionField, Field, FieldSerde, FieldSerdeResult};
-use gf2::GF2;
+use arith::{field_common, ExtensionField, Field, FieldSerde, FieldSerdeResult, SimdField};
+use gf2::{GF2x64, GF2};
 
 #[derive(Clone, Copy, Debug)]
 pub struct NeonGF2_128 {
@@ -203,6 +203,15 @@ impl ExtensionField for NeonGF2_128 {
     }
 }
 
+impl Mul<GF2> for NeonGF2_128 {
+    type Output = NeonGF2_128;
+
+    #[inline]
+    fn mul(self, rhs: GF2) -> Self::Output {
+        self.mul_by_base_field(&rhs)
+    }
+}
+
 impl From<GF2> for NeonGF2_128 {
     #[inline(always)]
     fn from(v: GF2) -> Self {
@@ -394,3 +403,41 @@ pub(crate) fn mul_by_x_internal(a: &uint32x4_t) -> uint32x4_t {
         vreinterpretq_u32_u64(res)
     }
 }
+
+impl SimdField for NeonGF2_128 {
+    type Scalar = GF2;
+
+    const PACK_SIZE: usize = 128;
+
+    #[inline(always)]
+    fn scale(&self, challenge: &Self::Scalar) -> Self {
+        if challenge.v == 0 {
+            Self::ZERO
+        } else {
+            *self
+        }
+    }
+
+    #[inline(always)]
+    fn pack(base_vec: &[Self::Scalar]) -> Self {
+        assert_eq!(base_vec.len(), Self::PACK_SIZE);
+        let mut packed_to_gf2x64 = [GF2x64::ZERO; Self::PACK_SIZE / GF2x64::PACK_SIZE];
+        packed_to_gf2x64
+            .iter_mut()
+            .zip(base_vec.chunks(GF2x64::PACK_SIZE))
+            .for_each(|(gf2x64, pack)| *gf2x64 = GF2x64::pack(pack));
+
+        unsafe { transmute(packed_to_gf2x64) }
+    }
+
+    #[inline(always)]
+    fn unpack(&self) -> Vec<Self::Scalar> {
+        let packed_to_gf2x64: [GF2x64; Self::PACK_SIZE / GF2x64::PACK_SIZE] =
+            unsafe { transmute(*self) };
+
+        packed_to_gf2x64
+            .iter()
+            .flat_map(|packed| packed.unpack())
+            .collect()
+    }
+}
diff --git a/arith/gf2_128/src/gf2_ext128x8/avx256.rs b/arith/gf2_128/src/gf2_ext128x8/avx256.rs
index 9ef2b183..f923fb54 100644
--- a/arith/gf2_128/src/gf2_ext128x8/avx256.rs
+++ b/arith/gf2_128/src/gf2_ext128x8/avx256.rs
@@ -473,10 +473,7 @@ impl SimdField for AVX256GF2_128x8 {
     }
 
     type Scalar = GF2_128;
-    #[inline(always)]
-    fn pack_size() -> usize {
-        8
-    }
+    const PACK_SIZE: usize = 8;
 
     fn pack(base_vec: &[Self::Scalar]) -> Self {
         assert!(base_vec.len() == 8);
@@ -680,6 +677,15 @@ impl ExtensionField for AVX256GF2_128x8 {
     }
 }
 
+impl Mul<GF2x8> for AVX256GF2_128x8 {
+    type Output = AVX256GF2_128x8;
+
+    #[inline]
+    fn mul(self, rhs: GF2x8) -> Self::Output {
+        self.mul_by_base_field(&rhs)
+    }
+}
+
 impl From<GF2x8> for AVX256GF2_128x8 {
     #[inline(always)]
     fn from(v: GF2x8) -> Self {
diff --git a/arith/gf2_128/src/gf2_ext128x8/avx512.rs b/arith/gf2_128/src/gf2_ext128x8/avx512.rs
index b41f98b6..37f89b2c 100644
--- a/arith/gf2_128/src/gf2_ext128x8/avx512.rs
+++ b/arith/gf2_128/src/gf2_ext128x8/avx512.rs
@@ -448,14 +448,11 @@ impl SimdField for AVX512GF2_128x8 {
     }
 
     type Scalar = GF2_128;
-    #[inline(always)]
-    fn pack_size() -> usize {
-        8
-    }
+    const PACK_SIZE: usize = 8;
 
     #[inline(always)]
     fn pack(base_vec: &[Self::Scalar]) -> Self {
-        assert!(base_vec.len() == 8);
+        assert_eq!(base_vec.len(), Self::PACK_SIZE);
         let base_vec_array: [Self::Scalar; 8] = base_vec.try_into().unwrap();
         unsafe { transmute(base_vec_array) }
     }
@@ -715,6 +712,15 @@ impl Mul<GF2_128> for AVX512GF2_128x8 {
     }
 }
 
+impl Mul<GF2x8> for AVX512GF2_128x8 {
+    type Output = AVX512GF2_128x8;
+
+    #[inline(always)]
+    fn mul(self, rhs: GF2x8) -> Self::Output {
+        self.mul_by_base_field(&rhs)
+    }
+}
+
 impl Add for AVX512GF2_128x8 {
     type Output = AVX512GF2_128x8;
     #[inline(always)]
diff --git a/arith/gf2_128/src/gf2_ext128x8/neon.rs b/arith/gf2_128/src/gf2_ext128x8/neon.rs
index 04ba6909..cf49c66a 100644
--- a/arith/gf2_128/src/gf2_ext128x8/neon.rs
+++ b/arith/gf2_128/src/gf2_ext128x8/neon.rs
@@ -75,7 +75,7 @@ impl Field for NeonGF2_128x8 {
 
     const SIZE: usize = 16 * 8;
 
-    const FIELD_SIZE: usize = 128 * 8; // in bits
+    const FIELD_SIZE: usize = 128; // in bits
 
     const ZERO: Self = NeonGF2_128x8 {
         v: [unsafe { transmute::<[u32; 4], uint32x4_t>([0, 0, 0, 0]) }; 8],
@@ -200,10 +200,8 @@ impl SimdField for NeonGF2_128x8 {
             ],
         }
     }
-    #[inline(always)]
-    fn pack_size() -> usize {
-        8
-    }
+
+    const PACK_SIZE: usize = 8;
 
     #[inline(always)]
     fn pack(base_vec: &[Self::Scalar]) -> Self {
@@ -392,6 +390,15 @@ impl From<GF2x8> for NeonGF2_128x8 {
     }
 }
 
+impl Mul<GF2x8> for NeonGF2_128x8 {
+    type Output = NeonGF2_128x8;
+
+    #[inline]
+    fn mul(self, rhs: GF2x8) -> Self::Output {
+        self.mul_by_base_field(&rhs)
+    }
+}
+
 impl Mul<GF2_128> for NeonGF2_128x8 {
     type Output = NeonGF2_128x8;
 
diff --git a/arith/gf2_128/src/tests.rs b/arith/gf2_128/src/tests.rs
index 43be77ff..653a7604 100644
--- a/arith/gf2_128/src/tests.rs
+++ b/arith/gf2_128/src/tests.rs
@@ -13,7 +13,8 @@ use crate::{GF2_128x8, GF2_128};
 
 #[test]
 fn test_simd_field() {
-    random_simd_field_tests::<GF2_128x8>("Simd GF2 Ext128".to_string());
+    random_simd_field_tests::<GF2_128>("Simd for GF2 over GF2Ext128".to_string());
+    random_simd_field_tests::<GF2_128x8>("Simd for GF2Ext128 over GF2Ext128x8".to_string());
 }
 
 #[test]
diff --git a/arith/mersenne31/src/m31_ext.rs b/arith/mersenne31/src/m31_ext.rs
index 0e95ae5e..15b38397 100644
--- a/arith/mersenne31/src/m31_ext.rs
+++ b/arith/mersenne31/src/m31_ext.rs
@@ -207,6 +207,15 @@ impl ExtensionField for M31Ext3 {
     }
 }
 
+impl Mul<M31> for M31Ext3 {
+    type Output = M31Ext3;
+
+    #[inline(always)]
+    fn mul(self, rhs: M31) -> Self::Output {
+        self.mul_by_base_field(&rhs)
+    }
+}
+
 impl Add<M31> for M31Ext3 {
     type Output = M31Ext3;
 
diff --git a/arith/mersenne31/src/m31_ext3x16.rs b/arith/mersenne31/src/m31_ext3x16.rs
index 8de30cfc..85bebd4f 100644
--- a/arith/mersenne31/src/m31_ext3x16.rs
+++ b/arith/mersenne31/src/m31_ext3x16.rs
@@ -57,14 +57,11 @@ impl SimdField for M31Ext3x16 {
         *self * *challenge
     }
 
-    #[inline(always)]
-    fn pack_size() -> usize {
-        M31x16::pack_size()
-    }
+    const PACK_SIZE: usize = M31x16::PACK_SIZE;
 
     #[inline(always)]
     fn pack(base_vec: &[Self::Scalar]) -> Self {
-        assert!(base_vec.len() == Self::pack_size());
+        assert!(base_vec.len() == Self::PACK_SIZE);
         let mut v0s = vec![];
         let mut v1s = vec![];
         let mut v2s = vec![];
@@ -137,6 +134,15 @@ impl ExtensionField for M31Ext3x16 {
     }
 }
 
+impl Mul<M31x16> for M31Ext3x16 {
+    type Output = M31Ext3x16;
+
+    #[inline]
+    fn mul(self, rhs: M31x16) -> Self::Output {
+        self.mul_by_base_field(&rhs)
+    }
+}
+
 impl From<M31Ext3> for M31Ext3x16 {
     #[inline(always)]
     fn from(x: M31Ext3) -> Self {
diff --git a/arith/mersenne31/src/m31x16/m31_avx256.rs b/arith/mersenne31/src/m31x16/m31_avx256.rs
index 537e3911..a8bfedc5 100644
--- a/arith/mersenne31/src/m31x16/m31_avx256.rs
+++ b/arith/mersenne31/src/m31x16/m31_avx256.rs
@@ -279,10 +279,7 @@ impl SimdField for AVXM31 {
         *self * *challenge
     }
 
-    #[inline(always)]
-    fn pack_size() -> usize {
-        M31_PACK_SIZE
-    }
+    const PACK_SIZE: usize = M31_PACK_SIZE;
 
     fn pack(base_vec: &[Self::Scalar]) -> Self {
         assert_eq!(base_vec.len(), M31_PACK_SIZE);
diff --git a/arith/mersenne31/src/m31x16/m31_avx512.rs b/arith/mersenne31/src/m31x16/m31_avx512.rs
index bba45482..c2204a23 100644
--- a/arith/mersenne31/src/m31x16/m31_avx512.rs
+++ b/arith/mersenne31/src/m31x16/m31_avx512.rs
@@ -234,10 +234,7 @@ impl SimdField for AVXM31 {
         *self * *challenge
     }
 
-    #[inline(always)]
-    fn pack_size() -> usize {
-        M31_PACK_SIZE
-    }
+    const PACK_SIZE: usize = M31_PACK_SIZE;
 
     #[inline(always)]
     fn pack(base_vec: &[Self::Scalar]) -> Self {
diff --git a/arith/mersenne31/src/m31x16/m31_neon.rs b/arith/mersenne31/src/m31x16/m31_neon.rs
index 9e0b046e..a61a7841 100644
--- a/arith/mersenne31/src/m31x16/m31_neon.rs
+++ b/arith/mersenne31/src/m31x16/m31_neon.rs
@@ -301,10 +301,7 @@ impl SimdField for NeonM31 {
         *self * packed_challenge
     }
 
-    #[inline(always)]
-    fn pack_size() -> usize {
-        M31_PACK_SIZE
-    }
+    const PACK_SIZE: usize = M31_PACK_SIZE;
 
     #[inline(always)]
     fn pack(base_vec: &[Self::Scalar]) -> Self {
diff --git a/arith/polynomials/src/mle.rs b/arith/polynomials/src/mle.rs
index 02da940d..e53fcd73 100644
--- a/arith/polynomials/src/mle.rs
+++ b/arith/polynomials/src/mle.rs
@@ -9,8 +9,14 @@ pub struct MultiLinearPoly<F: Field> {
 }
 
 impl<F: Field> MultiLinearPoly<F> {
-    /// Sample a random polynomials.
+    #[inline]
+    pub fn new(evals: Vec<F>) -> Self {
+        assert!(evals.len().is_power_of_two());
+        Self { coeffs: evals }
+    }
+
+    /// Sample a random polynomial.
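+    /// The polynomial is represented by its 2^nv evaluations, each drawn
+    /// independently via F::random_unsafe.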
     #[inline]
     pub fn random(nv: usize, mut rng: impl RngCore) -> Self {
         let coeff = (0..1 << nv).map(|_| F::random_unsafe(&mut rng)).collect();
diff --git a/arith/src/bn254.rs b/arith/src/bn254.rs
index 3a36a2a3..2a7f8c9b 100644
--- a/arith/src/bn254.rs
+++ b/arith/src/bn254.rs
@@ -141,10 +141,7 @@ impl SimdField for Fr {
         vec![*self]
     }
 
-    #[inline(always)]
-    fn pack_size() -> usize {
-        1
-    }
+    const PACK_SIZE: usize = 1;
 }
 
 impl FieldSerde for Fr {
diff --git a/arith/src/extension_field.rs b/arith/src/extension_field.rs
index 5f9fe869..c1ab27d7 100644
--- a/arith/src/extension_field.rs
+++ b/arith/src/extension_field.rs
@@ -1,10 +1,14 @@
+use std::ops::Mul;
+
 use crate::{Field, FieldSerde};
 
 /// Configurations for Extension Field over
 /// - either the Binomial polynomial x^DEGREE - W
 /// - or the AES polynomial x^128 + x^7 + x^2 + x + 1
 //
-pub trait ExtensionField: From<Self::BaseField> + Field + FieldSerde {
+pub trait ExtensionField:
+    Mul<Self::BaseField, Output = Self> + From<Self::BaseField> + Field + FieldSerde
+{
     /// Degree of the Extension
     const DEGREE: usize;
 
diff --git a/arith/src/simd_field.rs b/arith/src/simd_field.rs
index e7042a7d..e1d697d5 100644
--- a/arith/src/simd_field.rs
+++ b/arith/src/simd_field.rs
@@ -5,6 +5,9 @@ pub trait SimdField: From<Self::Scalar> + Field + FieldSerde {
     /// Field for the challenge. Can be self.
     type Scalar: Field + FieldSerde + Send;
 
+    /// Pack size (width) for the SIMD instruction
+    const PACK_SIZE: usize;
+
     /// scale self with the challenge
     fn scale(&self, challenge: &Self::Scalar) -> Self;
 
@@ -13,6 +16,4 @@ pub trait SimdField: From<Self::Scalar> + Field + FieldSerde {
 
     /// unpack into a vector.
     fn unpack(&self) -> Vec<Self::Scalar>;
-
-    fn pack_size() -> usize;
 }
diff --git a/config/src/gkr_config.rs b/config/src/gkr_config.rs
index 6bac5a3e..5930df89 100644
--- a/config/src/gkr_config.rs
+++ b/config/src/gkr_config.rs
@@ -101,7 +101,7 @@ pub trait GKRConfig: Default + Debug + Clone + Send + Sync + 'static {
 
     /// The pack size for the simd circuit field, e.g., 16 for M31x16
     fn get_field_pack_size() -> usize {
-        Self::SimdCircuitField::pack_size()
+        Self::SimdCircuitField::PACK_SIZE
     }
 
     /// Evaluate the circuit values at the challenge
diff --git a/transcript/src/fiat_shamir_hash.rs b/transcript/src/fiat_shamir_hash.rs
index 268d1e35..208bdd56 100644
--- a/transcript/src/fiat_shamir_hash.rs
+++ b/transcript/src/fiat_shamir_hash.rs
@@ -1,3 +1,5 @@
+use std::fmt::Debug;
+
 use arith::{Field, FieldSerde};
 
 pub mod sha2_256;
@@ -9,7 +11,7 @@ pub use keccak_256::*;
 pub mod mimc;
 pub use mimc::*;
 
-pub trait FiatShamirBytesHash {
+pub trait FiatShamirBytesHash: Clone + Debug {
     /// The size of the hash output in bytes.
     const DIGEST_SIZE: usize;
 
@@ -23,7 +25,7 @@ pub trait FiatShamirBytesHash {
     fn hash_inplace(buffer: &mut [u8]);
 }
 
-pub trait FiatShamirFieldHash {
+pub trait FiatShamirFieldHash: Clone + Debug {
     /// Create a new hash instance.
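+    /// Hash instances are Clone + Debug, so a transcript holding one can
+    /// itself derive Clone + Debug.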
     fn new() -> Self;
diff --git a/transcript/src/fiat_shamir_hash/keccak_256.rs b/transcript/src/fiat_shamir_hash/keccak_256.rs
index 21645eca..6db6bcd0 100644
--- a/transcript/src/fiat_shamir_hash/keccak_256.rs
+++ b/transcript/src/fiat_shamir_hash/keccak_256.rs
@@ -2,7 +2,7 @@ use tiny_keccak::{Hasher, Sha3};
 
 use super::FiatShamirBytesHash;
 
-#[derive(Clone, Default)]
+#[derive(Clone, Default, Debug)]
 pub struct Keccak256hasher {}
 
 impl FiatShamirBytesHash for Keccak256hasher {
diff --git a/transcript/src/transcript.rs b/transcript/src/transcript.rs
index 05da7d76..221c10a7 100644
--- a/transcript/src/transcript.rs
+++ b/transcript/src/transcript.rs
@@ -1,4 +1,4 @@
-use std::marker::PhantomData;
+use std::{fmt::Debug, marker::PhantomData};
 
 use arith::{Field, FieldSerde};
 
@@ -7,7 +7,7 @@ use crate::{
     Proof,
 };
 
-pub trait Transcript<F: Field + FieldSerde> {
+pub trait Transcript<F: Field + FieldSerde>: Clone + Debug {
     /// Create a new transcript.
     fn new() -> Self;
 
@@ -24,6 +24,18 @@ pub trait Transcript<F: Field + FieldSerde> {
     /// Use this function when you need some randomness other than the native field
     fn generate_challenge_u8_slice(&mut self, n_bytes: usize) -> Vec<u8>;
 
+    /// Generate a list of positions that we want to open the polynomial at.
+    #[inline]
+    fn generate_challenge_index_vector(&mut self, num_queries: usize) -> Vec<usize> {
+        let mut challenges = Vec::with_capacity(num_queries);
+        let mut buf = [0u8; 8];
+        for _ in 0..num_queries {
+            buf.copy_from_slice(self.generate_challenge_u8_slice(8).as_slice());
+            challenges.push(usize::from_le_bytes(buf));
+        }
+        challenges
+    }
+
     /// Generate a challenge vector.
     #[inline]
     fn generate_challenge_field_elements(&mut self, n: usize) -> Vec<F> {

From 5b5e645909c2619613d7e2ff7d5ed3a27787e425 Mon Sep 17 00:00:00 2001
From: Hang Su
Date: Mon, 11 Nov 2024 15:23:04 -0500
Subject: [PATCH 2/6] working on moving simd stuff from gf2_128 to gf2x128 -
 avx done, on neon - food first

---
 arith/gf2/src/gf2x128.rs      |   9 +
 arith/gf2/src/gf2x128/avx.rs  | 365 ++++++++++++++++++++++++++++++++++
 arith/gf2/src/gf2x128/neon.rs |  44 ++++
 arith/gf2/src/lib.rs          |   3 +
 4 files changed, 421 insertions(+)
 create mode 100644 arith/gf2/src/gf2x128.rs
 create mode 100644 arith/gf2/src/gf2x128/avx.rs
 create mode 100644 arith/gf2/src/gf2x128/neon.rs

diff --git a/arith/gf2/src/gf2x128.rs b/arith/gf2/src/gf2x128.rs
new file mode 100644
index 00000000..4a150bab
--- /dev/null
+++ b/arith/gf2/src/gf2x128.rs
@@ -0,0 +1,9 @@
+#[cfg(target_arch = "x86_64")]
+mod avx;
+#[cfg(target_arch = "x86_64")]
+pub type GF2x128 = avx::AVXGF2x128;
+
+#[cfg(target_arch = "aarch64")]
+mod neon;
+#[cfg(target_arch = "aarch64")]
+pub type GF2x128 = neon::NeonGF2x128;
diff --git a/arith/gf2/src/gf2x128/avx.rs b/arith/gf2/src/gf2x128/avx.rs
new file mode 100644
index 00000000..c3d06ef3
--- /dev/null
+++ b/arith/gf2/src/gf2x128/avx.rs
@@ -0,0 +1,365 @@
+use std::{
+    arch::x86_64::*,
+    mem::{transmute, zeroed},
+    ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign},
+};
+
+use arith::{Field, FieldSerde, FieldSerdeResult, SimdField};
+
+use crate::{GF2x64, GF2};
+
+#[derive(Debug, Clone, Copy)]
+pub struct AVXGF2x128 {
+    pub v: __m128i,
+}
+
+impl FieldSerde for AVXGF2x128 {
+    const SERIALIZED_SIZE: usize = 16;
+
+    #[inline(always)]
+    fn serialize_into<W: std::io::Write>(&self, mut writer: W) -> FieldSerdeResult<()> {
+        unsafe {
+            writer.write_all(transmute::<__m128i, [u8; Self::SERIALIZED_SIZE]>(self.v).as_ref())?
+        };
+        Ok(())
+    }
+
+    #[inline(always)]
+    fn deserialize_from<R: std::io::Read>(mut reader: R) -> FieldSerdeResult<Self> {
+        let mut u = [0u8; Self::SERIALIZED_SIZE];
+        reader.read_exact(&mut u)?;
+        unsafe {
+            Ok(AVXGF2x128 {
+                v: transmute::<[u8; Self::SERIALIZED_SIZE], __m128i>(u),
+            })
+        }
+    }
+
+    #[inline(always)]
+    fn try_deserialize_from_ecc_format<R: std::io::Read>(mut reader: R) -> FieldSerdeResult<Self> {
+        let mut u = [0u8; 32];
+        reader.read_exact(&mut u)?;
+        Ok(unsafe {
+            AVXGF2x128 {
+                v: transmute::<[u8; 16], __m128i>(u[..16].try_into().unwrap()),
+            }
+        })
+    }
+}
+
+impl Field for AVXGF2x128 {
+    const NAME: &'static str = "Galois Field 2 SIMD 128";
+
+    const SIZE: usize = 16;
+
+    const FIELD_SIZE: usize = 1; // in bits
+
+    const ZERO: Self = AVXGF2x128 {
+        v: unsafe { zeroed() },
+    };
+
+    const ONE: Self = AVXGF2x128 {
+        v: unsafe { transmute([!0u64, !0u64]) },
+    };
+
+    const INV_2: Self = AVXGF2x128 {
+        v: unsafe { zeroed() },
+    }; // should not be used
+
+    #[inline(always)]
+    fn zero() -> Self {
+        AVXGF2x128 {
+            v: unsafe { zeroed() },
+        }
+    }
+
+    #[inline(always)]
+    fn one() -> Self {
+        AVXGF2x128 {
+            v: unsafe { transmute([!0u64, !0u64]) },
+        }
+    }
+
+    #[inline(always)]
+    fn is_zero(&self) -> bool {
+        unsafe { transmute::<__m128i, [u8; 16]>(self.v) == [0; 16] }
+    }
+
+    #[inline(always)]
+    fn random_unsafe(mut rng: impl rand::RngCore) -> Self {
+        let mut u = [0u8; 16];
+        rng.fill_bytes(&mut u);
+        unsafe {
+            AVXGF2x128 {
+                v: *(u.as_ptr() as *const __m128i),
+            }
+        }
+    }
+
+    #[inline(always)]
+    fn random_bool(mut rng: impl rand::RngCore) -> Self {
+        let mut u = [0u8; 16];
+        rng.fill_bytes(&mut u);
+        unsafe {
+            AVXGF2x128 {
+                v: *(u.as_ptr() as *const __m128i),
+            }
+        }
+    }
+
+    #[inline(always)]
+    fn exp(&self, exponent: u128) -> Self {
+        if exponent % 2 == 0 {
+            AVXGF2x128::ONE
+        } else {
+            *self
+        }
+    }
+
+    #[inline(always)]
+    fn inv(&self) -> Option<Self> {
+        unimplemented!()
+    }
+
+    #[inline(always)]
+    fn as_u32_unchecked(&self) -> u32 {
+        unimplemented!("u32 for GF2x128 doesn't make sense")
+    }
+
+    #[inline(always)]
+    fn from_uniform_bytes(bytes: &[u8; 32]) -> Self {
+        unsafe {
+            AVXGF2x128 {
+                v: transmute::<[u8; 16], __m128i>(bytes[..16].try_into().unwrap()),
+            }
+        }
+    }
+
+    #[inline(always)]
+    fn mul_by_5(&self) -> Self {
+        *self
+    }
+
+    #[inline(always)]
+    fn mul_by_6(&self) -> Self {
+        Self::ZERO
+    }
+}
+
+impl Default for AVXGF2x128 {
+    #[inline(always)]
+    fn default() -> Self {
+        Self::ZERO
+    }
+}
+
+impl PartialEq for AVXGF2x128 {
+    #[inline(always)]
+    fn eq(&self, other: &Self) -> bool {
+        unsafe { _mm_test_all_ones(_mm_cmpeq_epi8(self.v, other.v)) == 1 }
+    }
+}
+
+impl Mul<&AVXGF2x128> for AVXGF2x128 {
+    type Output = AVXGF2x128;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn mul(self, rhs: &AVXGF2x128) -> AVXGF2x128 {
+        AVXGF2x128 {
+            v: unsafe { _mm_and_si128(self.v, rhs.v) },
+        }
+    }
+}
+
+impl Mul for AVXGF2x128 {
+    type Output = AVXGF2x128;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn mul(self, rhs: AVXGF2x128) -> AVXGF2x128 {
+        AVXGF2x128 {
+            v: unsafe { _mm_and_si128(self.v, rhs.v) },
+        }
+    }
+}
+
+impl MulAssign<&AVXGF2x128> for AVXGF2x128 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn mul_assign(&mut self, rhs: &AVXGF2x128) {
+        self.v = unsafe { _mm_and_si128(self.v, rhs.v) };
+    }
+}
+
+impl MulAssign for AVXGF2x128 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn mul_assign(&mut self, rhs: AVXGF2x128) {
+        self.v = unsafe { _mm_and_si128(self.v, rhs.v) };
+    }
+}
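+
+// 128-lane GF(2) arithmetic maps directly onto SSE bitwise intrinsics:
+// multiplication is a single _mm_and_si128 and addition/subtraction a
+// single _mm_xor_si128, as the remaining operator impls below show.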
+impl Sub for AVXGF2x128 {
+    type Output = AVXGF2x128;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn sub(self, rhs: AVXGF2x128) -> AVXGF2x128 {
+        AVXGF2x128 {
+            v: unsafe { _mm_xor_si128(self.v, rhs.v) },
+        }
+    }
+}
+
+impl SubAssign for AVXGF2x128 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn sub_assign(&mut self, rhs: AVXGF2x128) {
+        self.v = unsafe { _mm_xor_si128(self.v, rhs.v) };
+    }
+}
+
+impl Add for AVXGF2x128 {
+    type Output = AVXGF2x128;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn add(self, rhs: AVXGF2x128) -> AVXGF2x128 {
+        AVXGF2x128 {
+            v: unsafe { _mm_xor_si128(self.v, rhs.v) },
+        }
+    }
+}
+
+impl AddAssign for AVXGF2x128 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn add_assign(&mut self, rhs: AVXGF2x128) {
+        self.v = unsafe { _mm_xor_si128(self.v, rhs.v) };
+    }
+}
+
+impl Add<&AVXGF2x128> for AVXGF2x128 {
+    type Output = AVXGF2x128;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn add(self, rhs: &AVXGF2x128) -> AVXGF2x128 {
+        AVXGF2x128 {
+            v: unsafe { _mm_xor_si128(self.v, rhs.v) },
+        }
+    }
+}
+
+impl AddAssign<&AVXGF2x128> for AVXGF2x128 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn add_assign(&mut self, rhs: &AVXGF2x128) {
+        self.v = unsafe { _mm_xor_si128(self.v, rhs.v) };
+    }
+}
+
+impl Sub<&AVXGF2x128> for AVXGF2x128 {
+    type Output = AVXGF2x128;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn sub(self, rhs: &AVXGF2x128) -> AVXGF2x128 {
+        AVXGF2x128 {
+            v: unsafe { _mm_xor_si128(self.v, rhs.v) },
+        }
+    }
+}
+
+impl SubAssign<&AVXGF2x128> for AVXGF2x128 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn sub_assign(&mut self, rhs: &AVXGF2x128) {
+        self.v = unsafe { _mm_xor_si128(self.v, rhs.v) };
+    }
+}
+
+impl<T: std::borrow::Borrow<AVXGF2x128>> std::iter::Sum<T> for AVXGF2x128 {
+    fn sum<I: Iterator<Item = T>>(iter: I) -> Self {
+        iter.fold(Self::zero(), |acc, item| acc + item.borrow())
+    }
+}
+
+impl<T: std::borrow::Borrow<AVXGF2x128>> std::iter::Product<T> for AVXGF2x128 {
+    fn product<I: Iterator<Item = T>>(iter: I) -> Self {
+        iter.fold(Self::one(), |acc, item| acc * item.borrow())
+    }
+}
+
+impl Neg for AVXGF2x128 {
+    type Output = AVXGF2x128;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn neg(self) -> AVXGF2x128 {
+        AVXGF2x128 { v: self.v }
+    }
+}
+
+impl From<u32> for AVXGF2x128 {
+    #[inline(always)]
+    fn from(v: u32) -> Self {
+        assert!(v < 2);
+        if v == 0 {
+            AVXGF2x128::ZERO
+        } else {
+            AVXGF2x128::ONE
+        }
+    }
+}
+
+impl From<GF2> for AVXGF2x128 {
+    #[inline(always)]
+    fn from(v: GF2) -> Self {
+        assert!(v.v < 2);
+        if v.v == 0 {
+            AVXGF2x128::ZERO
+        } else {
+            AVXGF2x128::ONE
+        }
+    }
+}
+
+impl SimdField for AVXGF2x128 {
+    type Scalar = GF2;
+
+    const PACK_SIZE: usize = 128;
+
+    #[inline(always)]
+    fn scale(&self, challenge: &Self::Scalar) -> Self {
+        if challenge.v == 0 {
+            Self::ZERO
+        } else {
+            *self
+        }
+    }
+
+    #[inline(always)]
+    fn pack(base_vec: &[Self::Scalar]) -> Self {
+        assert_eq!(base_vec.len(), Self::PACK_SIZE);
+        let mut packed_to_gf2x64 = [GF2x64::ZERO; Self::PACK_SIZE / GF2x64::PACK_SIZE];
+        packed_to_gf2x64
+            .iter_mut()
+            .zip(base_vec.chunks(GF2x64::PACK_SIZE))
+            .for_each(|(gf2x64, pack)| *gf2x64 = GF2x64::pack(pack));
+
+        unsafe { transmute(packed_to_gf2x64) }
+    }
+
+    #[inline(always)]
+    fn unpack(&self) -> Vec<Self::Scalar> {
+        let packed_to_gf2x64: [GF2x64; Self::PACK_SIZE / GF2x64::PACK_SIZE] =
+            unsafe { transmute(*self) };
+
+        packed_to_gf2x64
+            .iter()
+            .flat_map(|packed| packed.unpack())
+            .collect()
+    }
+}
diff --git a/arith/gf2/src/gf2x128/neon.rs b/arith/gf2/src/gf2x128/neon.rs
new file mode 100644
index 00000000..494463ff
--- /dev/null
+++ b/arith/gf2/src/gf2x128/neon.rs
@@ -0,0 +1,44 @@
+#[derive(Clone, Copy, Debug)]
+pub struct NeonGF2x128 {
+    pub(crate) v: uint32x4_t,
+}
+
+impl FieldSerde for NeonGF2_128 {
+    const SERIALIZED_SIZE: usize = 16;
+
+    #[inline(always)]
+    fn serialize_into<W: std::io::Write>(&self, mut writer: W) -> FieldSerdeResult<()> {
+        unsafe { writer.write_all(transmute::<uint32x4_t, [u8; 16]>(self.v).as_ref())? };
+        Ok(())
+    }
+
+    #[inline(always)]
+    fn deserialize_from<R: std::io::Read>(mut reader: R) -> FieldSerdeResult<Self> {
+        let mut u = [0u8; 16];
+        reader.read_exact(&mut u)?;
+        unsafe {
+            Ok(NeonGF2_128 {
+                v: transmute::<[u8; 16], uint32x4_t>(u),
+            })
+        }
+    }
+
+    #[inline]
+    fn try_deserialize_from_ecc_format<R: std::io::Read>(mut reader: R) -> FieldSerdeResult<Self>
+    where
+        Self: Sized,
+    {
+        let mut u = [0u8; 32];
+        reader.read_exact(&mut u)?;
+        Ok(unsafe {
+            NeonGF2_128 {
+                v: transmute::<[u8; 16], uint32x4_t>(u[..16].try_into().unwrap()),
+            }
+        })
+    }
+}
+
+// TODO: FieldSerde
+
+// TODO: Field
+
+// TODO: SimdField
diff --git a/arith/gf2/src/lib.rs b/arith/gf2/src/lib.rs
index 94edd4a7..46de4995 100644
--- a/arith/gf2/src/lib.rs
+++ b/arith/gf2/src/lib.rs
@@ -9,5 +9,8 @@ pub use gf2x8::GF2x8;
 mod gf2x64;
 pub use gf2x64::GF2x64;
 
+// mod gf2x128;
+// pub use gf2x128::GF2x128;
+
 #[cfg(test)]
 mod tests;

From 10e2afc6aa01477bb70880b4b5e62db9ed0e6945 Mon Sep 17 00:00:00 2001
From: Hang Su
Date: Mon, 11 Nov 2024 15:59:59 -0500
Subject: [PATCH 3/6] neon impl wrap up

---
 arith/gf2/src/gf2x128/neon.rs | 326 +++++++++++++++++++++++++++++++++-
 arith/gf2/src/lib.rs          |   4 +-
 2 files changed, 323 insertions(+), 7 deletions(-)

diff --git a/arith/gf2/src/gf2x128/neon.rs b/arith/gf2/src/gf2x128/neon.rs
index 494463ff..19403405 100644
--- a/arith/gf2/src/gf2x128/neon.rs
+++ b/arith/gf2/src/gf2x128/neon.rs
@@ -1,9 +1,19 @@
+use std::{
+    arch::aarch64::*,
+    mem::{transmute, zeroed},
+    ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign},
+};
+
+use arith::{Field, FieldSerde, FieldSerdeResult, SimdField};
+
+use crate::{GF2x64, GF2};
+
 #[derive(Clone, Copy, Debug)]
 pub struct NeonGF2x128 {
     pub(crate) v: uint32x4_t,
 }
 
-impl FieldSerde for NeonGF2_128 {
+impl FieldSerde for NeonGF2x128 {
     const SERIALIZED_SIZE: usize = 16;
 
     #[inline(always)]
@@ -17,7 +27,7 @@ impl FieldSerde for NeonGF2x128 {
         let mut u = [0u8; 16];
         reader.read_exact(&mut u)?;
         unsafe {
-            Ok(NeonGF2_128 {
+            Ok(NeonGF2x128 {
                 v: transmute::<[u8; 16], uint32x4_t>(u),
             })
         }
@@ -31,14 +41,320 @@ impl FieldSerde for NeonGF2x128 {
         let mut u = [0u8; 32];
         reader.read_exact(&mut u)?;
         Ok(unsafe {
-            NeonGF2_128 {
+            NeonGF2x128 {
                 v: transmute::<[u8; 16], uint32x4_t>(u[..16].try_into().unwrap()),
             }
         })
     }
 }
-// TODO: FieldSerde
 
-// TODO: Field
+impl Field for NeonGF2x128 {
+    const NAME: &'static str = "Galois Field 2 SIMD 128";
+
+    const SIZE: usize = 128 / 8;
+
+    const FIELD_SIZE: usize = 128; // in bits
+
+    const ZERO: Self = NeonGF2x128 {
+        v: unsafe { zeroed() },
+    };
+
+    const ONE: Self = NeonGF2x128 {
+        v: unsafe { transmute::<[u64; 2], uint32x4_t>([!0u64, !0u64]) },
+    };
+
+    const INV_2: Self = NeonGF2x128 {
+        v: unsafe { zeroed() },
+    }; // should not be used
+
+    #[inline(always)]
+    fn zero() -> Self {
+        NeonGF2x128 {
+            v: unsafe { zeroed() },
+        }
+    }
+
+    #[inline(always)]
+    fn one() -> Self {
+        NeonGF2x128 {
+            v: unsafe { transmute::<[u64; 2], uint32x4_t>([!0u64, !0u64]) },
+        }
+    }
+
+    #[inline(always)]
+    fn is_zero(&self) -> bool {
+        unsafe { transmute::<uint32x4_t, [u8; 16]>(self.v) == [0; 16] }
+    }
+
+    #[inline(always)]
+    fn random_unsafe(mut rng: impl rand::RngCore) -> Self {
+        let mut u = [0u8; 16];
+        rng.fill_bytes(&mut u);
+        unsafe {
+            NeonGF2x128 {
+                v: *(u.as_ptr() as *const uint32x4_t),
+            }
+        }
+    }
+
+    #[inline(always)]
+    fn random_bool(mut rng: impl rand::RngCore) -> Self {
+        let mut u = [0u8; 16];
+        rng.fill_bytes(&mut u);
+        unsafe {
+            NeonGF2x128 {
+                v: *(u.as_ptr() as *const uint32x4_t),
+            }
+        }
+    }
+
+    #[inline(always)]
+    fn exp(&self, exponent: u128) -> Self {
+        if exponent % 2 == 0 {
+            NeonGF2x128::ONE
+        } else {
+            *self
+        }
+    }
+
+    #[inline(always)]
+    fn inv(&self) -> Option<Self> {
+        unimplemented!()
+    }
+
+    #[inline(always)]
+    fn as_u32_unchecked(&self) -> u32 {
+        unimplemented!("u32 for GF2x128 doesn't make sense")
+    }
+
+    #[inline(always)]
+    fn from_uniform_bytes(bytes: &[u8; 32]) -> Self {
+        unsafe {
+            NeonGF2x128 {
+                v: transmute::<[u8; 16], uint32x4_t>(bytes[..16].try_into().unwrap()),
+            }
+        }
+    }
+}
+
+impl Default for NeonGF2x128 {
+    #[inline(always)]
+    fn default() -> Self {
+        Self::ZERO
+    }
+}
+
+impl PartialEq for NeonGF2x128 {
+    #[inline(always)]
+    fn eq(&self, other: &Self) -> bool {
+        unsafe {
+            transmute::<uint32x4_t, [u8; 16]>(self.v) == transmute::<uint32x4_t, [u8; 16]>(other.v)
+        }
+    }
+}
+
+impl Mul<&NeonGF2x128> for NeonGF2x128 {
+    type Output = NeonGF2x128;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn mul(self, rhs: &NeonGF2x128) -> NeonGF2x128 {
+        NeonGF2x128 {
+            v: unsafe { vandq_u32(self.v, rhs.v) },
+        }
+    }
+}
+
+impl Mul for NeonGF2x128 {
+    type Output = NeonGF2x128;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn mul(self, rhs: NeonGF2x128) -> NeonGF2x128 {
+        NeonGF2x128 {
+            v: unsafe { vandq_u32(self.v, rhs.v) },
+        }
+    }
+}
+
+impl MulAssign<&NeonGF2x128> for NeonGF2x128 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn mul_assign(&mut self, rhs: &NeonGF2x128) {
+        self.v = unsafe { vandq_u32(self.v, rhs.v) };
+    }
+}
+
+impl MulAssign for NeonGF2x128 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn mul_assign(&mut self, rhs: NeonGF2x128) {
+        self.v = unsafe { vandq_u32(self.v, rhs.v) };
+    }
+}
+
+impl Sub for NeonGF2x128 {
+    type Output = NeonGF2x128;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn sub(self, rhs: NeonGF2x128) -> NeonGF2x128 {
+        NeonGF2x128 {
+            v: unsafe { veorq_u32(self.v, rhs.v) },
+        }
+    }
+}
+
+impl SubAssign for NeonGF2x128 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn sub_assign(&mut self, rhs: NeonGF2x128) {
+        self.v = unsafe { veorq_u32(self.v, rhs.v) };
+    }
+}
+
+impl Add for NeonGF2x128 {
+    type Output = NeonGF2x128;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn add(self, rhs: NeonGF2x128) -> NeonGF2x128 {
+        NeonGF2x128 {
+            v: unsafe { veorq_u32(self.v, rhs.v) },
+        }
+    }
+}
+
+impl AddAssign for NeonGF2x128 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn add_assign(&mut self, rhs: NeonGF2x128) {
+        self.v = unsafe { veorq_u32(self.v, rhs.v) };
+    }
+}
+
+impl Add<&NeonGF2x128> for NeonGF2x128 {
+    type Output = NeonGF2x128;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn add(self, rhs: &NeonGF2x128) -> NeonGF2x128 {
+        NeonGF2x128 {
+            v: unsafe { veorq_u32(self.v, rhs.v) },
+        }
+    }
+}
+
+impl AddAssign<&NeonGF2x128> for NeonGF2x128 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn add_assign(&mut self, rhs: &NeonGF2x128) {
+        self.v = unsafe { veorq_u32(self.v, rhs.v) };
+    }
+}
+
+impl Sub<&NeonGF2x128> for NeonGF2x128 {
+    type Output = NeonGF2x128;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn sub(self, rhs: &NeonGF2x128) -> NeonGF2x128 {
+        NeonGF2x128 {
+            v: unsafe { veorq_u32(self.v, rhs.v) },
+        }
+    }
+}
+
+impl SubAssign<&NeonGF2x128> for NeonGF2x128 {
+    #[inline(always)]
+    #[allow(clippy::suspicious_op_assign_impl)]
+    fn sub_assign(&mut self, rhs: &NeonGF2x128) {
+        self.v = unsafe { veorq_u32(self.v, rhs.v) };
+    }
+}
+
+impl<T: std::borrow::Borrow<NeonGF2x128>> std::iter::Sum<T> for NeonGF2x128 {
+    fn sum<I: Iterator<Item = T>>(iter: I) -> Self {
+        iter.fold(Self::zero(), |acc, item| acc + item.borrow())
+    }
+}
+
+impl<T: std::borrow::Borrow<NeonGF2x128>> std::iter::Product<T> for NeonGF2x128 {
+    fn product<I: Iterator<Item = T>>(iter: I) -> Self {
+        iter.fold(Self::one(), |acc, item| acc * item.borrow())
+    }
+}
+
+impl Neg for NeonGF2x128 {
+    type Output = NeonGF2x128;
+
+    #[inline(always)]
+    #[allow(clippy::suspicious_arithmetic_impl)]
+    fn neg(self) -> NeonGF2x128 {
+        NeonGF2x128 { v: self.v }
+    }
+}
+
+impl From<u32> for NeonGF2x128 {
+    #[inline(always)]
+    fn from(v: u32) -> Self {
+        assert!(v < 2);
+        if v == 0 {
+            NeonGF2x128::ZERO
+        } else {
+            NeonGF2x128::ONE
+        }
+    }
+}
+
+impl From<GF2> for NeonGF2x128 {
+    #[inline(always)]
+    fn from(v: GF2) -> Self {
+        assert!(v.v < 2);
+        if v.v == 0 {
+            NeonGF2x128::ZERO
+        } else {
+            NeonGF2x128::ONE
+        }
+    }
+}
+
 // TODO: SimdField
+
+impl SimdField for NeonGF2x128 {
+    type Scalar = GF2;
+
+    const PACK_SIZE: usize = 128;
+
+    #[inline(always)]
+    fn scale(&self, challenge: &Self::Scalar) -> Self {
+        if challenge.v == 0 {
+            Self::ZERO
+        } else {
+            *self
+        }
+    }
+
+    #[inline(always)]
+    fn pack(base_vec: &[Self::Scalar]) -> Self {
+        assert_eq!(base_vec.len(), Self::PACK_SIZE);
+        let mut packed_to_gf2x64 = [GF2x64::ZERO; Self::PACK_SIZE / GF2x64::PACK_SIZE];
+        packed_to_gf2x64
+            .iter_mut()
+            .zip(base_vec.chunks(GF2x64::PACK_SIZE))
+            .for_each(|(gf2x64, pack)| *gf2x64 = GF2x64::pack(pack));
+
+        unsafe { transmute(packed_to_gf2x64) }
+    }
+
+    #[inline(always)]
+    fn unpack(&self) -> Vec<Self::Scalar> {
+        let packed_to_gf2x64: [GF2x64; Self::PACK_SIZE / GF2x64::PACK_SIZE] =
+            unsafe { transmute(*self) };
+
+        packed_to_gf2x64
+            .iter()
+            .flat_map(|packed| packed.unpack())
+            .collect()
+    }
+}
diff --git a/arith/gf2/src/lib.rs b/arith/gf2/src/lib.rs
index 46de4995..9c422663 100644
--- a/arith/gf2/src/lib.rs
+++ b/arith/gf2/src/lib.rs
@@ -9,5 +9,8 @@ pub use gf2x8::GF2x8;
 mod gf2x64;
 pub use gf2x64::GF2x64;
 
-// mod gf2x128;
-// pub use gf2x128::GF2x128;
+mod gf2x128;
+pub use gf2x128::GF2x128;
 
 #[cfg(test)]
 mod tests;

From 4fd26abc44b17fa828b34d4bccf3a2eba86029d4 Mon Sep 17 00:00:00 2001
From: Hang Su
Date: Mon, 11 Nov 2024 16:04:39 -0500
Subject: [PATCH 4/6] remove simdfield impl in gf2_128

---
 arith/gf2/src/gf2x128/avx.rs         |  4 +--
 arith/gf2/src/gf2x128/neon.rs        |  2 --
 arith/gf2/src/tests.rs               |  8 ++++--
 arith/gf2_128/src/gf2_ext128/avx.rs  | 42 ++--------------------------
 arith/gf2_128/src/gf2_ext128/neon.rs | 42 ++--------------------------
 arith/gf2_128/src/tests.rs           |  3 +-
 6 files changed, 13 insertions(+), 88 deletions(-)

diff --git a/arith/gf2/src/gf2x128/avx.rs b/arith/gf2/src/gf2x128/avx.rs
index c3d06ef3..566c7532 100644
--- a/arith/gf2/src/gf2x128/avx.rs
+++ b/arith/gf2/src/gf2x128/avx.rs
@@ -59,7 +59,7 @@ impl Field for AVXGF2x128 {
     };
 
     const ONE: Self = AVXGF2x128 {
-        v: unsafe { transmute([!0u64, !0u64]) },
+        v: unsafe { transmute::<[u64; 2], __m128i>([!0u64, !0u64]) },
     };
 
     const INV_2: Self = AVXGF2x128 {
@@ -76,7 +76,7 @@ impl Field for AVXGF2x128 {
     #[inline(always)]
     fn one() -> Self {
         AVXGF2x128 {
-            v: unsafe { transmute([!0u64, !0u64]) },
+            v: unsafe { transmute::<[u64; 2], __m128i>([!0u64, !0u64]) },
         }
     }
 
diff --git a/arith/gf2/src/gf2x128/neon.rs b/arith/gf2/src/gf2x128/neon.rs
index 19403405..c3f7b3be 100644
--- a/arith/gf2/src/gf2x128/neon.rs
+++ b/arith/gf2/src/gf2x128/neon.rs
@@ -319,8 +319,6 @@ impl From<GF2> for NeonGF2x128 {
     }
 }
 
-// TODO: SimdField
-
 impl SimdField for NeonGF2x128 {
     type Scalar = GF2;
 
diff --git a/arith/gf2/src/tests.rs b/arith/gf2/src/tests.rs
index f3364700..b6700b24 100644
--- a/arith/gf2/src/tests.rs
+++ b/arith/gf2/src/tests.rs
@@ -5,7 +5,7 @@ use arith::{
     random_field_tests, random_inversion_tests, random_simd_field_tests, Field, FieldSerde,
 };
 
-use crate::{GF2x64, GF2x8, GF2};
+use crate::{GF2x128, GF2x64, GF2x8, GF2};
 
 #[test]
 fn test_field() {
@@ -22,6 +22,9 @@ fn test_simd_field() {
 
     random_field_tests::<GF2x64>("Vectorized GF2 len 64".to_string());
     random_simd_field_tests::<GF2x64>("Vectorized GF2 len 64".to_string());
+
+    random_field_tests::<GF2x128>("Vectorized GF2 len 128".to_string());
+    random_simd_field_tests::<GF2x128>("Vectorized GF2 len 128".to_string());
 }
 
 fn custom_serde_vectorize_gf2<F: Field + FieldSerde>() {
@@ -38,5 +41,6 @@ fn custom_serde_vectorize_gf2<F: Field + FieldSerde>() {
 #[test]
 fn test_custom_serde_vectorize_gf2() {
     custom_serde_vectorize_gf2::<GF2x8>();
-    custom_serde_vectorize_gf2::<GF2x64>()
+    custom_serde_vectorize_gf2::<GF2x64>();
+    custom_serde_vectorize_gf2::<GF2x128>()
 }
diff --git a/arith/gf2_128/src/gf2_ext128/avx.rs b/arith/gf2_128/src/gf2_ext128/avx.rs
index 97403998..03990aa2 100644
--- a/arith/gf2_128/src/gf2_ext128/avx.rs
+++ b/arith/gf2_128/src/gf2_ext128/avx.rs
@@ -5,8 +5,8 @@ use std::{
     ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign},
 };
 
-use arith::{field_common, ExtensionField, Field, FieldSerde, FieldSerdeResult, SimdField};
-use gf2::{GF2x64, GF2};
+use arith::{field_common, ExtensionField, Field, FieldSerde, FieldSerdeResult};
+use gf2::GF2;
 
 #[derive(Debug, Clone, Copy)]
 pub struct AVXGF2_128 {
@@ -328,41 +328,3 @@ fn mul_internal(a: &AVXGF2_128, b: &AVXGF2_128) -> AVXGF2_128 {
         v: unsafe { gfmul(a.v, b.v) },
     }
 }
-
-impl SimdField for AVXGF2_128 {
-    type Scalar = GF2;
-
-    const PACK_SIZE: usize = 128;
-
-    #[inline(always)]
-    fn scale(&self, challenge: &Self::Scalar) -> Self {
-        if challenge.v == 0 {
-            Self::ZERO
-        } else {
-            *self
-        }
-    }
-
-    #[inline(always)]
-    fn pack(base_vec: &[Self::Scalar]) -> Self {
-        assert_eq!(base_vec.len(), Self::PACK_SIZE);
-        let mut packed_to_gf2x64 = [GF2x64::ZERO; Self::PACK_SIZE / GF2x64::PACK_SIZE];
-        packed_to_gf2x64
-            .iter_mut()
-            .zip(base_vec.chunks(GF2x64::PACK_SIZE))
-            .for_each(|(gf2x64, pack)| *gf2x64 = GF2x64::pack(pack));
-
-        unsafe { transmute(packed_to_gf2x64) }
-    }
-
-    #[inline(always)]
-    fn unpack(&self) -> Vec<Self::Scalar> {
-        let packed_to_gf2x64: [GF2x64; Self::PACK_SIZE / GF2x64::PACK_SIZE] =
-            unsafe { transmute(*self) };
-
-        packed_to_gf2x64
-            .iter()
-            .flat_map(|packed| packed.unpack())
-            .collect()
-    }
-}
diff --git a/arith/gf2_128/src/gf2_ext128/neon.rs b/arith/gf2_128/src/gf2_ext128/neon.rs
index 792e7ad3..ea1b528b 100644
--- a/arith/gf2_128/src/gf2_ext128/neon.rs
+++ b/arith/gf2_128/src/gf2_ext128/neon.rs
@@ -2,8 +2,8 @@ use std::iter::{Product, Sum};
 use std::ops::{Add, AddAssign, Mul, MulAssign, Neg, Sub, SubAssign};
 use std::{arch::aarch64::*, mem::transmute};
 
-use arith::{field_common, ExtensionField, Field, FieldSerde, FieldSerdeResult, SimdField};
-use gf2::{GF2x64, GF2};
+use arith::{field_common, ExtensionField, Field, FieldSerde, FieldSerdeResult};
+use gf2::GF2;
 
 #[derive(Clone, Copy, Debug)]
 pub struct NeonGF2_128 {
@@ -403,41 +403,3 @@ pub(crate) fn mul_by_x_internal(a: &uint32x4_t) -> uint32x4_t {
         vreinterpretq_u32_u64(res)
     }
 }
-
-impl SimdField for NeonGF2_128 {
-    type Scalar = GF2;
-
-    const PACK_SIZE: usize = 128;
-
-    #[inline(always)]
-    fn scale(&self, challenge: &Self::Scalar) -> Self {
-        if challenge.v == 0 {
-            Self::ZERO
-        } else {
-            *self
-        }
-    }
-
-    #[inline(always)]
-    fn pack(base_vec: &[Self::Scalar]) -> Self {
-        assert_eq!(base_vec.len(), Self::PACK_SIZE);
-        let mut packed_to_gf2x64 = [GF2x64::ZERO; Self::PACK_SIZE / GF2x64::PACK_SIZE];
-        packed_to_gf2x64
-            .iter_mut()
-            .zip(base_vec.chunks(GF2x64::PACK_SIZE))
-            .for_each(|(gf2x64, pack)| *gf2x64 = GF2x64::pack(pack));
-
-        unsafe { transmute(packed_to_gf2x64) }
-    }
-
-    #[inline(always)]
-    fn unpack(&self) -> Vec<Self::Scalar> {
-        let packed_to_gf2x64: [GF2x64; Self::PACK_SIZE / GF2x64::PACK_SIZE] =
-            unsafe { transmute(*self) };
-
-        packed_to_gf2x64
-            .iter()
-            .flat_map(|packed| packed.unpack())
-            .collect()
-    }
-}
diff --git a/arith/gf2_128/src/tests.rs b/arith/gf2_128/src/tests.rs
index 653a7604..43be77ff 100644
--- a/arith/gf2_128/src/tests.rs
+++ b/arith/gf2_128/src/tests.rs
@@ -13,8 +13,7 @@ use crate::{GF2_128x8, GF2_128};
 
 #[test]
 fn test_simd_field() {
-    random_simd_field_tests::<GF2_128>("Simd for GF2 over GF2Ext128".to_string());
-    random_simd_field_tests::<GF2_128x8>("Simd for GF2Ext128 over GF2Ext128x8".to_string());
+    random_simd_field_tests::<GF2_128x8>("Simd GF2 Ext128".to_string());
 }
 
 #[test]

From 33e9d8cc26ea048eb1be0e88ebd67d1345615ad4 Mon Sep 17 00:00:00 2001
From: Hang Su
Date: Mon, 11 Nov 2024 16:45:46 -0500
Subject: [PATCH 5/6] minor cleanups, fix exp impl

---
 arith/gf2/src/gf2x128/avx.rs  | 19 ++++---------------
 arith/gf2/src/gf2x128/neon.rs |  9 ++++-----
 arith/gf2/src/gf2x64.rs       |  7 +++----
 arith/gf2/src/gf2x8.rs        |  7 +++----
 4 files changed, 14 insertions(+), 28 deletions(-)

diff --git a/arith/gf2/src/gf2x128/avx.rs b/arith/gf2/src/gf2x128/avx.rs
index 566c7532..64ad962d 100644
--- a/arith/gf2/src/gf2x128/avx.rs
+++ b/arith/gf2/src/gf2x128/avx.rs
@@ -50,7 +50,7 @@ impl Field for AVXGF2x128 {
     const NAME: &'static str = "Galois Field 2 SIMD 128";
 
-    const SIZE: usize = 16;
+    const SIZE: usize = 128 / 8;
 
     const FIELD_SIZE: usize = 1; // in bits
 
@@ -109,11 +109,10 @@ impl Field for AVXGF2x128 {
 
     #[inline(always)]
     fn exp(&self, exponent: u128) -> Self {
-        if exponent % 2 == 0 {
-            AVXGF2x128::ONE
-        } else {
-            *self
+        if exponent == 0 {
+            return Self::one();
         }
+        *self
     }
 
     #[inline(always)]
@@ -134,16 +133,6 @@ impl Field for AVXGF2x128 {
             }
         }
     }
-
-    #[inline(always)]
-    fn mul_by_5(&self) -> Self {
-        *self
-    }
-
-    #[inline(always)]
-    fn mul_by_6(&self) -> Self {
-        Self::ZERO
-    }
 }
diff --git a/arith/gf2/src/gf2x128/neon.rs b/arith/gf2/src/gf2x128/neon.rs
index c3f7b3be..7a015410 100644
--- a/arith/gf2/src/gf2x128/neon.rs
+++ b/arith/gf2/src/gf2x128/neon.rs
@@ -53,7 +53,7 @@ impl Field for NeonGF2x128 {
 
     const SIZE: usize = 128 / 8;
 
-    const FIELD_SIZE: usize = 128; // in bits
+    const FIELD_SIZE: usize = 1; // in bits
 
     const ZERO: Self = NeonGF2x128 {
         v: unsafe { zeroed() },
@@ -110,11 +110,10 @@ impl Field for NeonGF2x128 {
 
     #[inline(always)]
     fn exp(&self, exponent: u128) -> Self {
-        if exponent % 2 == 0 {
-            NeonGF2x128::ONE
-        } else {
-            *self
+        if exponent == 0 {
+            return Self::one();
         }
+        *self
     }
 
     #[inline(always)]
diff --git a/arith/gf2/src/gf2x64.rs b/arith/gf2/src/gf2x64.rs
index a03b594c..681ab92f 100644
--- a/arith/gf2/src/gf2x64.rs
+++ b/arith/gf2/src/gf2x64.rs
@@ -73,11 +73,10 @@ impl Field for GF2x64 {
 
     #[inline(always)]
     fn exp(&self, exponent: u128) -> Self {
-        if exponent % 2 == 0 {
-            Self::one()
-        } else {
-            *self
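+        // In GF(2), x * x = x for every element, so x^k = x for all k >= 1
+        // and only x^0 = 1; branching on exponent parity was incorrect.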
+        if exponent == 0 {
+            return Self::one();
         }
+        *self
     }
 
     #[inline(always)]
diff --git a/arith/gf2/src/gf2x8.rs b/arith/gf2/src/gf2x8.rs
index 56ad8064..6dcda1d3 100644
--- a/arith/gf2/src/gf2x8.rs
+++ b/arith/gf2/src/gf2x8.rs
@@ -81,11 +81,10 @@ impl Field for GF2x8 {
 
     #[inline(always)]
     fn exp(&self, exponent: u128) -> Self {
-        if exponent % 2 == 0 {
-            Self::one()
-        } else {
-            *self
+        if exponent == 0 {
+            return Self::one();
         }
+        *self
     }
 
     #[inline(always)]

From 32895d0040383013734756c86197495b818a785a Mon Sep 17 00:00:00 2001
From: Hang Su
Date: Mon, 11 Nov 2024 17:02:00 -0500
Subject: [PATCH 6/6] random message fixing

---
 gkr/src/main.rs     | 2 +-
 gkr/src/main_mpi.rs | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gkr/src/main.rs b/gkr/src/main.rs
index a79f5b28..64a6cc32 100644
--- a/gkr/src/main.rs
+++ b/gkr/src/main.rs
@@ -168,7 +168,7 @@ fn run_benchmark(args: &Args, config: Config) {
         })
         .collect::<Vec<_>>();
 
-    println!("We are now calculating average throughput, please wait for 1 minutes");
+    println!("We are now calculating average throughput, please wait for 5 seconds");
     for i in 0..args.repeats {
         thread::sleep(std::time::Duration::from_secs(5));
         let stop_time = std::time::Instant::now();
diff --git a/gkr/src/main_mpi.rs b/gkr/src/main_mpi.rs
index b0302a2d..57730eb1 100644
--- a/gkr/src/main_mpi.rs
+++ b/gkr/src/main_mpi.rs
@@ -125,7 +125,7 @@ fn run_benchmark(args: &Args, config: Config) {
 
     const N_PROOF: usize = 1000;
 
-    println!("We are now calculating average throughput, please wait for 1 minutes");
+    println!("We are now calculating average throughput, please wait until {N_PROOF} proofs are computed");
     for i in 0..args.repeats {
         config.mpi_config.barrier(); // wait until everyone is here
         let start_time = std::time::Instant::now();