benching

ingonyama-zk · Jan 15, 2024 · b2ec8cf · b2ec8cf
1 parent ada2ea2
commit b2ec8cf
Show file tree

Hide file tree

Showing 9 changed files with 274 additions and 7 deletions.
diff --git a/ntt/kernels.cu b/ntt/kernels.cu
@@ -313,6 +313,68 @@ void coalesced_store(fr_t* inout, index_t idx, const fr_t r[z_count],
         inout[idx] = r[z];
 }
 
+__global__ void bench_mul_kernel(fr_t a, fr_t b, fr_t *r, size_t n, size_t samples)
+{
+#ifdef __CUDA_ARCH__
+  // S f1 = group_gen;
+  // S f2 = f1 * group_gen_inverse;
+
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
+  if (tid < n)
+  {      
+    // int scalar_id = tid % n_scalars;
+    // element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid];
+
+    fr_t t;
+
+    for (int s2 = 0; s2 < samples; s2++)
+    {
+      t = t * b;
+
+    }
+
+    t = a * t;
+
+    if (tid == 0)
+    {
+      *r = t;
+    }
+  }
+  #endif
+}
+
+__launch_bounds__(1024) __global__
+void bench_add_kernel(fr_t a, fr_t b, fr_t *r, size_t n, size_t samples)
+{
+    #ifdef __CUDA_ARCH__
+  // S f1 = group_gen;
+  // S f2 = f1 * group_gen_inverse;
+
+  int tid = blockDim.x * blockIdx.x + threadIdx.x;
+  if (tid < n)
+  {
+    // int scalar_id = tid % n_scalars;
+    // element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid];
+
+    fr_t t;
+    // for (int s1 = 0; s1 < samples; s1++)
+    // {
+    for (int s2 = 0; s2 < samples; s2++)
+    {
+      t = t + b;
+    }
+    // }
+
+    t = a + t;
+
+    if (tid == 0)
+    {
+      *r = t;
+    }
+  }
+  #endif
+}
+
 #if defined(FEATURE_BABY_BEAR) || defined(FEATURE_GOLDILOCKS)
 const static int Z_COUNT = 256/8/sizeof(fr_t);
 # include "kernels/gs_mixed_radix_narrow.cu"

diff --git a/ntt/ntt.cuh b/ntt/ntt.cuh
@@ -291,5 +291,43 @@ public:
     }
 };
 
+extern "C" int bench_fr_add_cuda(size_t device_id, size_t samples, size_t blocks, size_t threads)
+{
+    fr_t f1 = forward_roots_of_unity[6]; // TODO: any value, random
+    fr_t f2 = forward_roots_of_unity[7];
+
+    fr_t h_answer;
+    fr_t *d_answer;
+    cudaMalloc(&d_answer, sizeof(fr_t));
+    CUDA_OK(cudaGetLastError());
+
+    bench_add_kernel<<<blocks, threads>>>(f1, f2, d_answer, (size_t)(blocks * threads), samples);
+    CUDA_OK(cudaGetLastError());
+
+    cudaDeviceSynchronize();
+
+    cudaMemcpy(&h_answer, d_answer, sizeof(fr_t), cudaMemcpyDeviceToHost);
+    cudaFree(d_answer);
+    return 0;
+}
+
+extern "C" int bench_fr_mul_cuda(size_t device_id, size_t samples, size_t blocks, size_t threads)
+{
+    fr_t f1 = forward_roots_of_unity[6]; // TODO: any value, random
+    fr_t f2 = forward_roots_of_unity[7];
+
+    fr_t h_answer;
+    fr_t *d_answer;
+    cudaMalloc(&d_answer, sizeof(fr_t));
+
+    bench_mul_kernel<<<blocks, threads>>>(f1, f2, d_answer, (size_t)(blocks * threads), samples);
+
+    CUDA_OK(cudaGetLastError());
+    cudaDeviceSynchronize();
+
+    cudaMemcpy(&h_answer, d_answer, sizeof(fr_t), cudaMemcpyDeviceToHost);
+    cudaFree(d_answer);
+    return 0;
+}
 #endif
 #endif
diff --git a/ntt/parameters.cuh b/ntt/parameters.cuh
@@ -14,7 +14,7 @@
 # elif defined(FEATURE_BABY_BEAR)
 #  define MAX_LG_DOMAIN_SIZE 27
 # else
-#  define MAX_LG_DOMAIN_SIZE 28 // tested only up to 2^31 for now
+#  define MAX_LG_DOMAIN_SIZE 30 // tested only up to 2^31 for now
 # endif
 #endif
 

diff --git a/poc/msm-cuda/Cargo.toml b/poc/msm-cuda/Cargo.toml
@@ -31,6 +31,7 @@ ark-ec = { version = "0.3.0", features = [ "parallel" ] }
 ark-bls12-381 = { version = "0.3.0", optional = true }
 ark-bls12-377 = { version = "0.3.0", optional = true }
 ark-bn254 = { version = "0.3.0", optional = true }
+rayon="*"
 
 [build-dependencies]
 cc = "^1.0.70"

diff --git a/poc/msm-cuda/src/util.rs b/poc/msm-cuda/src/util.rs
@@ -8,9 +8,20 @@ use rand_chacha::ChaCha20Rng;
 use ark_ec::{AffineCurve, ProjectiveCurve};
 use ark_std::UniformRand;
 
+use rayon::prelude::*;
+
 pub fn generate_points_scalars<G: AffineCurve>(
     len: usize,
 ) -> (Vec<G>, Vec<G::ScalarField>) {
+    generate_points_scalars_cond(len, true)
+}
+
+pub fn generate_points_scalars_cond<G: AffineCurve>(
+    len: usize,
+    is_with_p: bool,
+) -> (Vec<G>, Vec<G::ScalarField>) {
+    let scalars_len = len;
+    let mut len = if is_with_p { len } else { 3 };
     let rand_gen: usize = std::cmp::min(1usize << 11, len);
     let mut rng = ChaCha20Rng::from_entropy();
 
@@ -21,18 +32,23 @@ pub fn generate_points_scalars<G: AffineCurve>(
                 .collect::<Vec<_>>(),
         );
     // Sprinkle in some infinity points
-    if len > 2 {
+    if len > 3 {
         points[3] = G::zero();
     }
-    let scalars = (0..len)
-        .map(|_| G::ScalarField::rand(&mut rng))
-        .collect::<Vec<_>>();
 
     while points.len() < len {
         points.append(&mut points.clone());
     }
 
     points.truncate(len);
 
+    let scalars = (0..scalars_len)
+        .into_par_iter()
+        .map(|_| {
+            let mut rng = ChaCha20Rng::from_entropy();
+            G::ScalarField::rand(&mut rng)
+        })
+        .collect::<Vec<_>>();
+
     (points, scalars)
 }
diff --git a/poc/ntt-cuda/Cargo.toml b/poc/ntt-cuda/Cargo.toml
@@ -4,18 +4,22 @@ version = "0.1.0"
 edition = "2021"
 publish = false
 
+
+# [lib]
+# crate-type = ["rlib", "staticlib"]
+
 [features]
 # By default, compile with ADX extension if the host supports it.
 # Binary can be executed on systems similar to the host.
-default = []
+default = ["bls12_381"]
 # Compile in portable mode, without ISA extensions.
 # Binary can be executed on all systems.
 portable = [ "blst/portable" ]
 # Enable ADX even if the host CPU doesn't support it.
 # Binary can be executed on Broadwell+ and Ryzen+ systems.
 force-adx = [ "blst/force-adx" ]
 bls12_377 = []
-bls12_381 = []
+bls12_381 = ["msm-cuda/bls12_381"]
 pallas = [ "semolina" ]
 vesta = [ "semolina" ]
 bn254 = []
@@ -27,6 +31,7 @@ quiet = []
 blst = "~0.3.11"
 semolina = { version = "~0.1.2", optional = true }
 sppark = { path = "../../rust" }
+msm-cuda = { path = "../msm-cuda", features=["bls12_381"] }
 
 [build-dependencies]
 cc = "^1.0.70"
@@ -41,3 +46,8 @@ ark-bls12-377 = { version = "0.3.0" }
 ark-pallas = { version = "0.3.0" }
 ark-vesta = { version = "0.3.0" }
 ark-bn254 = { version = "0.3.0" }
+criterion = { version = "0.3", features = [ "html_reports" ] }
+
+[[bench]]
+name = "ntt"
+harness = false
diff --git a/poc/ntt-cuda/benches/ntt.rs b/poc/ntt-cuda/benches/ntt.rs
@@ -0,0 +1,74 @@
+// Copyright Supranational LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+use criterion::{criterion_group, criterion_main, Criterion};
+
+#[cfg(feature = "bls12_377")]
+use ark_bls12_377::{G1Affine, G2Affine};
+#[cfg(feature = "bls12_381")]
+use ark_bls12_381::{G1Affine, G2Affine};
+#[cfg(feature = "bn254")]
+use ark_bn254::G1Affine;
+use ark_ff::BigInteger256;
+
+use std::str::FromStr;
+
+use ntt_cuda::*;
+use sppark::*;
+
+use msm_cuda::util;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let bench_npow = std::env::var("BENCH_NPOW").unwrap_or("28".to_string());
+    let npoints_npow = i32::from_str(&bench_npow).unwrap();
+
+    let mut group = c.benchmark_group("CUDA");
+    group.sample_size(20);
+
+    let name = format!("2**{}", npoints_npow);
+    group.bench_function(name, |b| {
+        let (mut points, mut scalars) =
+        util::generate_points_scalars_cond::<G1Affine>(1usize << npoints_npow, false);
+
+        b.iter(|| {
+            // let domain_size = 1usize << lg_domain_size;
+
+            // let domain = D::new(domain_size).unwrap();
+
+            // let mut v = vec![];
+            // for _ in 0..domain_size {
+            //     v.push(T::rand(rng));
+            // }
+
+            // v.resize(domain.size(), T::zero());
+            // let mut vtest = v.clone();
+
+            // domain.fft_in_place(&mut v);
+            ntt_cuda::NTT(0, &mut scalars.as_mut_slice(), NTTInputOutputOrder::RN);
+            // assert!(vtest == v);
+
+            // domain.ifft_in_place(&mut v);
+            // ntt_cuda::iNTT(DEFAULT_GPU, &mut vtest, NTTInputOutputOrder::NN);
+            // assert!(vtest == v);
+
+            // ntt_cuda::NTT(DEFAULT_GPU, &mut vtest, NTTInputOutputOrder::NR);
+            // ntt_cuda::iNTT(DEFAULT_GPU, &mut vtest, NTTInputOutputOrder::RN);
+            // assert!(vtest == v);
+
+            // domain.coset_fft_in_place(&mut v);
+            // ntt_cuda::coset_NTT(DEFAULT_GPU, &mut vtest, NTTInputOutputOrder::NN);
+            // assert!(vtest == v);
+
+            // domain.coset_ifft_in_place(&mut v);
+            // ntt_cuda::coset_iNTT(DEFAULT_GPU, &mut vtest, NTTInputOutputOrder::NN);
+            // assert!(vtest == v);
+        })
+    });
+
+    group.finish();
+}
+
+criterion_group!(benches, criterion_benchmark);
+
+criterion_main!(benches);
diff --git a/poc/ntt-cuda/src/lib.rs b/poc/ntt-cuda/src/lib.rs
@@ -116,3 +116,63 @@ pub fn coset_iNTT<T>(
         panic!("{}", String::from(err));
     }
 }
+
+use std::time::Instant;
+
+extern "C" {    
+    fn bench_fr_add_cuda(device_id: usize, samples: usize, blocks: usize, threads: usize) -> i32;
+    fn bench_fr_sub_cuda(device_id: usize, samples: usize, blocks: usize, threads: usize) -> i32;
+    fn bench_fr_mul_cuda(device_id: usize, samples: usize, blocks: usize, threads: usize) -> i32;
+}
+
+pub fn bench_add_fr(samples: usize, blocks: usize, threads: usize) {
+    unsafe {
+        bench_fr_add_cuda(0, samples, blocks, threads);
+    }
+}
+
+pub fn bench_sub_fr(samples: usize, blocks: usize, threads: usize) {
+    unsafe {
+        bench_fr_sub_cuda(0, samples, blocks, threads);
+    }
+}
+
+pub fn bench_mul_fr(samples: usize, blocks: usize, threads: usize) {
+    unsafe {
+        bench_fr_mul_cuda(0, samples, blocks, threads);
+    }
+}
+
+pub fn arith_run() {
+    use std::str::FromStr;
+    let bench_npow = std::env::var("ARITH_BENCH_NPOW").unwrap_or("6".to_string());
+    let npoints_npow = usize::from_str(&bench_npow).unwrap();
+
+    for blocks in [128, 256, 1024] {
+        for threads in [128, 256, 1024] {
+            for lg_domain_size in 2..=npoints_npow {
+                let domain_size = 10_usize.pow(lg_domain_size as u32) as usize;
+                let count =  domain_size * blocks * threads;
+                let name = format!("FR ADD 10**{}*{}*{}", lg_domain_size, blocks, threads);
+                let start = Instant::now();
+                bench_add_fr(domain_size, blocks, threads);
+                let elapsed = start.elapsed();
+                println!(
+                    "{} = {:?} o/us",
+                    name,
+                    (count as f32) / elapsed.as_micros() as f32,
+                );
+
+                let name = format!("FR MUL 10**{}*{}*{}", lg_domain_size, blocks, threads);
+                let start = Instant::now();
+                bench_mul_fr(domain_size, blocks, threads);
+                let elapsed = start.elapsed();
+                println!(
+                    "{} = {:?} o/us",
+                    name,
+                    (count as f32) / elapsed.as_micros() as f32,
+                );
+            }
+        }
+    }
+}
diff --git a/poc/ntt-cuda/tests/ntt.rs b/poc/ntt-cuda/tests/ntt.rs
@@ -150,3 +150,9 @@ fn test_against_arkworks() {
 
     test_ntt::<Fr, Fr, _, GeneralEvaluationDomain<Fr>>(rng);
 }
+
+#[test]
+fn test_arith() {
+    use ntt_cuda::arith_run;
+    arith_run();
+}