Skip to content

Commit

Permalink
benching
Browse files Browse the repository at this point in the history
  • Loading branch information
Vitalii committed Jan 15, 2024
1 parent ada2ea2 commit b2ec8cf
Show file tree
Hide file tree
Showing 9 changed files with 274 additions and 7 deletions.
62 changes: 62 additions & 0 deletions ntt/kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,68 @@ void coalesced_store(fr_t* inout, index_t idx, const fr_t r[z_count],
inout[idx] = r[z];
}

__global__ void bench_mul_kernel(fr_t a, fr_t b, fr_t *r, size_t n, size_t samples)
{
#ifdef __CUDA_ARCH__
// S f1 = group_gen;
// S f2 = f1 * group_gen_inverse;

int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < n)
{
// int scalar_id = tid % n_scalars;
// element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid];

fr_t t;

for (int s2 = 0; s2 < samples; s2++)
{
t = t * b;

}

t = a * t;

if (tid == 0)
{
*r = t;
}
}
#endif
}

__launch_bounds__(1024) __global__
void bench_add_kernel(fr_t a, fr_t b, fr_t *r, size_t n, size_t samples)
{
#ifdef __CUDA_ARCH__
// S f1 = group_gen;
// S f2 = f1 * group_gen_inverse;

int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < n)
{
// int scalar_id = tid % n_scalars;
// element_vec[tid] = scalar_vec[scalar_id] * element_vec[tid];

fr_t t;
// for (int s1 = 0; s1 < samples; s1++)
// {
for (int s2 = 0; s2 < samples; s2++)
{
t = t + b;
}
// }

t = a + t;

if (tid == 0)
{
*r = t;
}
}
#endif
}

#if defined(FEATURE_BABY_BEAR) || defined(FEATURE_GOLDILOCKS)
const static int Z_COUNT = 256/8/sizeof(fr_t);
# include "kernels/gs_mixed_radix_narrow.cu"
Expand Down
38 changes: 38 additions & 0 deletions ntt/ntt.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -291,5 +291,43 @@ public:
}
};

extern "C" int bench_fr_add_cuda(size_t device_id, size_t samples, size_t blocks, size_t threads)
{
fr_t f1 = forward_roots_of_unity[6]; // TODO: any value, random
fr_t f2 = forward_roots_of_unity[7];

fr_t h_answer;
fr_t *d_answer;
cudaMalloc(&d_answer, sizeof(fr_t));
CUDA_OK(cudaGetLastError());

bench_add_kernel<<<blocks, threads>>>(f1, f2, d_answer, (size_t)(blocks * threads), samples);
CUDA_OK(cudaGetLastError());

cudaDeviceSynchronize();

cudaMemcpy(&h_answer, d_answer, sizeof(fr_t), cudaMemcpyDeviceToHost);
cudaFree(d_answer);
return 0;
}

extern "C" int bench_fr_mul_cuda(size_t device_id, size_t samples, size_t blocks, size_t threads)
{
fr_t f1 = forward_roots_of_unity[6]; // TODO: any value, random
fr_t f2 = forward_roots_of_unity[7];

fr_t h_answer;
fr_t *d_answer;
cudaMalloc(&d_answer, sizeof(fr_t));

bench_mul_kernel<<<blocks, threads>>>(f1, f2, d_answer, (size_t)(blocks * threads), samples);

CUDA_OK(cudaGetLastError());
cudaDeviceSynchronize();

cudaMemcpy(&h_answer, d_answer, sizeof(fr_t), cudaMemcpyDeviceToHost);
cudaFree(d_answer);
return 0;
}
#endif
#endif
2 changes: 1 addition & 1 deletion ntt/parameters.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# elif defined(FEATURE_BABY_BEAR)
# define MAX_LG_DOMAIN_SIZE 27
# else
# define MAX_LG_DOMAIN_SIZE 28 // tested only up to 2^31 for now
# define MAX_LG_DOMAIN_SIZE 30 // tested only up to 2^31 for now
# endif
#endif

Expand Down
1 change: 1 addition & 0 deletions poc/msm-cuda/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ ark-ec = { version = "0.3.0", features = [ "parallel" ] }
ark-bls12-381 = { version = "0.3.0", optional = true }
ark-bls12-377 = { version = "0.3.0", optional = true }
ark-bn254 = { version = "0.3.0", optional = true }
rayon="*"

[build-dependencies]
cc = "^1.0.70"
Expand Down
24 changes: 20 additions & 4 deletions poc/msm-cuda/src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,20 @@ use rand_chacha::ChaCha20Rng;
use ark_ec::{AffineCurve, ProjectiveCurve};
use ark_std::UniformRand;

use rayon::prelude::*;

pub fn generate_points_scalars<G: AffineCurve>(
len: usize,
) -> (Vec<G>, Vec<G::ScalarField>) {
generate_points_scalars_cond(len, true)
}

pub fn generate_points_scalars_cond<G: AffineCurve>(
len: usize,
is_with_p: bool,
) -> (Vec<G>, Vec<G::ScalarField>) {
let scalars_len = len;
let mut len = if is_with_p { len } else { 3 };
let rand_gen: usize = std::cmp::min(1usize << 11, len);
let mut rng = ChaCha20Rng::from_entropy();

Expand All @@ -21,18 +32,23 @@ pub fn generate_points_scalars<G: AffineCurve>(
.collect::<Vec<_>>(),
);
// Sprinkle in some infinity points
if len > 2 {
if len > 3 {
points[3] = G::zero();
}
let scalars = (0..len)
.map(|_| G::ScalarField::rand(&mut rng))
.collect::<Vec<_>>();

while points.len() < len {
points.append(&mut points.clone());
}

points.truncate(len);

let scalars = (0..scalars_len)
.into_par_iter()
.map(|_| {
let mut rng = ChaCha20Rng::from_entropy();
G::ScalarField::rand(&mut rng)
})
.collect::<Vec<_>>();

(points, scalars)
}
14 changes: 12 additions & 2 deletions poc/ntt-cuda/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,22 @@ version = "0.1.0"
edition = "2021"
publish = false


# [lib]
# crate-type = ["rlib", "staticlib"]

[features]
# By default, compile with ADX extension if the host supports it.
# Binary can be executed on systems similar to the host.
default = []
default = ["bls12_381"]
# Compile in portable mode, without ISA extensions.
# Binary can be executed on all systems.
portable = [ "blst/portable" ]
# Enable ADX even if the host CPU doesn't support it.
# Binary can be executed on Broadwell+ and Ryzen+ systems.
force-adx = [ "blst/force-adx" ]
bls12_377 = []
bls12_381 = []
bls12_381 = ["msm-cuda/bls12_381"]
pallas = [ "semolina" ]
vesta = [ "semolina" ]
bn254 = []
Expand All @@ -27,6 +31,7 @@ quiet = []
blst = "~0.3.11"
semolina = { version = "~0.1.2", optional = true }
sppark = { path = "../../rust" }
msm-cuda = { path = "../msm-cuda", features=["bls12_381"] }

[build-dependencies]
cc = "^1.0.70"
Expand All @@ -41,3 +46,8 @@ ark-bls12-377 = { version = "0.3.0" }
ark-pallas = { version = "0.3.0" }
ark-vesta = { version = "0.3.0" }
ark-bn254 = { version = "0.3.0" }
criterion = { version = "0.3", features = [ "html_reports" ] }

[[bench]]
name = "ntt"
harness = false
74 changes: 74 additions & 0 deletions poc/ntt-cuda/benches/ntt.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

use criterion::{criterion_group, criterion_main, Criterion};

#[cfg(feature = "bls12_377")]
use ark_bls12_377::{G1Affine, G2Affine};
#[cfg(feature = "bls12_381")]
use ark_bls12_381::{G1Affine, G2Affine};
#[cfg(feature = "bn254")]
use ark_bn254::G1Affine;
use ark_ff::BigInteger256;

use std::str::FromStr;

use ntt_cuda::*;
use sppark::*;

use msm_cuda::util;

fn criterion_benchmark(c: &mut Criterion) {
let bench_npow = std::env::var("BENCH_NPOW").unwrap_or("28".to_string());
let npoints_npow = i32::from_str(&bench_npow).unwrap();

let mut group = c.benchmark_group("CUDA");
group.sample_size(20);

let name = format!("2**{}", npoints_npow);
group.bench_function(name, |b| {
let (mut points, mut scalars) =
util::generate_points_scalars_cond::<G1Affine>(1usize << npoints_npow, false);

b.iter(|| {
// let domain_size = 1usize << lg_domain_size;

// let domain = D::new(domain_size).unwrap();

// let mut v = vec![];
// for _ in 0..domain_size {
// v.push(T::rand(rng));
// }

// v.resize(domain.size(), T::zero());
// let mut vtest = v.clone();

// domain.fft_in_place(&mut v);
ntt_cuda::NTT(0, &mut scalars.as_mut_slice(), NTTInputOutputOrder::RN);
// assert!(vtest == v);

// domain.ifft_in_place(&mut v);
// ntt_cuda::iNTT(DEFAULT_GPU, &mut vtest, NTTInputOutputOrder::NN);
// assert!(vtest == v);

// ntt_cuda::NTT(DEFAULT_GPU, &mut vtest, NTTInputOutputOrder::NR);
// ntt_cuda::iNTT(DEFAULT_GPU, &mut vtest, NTTInputOutputOrder::RN);
// assert!(vtest == v);

// domain.coset_fft_in_place(&mut v);
// ntt_cuda::coset_NTT(DEFAULT_GPU, &mut vtest, NTTInputOutputOrder::NN);
// assert!(vtest == v);

// domain.coset_ifft_in_place(&mut v);
// ntt_cuda::coset_iNTT(DEFAULT_GPU, &mut vtest, NTTInputOutputOrder::NN);
// assert!(vtest == v);
})
});

group.finish();
}

criterion_group!(benches, criterion_benchmark);

criterion_main!(benches);
60 changes: 60 additions & 0 deletions poc/ntt-cuda/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,63 @@ pub fn coset_iNTT<T>(
panic!("{}", String::from(err));
}
}

use std::time::Instant;

extern "C" {
fn bench_fr_add_cuda(device_id: usize, samples: usize, blocks: usize, threads: usize) -> i32;
fn bench_fr_sub_cuda(device_id: usize, samples: usize, blocks: usize, threads: usize) -> i32;
fn bench_fr_mul_cuda(device_id: usize, samples: usize, blocks: usize, threads: usize) -> i32;
}

pub fn bench_add_fr(samples: usize, blocks: usize, threads: usize) {
unsafe {
bench_fr_add_cuda(0, samples, blocks, threads);
}
}

pub fn bench_sub_fr(samples: usize, blocks: usize, threads: usize) {
unsafe {
bench_fr_sub_cuda(0, samples, blocks, threads);
}
}

pub fn bench_mul_fr(samples: usize, blocks: usize, threads: usize) {
unsafe {
bench_fr_mul_cuda(0, samples, blocks, threads);
}
}

pub fn arith_run() {
use std::str::FromStr;
let bench_npow = std::env::var("ARITH_BENCH_NPOW").unwrap_or("6".to_string());
let npoints_npow = usize::from_str(&bench_npow).unwrap();

for blocks in [128, 256, 1024] {
for threads in [128, 256, 1024] {
for lg_domain_size in 2..=npoints_npow {
let domain_size = 10_usize.pow(lg_domain_size as u32) as usize;
let count = domain_size * blocks * threads;
let name = format!("FR ADD 10**{}*{}*{}", lg_domain_size, blocks, threads);
let start = Instant::now();
bench_add_fr(domain_size, blocks, threads);
let elapsed = start.elapsed();
println!(
"{} = {:?} o/us",
name,
(count as f32) / elapsed.as_micros() as f32,
);

let name = format!("FR MUL 10**{}*{}*{}", lg_domain_size, blocks, threads);
let start = Instant::now();
bench_mul_fr(domain_size, blocks, threads);
let elapsed = start.elapsed();
println!(
"{} = {:?} o/us",
name,
(count as f32) / elapsed.as_micros() as f32,
);
}
}
}
}
6 changes: 6 additions & 0 deletions poc/ntt-cuda/tests/ntt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,3 +150,9 @@ fn test_against_arkworks() {

test_ntt::<Fr, Fr, _, GeneralEvaluationDomain<Fr>>(rng);
}

#[test]
fn test_arith() {
use ntt_cuda::arith_run;
arith_run();
}

0 comments on commit b2ec8cf

Please sign in to comment.