From 7ccbfbbc3a04f601c88f06c441595357a6a2c7a9 Mon Sep 17 00:00:00 2001 From: nonam3e Date: Wed, 26 Jun 2024 19:53:12 +0000 Subject: [PATCH] cleanup --- .github/workflows/cpp_cuda.yml | 2 + icicle/include/api/babybear.h | 69 ++------------- icicle/include/api/bls12_377.h | 43 +-------- icicle/include/api/bls12_381.h | 43 +-------- icicle/include/api/bn254.h | 43 +-------- icicle/include/api/bw6_761.h | 43 +-------- icicle/include/api/grumpkin.h | 35 +------- icicle/include/api/m31.h | 19 ++-- icicle/include/fields/quartic_extension.cuh | 9 +- icicle/include/fields/stark_fields/m31.cuh | 98 ++++++++------------- icicle/src/ntt/thread_ntt.cu | 20 ++--- 11 files changed, 85 insertions(+), 339 deletions(-) diff --git a/.github/workflows/cpp_cuda.yml b/.github/workflows/cpp_cuda.yml index 5fda8fc4a..bb57823af 100644 --- a/.github/workflows/cpp_cuda.yml +++ b/.github/workflows/cpp_cuda.yml @@ -73,6 +73,8 @@ jobs: build_args: -DEXT_FIELD=ON - name: stark252 build_args: -DEXT_FIELD=OFF + - name: m31 + build_args: -DEXT_FIELD=ON steps: - name: Checkout Repo uses: actions/checkout@v4 diff --git a/icicle/include/api/babybear.h b/icicle/include/api/babybear.h index 717796fea..9eca7f81a 100644 --- a/icicle/include/api/babybear.h +++ b/icicle/include/api/babybear.h @@ -38,6 +38,9 @@ extern "C" cudaError_t babybear_extension_mul_cuda( extern "C" cudaError_t babybear_extension_add_cuda( babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result); +extern "C" cudaError_t babybear_extension_accumulate_cuda( + babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config); + extern "C" cudaError_t babybear_extension_sub_cuda( babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result); @@ -61,6 +64,9 @@ extern "C" cudaError_t babybear_mul_cuda( extern "C" cudaError_t babybear_add_cuda( babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result); +extern "C" cudaError_t babybear_accumulate_cuda( + babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); + extern "C" cudaError_t babybear_sub_cuda( babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result); @@ -104,67 +110,4 @@ extern "C" cudaError_t babybear_release_poseidon2_constants_cuda( poseidon2::Poseidon2Constants* constants, device_context::DeviceContext& ctx); -extern "C" cudaError_t babybear_mul_cuda( - babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result); - -extern "C" cudaError_t babybear_add_cuda( - babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result); - -extern "C" cudaError_t babybear_accumulate_cuda( - babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); - -extern "C" cudaError_t babybear_sub_cuda( - babybear::scalar_t* vec_a, babybear::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::scalar_t* result); - -extern "C" cudaError_t babybear_transpose_matrix_cuda( - const babybear::scalar_t* input, - uint32_t row_size, - uint32_t column_size, - babybear::scalar_t* output, - device_context::DeviceContext& ctx, - bool on_device, - bool is_async); - -extern "C" cudaError_t babybear_bit_reverse_cuda( - const babybear::scalar_t* input, - uint64_t n, - vec_ops::BitReverseConfig& config, - babybear::scalar_t* output); - -extern "C" void babybear_generate_scalars(babybear::scalar_t* scalars, int size); - -extern "C" cudaError_t babybear_scalar_convert_montgomery( - babybear::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - -extern "C" cudaError_t babybear_initialize_domain( - babybear::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode); - -extern "C" cudaError_t babybear_ntt_cuda( - const babybear::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, babybear::scalar_t* output); - -extern "C" cudaError_t babybear_release_domain(device_context::DeviceContext& ctx); - -extern "C" void babybear_extension_generate_scalars(babybear::extension_t* scalars, int size); - -extern "C" cudaError_t babybear_extension_scalar_convert_montgomery( - babybear::extension_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - -extern "C" cudaError_t babybear_extension_mul_cuda( - babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result); - -extern "C" cudaError_t babybear_extension_add_cuda( - babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result); - -extern "C" cudaError_t babybear_extension_sub_cuda( - babybear::extension_t* vec_a, babybear::extension_t* vec_b, int n, vec_ops::VecOpsConfig& config, babybear::extension_t* result); - -extern "C" cudaError_t babybear_extension_transpose_matrix_cuda( - const babybear::extension_t* input, - uint32_t row_size, - uint32_t column_size, - babybear::extension_t* output, - device_context::DeviceContext& ctx, - bool on_device, - bool is_async); - #endif \ No newline at end of file diff --git a/icicle/include/api/bls12_377.h b/icicle/include/api/bls12_377.h index d7b2769d1..ab6b40c5e 100644 --- a/icicle/include/api/bls12_377.h +++ b/icicle/include/api/bls12_377.h @@ -84,6 +84,9 @@ extern "C" cudaError_t bls12_377_mul_cuda( extern "C" cudaError_t bls12_377_add_cuda( bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result); +extern "C" cudaError_t bls12_377_accumulate_cuda( + bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); + extern "C" cudaError_t bls12_377_sub_cuda( bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result); @@ -123,44 +126,4 @@ extern "C" cudaError_t bls12_377_build_poseidon_merkle_tree( poseidon::PoseidonConstants& constants, merkle::TreeBuilderConfig& config); -extern "C" cudaError_t bls12_377_mul_cuda( - bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result); - -extern "C" cudaError_t bls12_377_add_cuda( - bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result); - -extern "C" cudaError_t bls12_377_accumulate_cuda( - bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); - -extern "C" cudaError_t bls12_377_sub_cuda( - bls12_377::scalar_t* vec_a, bls12_377::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_377::scalar_t* result); - -extern "C" cudaError_t bls12_377_transpose_matrix_cuda( - const bls12_377::scalar_t* input, - uint32_t row_size, - uint32_t column_size, - bls12_377::scalar_t* output, - device_context::DeviceContext& ctx, - bool on_device, - bool is_async); - -extern "C" cudaError_t bls12_377_bit_reverse_cuda( - const bls12_377::scalar_t* input, - uint64_t n, - vec_ops::BitReverseConfig& config, - bls12_377::scalar_t* output); - -extern "C" void bls12_377_generate_scalars(bls12_377::scalar_t* scalars, int size); - -extern "C" cudaError_t bls12_377_scalar_convert_montgomery( - bls12_377::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - -extern "C" cudaError_t bls12_377_initialize_domain( - bls12_377::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode); - -extern "C" cudaError_t bls12_377_ntt_cuda( - const bls12_377::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, bls12_377::scalar_t* output); - -extern "C" cudaError_t bls12_377_release_domain(device_context::DeviceContext& ctx); - #endif \ No newline at end of file diff --git a/icicle/include/api/bls12_381.h b/icicle/include/api/bls12_381.h index d1fb13f36..246432b1e 100644 --- a/icicle/include/api/bls12_381.h +++ b/icicle/include/api/bls12_381.h @@ -84,6 +84,9 @@ extern "C" cudaError_t bls12_381_mul_cuda( extern "C" cudaError_t bls12_381_add_cuda( bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result); +extern "C" cudaError_t bls12_381_accumulate_cuda( + bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); + extern "C" cudaError_t bls12_381_sub_cuda( bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result); @@ -123,44 +126,4 @@ extern "C" cudaError_t bls12_381_build_poseidon_merkle_tree( poseidon::PoseidonConstants& constants, merkle::TreeBuilderConfig& config); -extern "C" cudaError_t bls12_381_mul_cuda( - bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result); - -extern "C" cudaError_t bls12_381_add_cuda( - bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result); - -extern "C" cudaError_t bls12_381_accumulate_cuda( - bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); - -extern "C" cudaError_t bls12_381_sub_cuda( - bls12_381::scalar_t* vec_a, bls12_381::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bls12_381::scalar_t* result); - -extern "C" cudaError_t bls12_381_transpose_matrix_cuda( - const bls12_381::scalar_t* input, - uint32_t row_size, - uint32_t column_size, - bls12_381::scalar_t* output, - device_context::DeviceContext& ctx, - bool on_device, - bool is_async); - -extern "C" cudaError_t bls12_381_bit_reverse_cuda( - const bls12_381::scalar_t* input, - uint64_t n, - vec_ops::BitReverseConfig& config, - bls12_381::scalar_t* output); - -extern "C" void bls12_381_generate_scalars(bls12_381::scalar_t* scalars, int size); - -extern "C" cudaError_t bls12_381_scalar_convert_montgomery( - bls12_381::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - -extern "C" cudaError_t bls12_381_initialize_domain( - bls12_381::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode); - -extern "C" cudaError_t bls12_381_ntt_cuda( - const bls12_381::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, bls12_381::scalar_t* output); - -extern "C" cudaError_t bls12_381_release_domain(device_context::DeviceContext& ctx); - #endif \ No newline at end of file diff --git a/icicle/include/api/bn254.h b/icicle/include/api/bn254.h index 29ac075a7..6a6d6a9f4 100644 --- a/icicle/include/api/bn254.h +++ b/icicle/include/api/bn254.h @@ -85,6 +85,9 @@ extern "C" cudaError_t bn254_mul_cuda( extern "C" cudaError_t bn254_add_cuda( bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result); +extern "C" cudaError_t bn254_accumulate_cuda( + bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); + extern "C" cudaError_t bn254_sub_cuda( bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result); @@ -155,44 +158,4 @@ extern "C" cudaError_t bn254_build_poseidon_merkle_tree( poseidon::PoseidonConstants& constants, merkle::TreeBuilderConfig& config); -extern "C" cudaError_t bn254_mul_cuda( - bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result); - -extern "C" cudaError_t bn254_add_cuda( - bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result); - -extern "C" cudaError_t bn254_accumulate_cuda( - bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); - -extern "C" cudaError_t bn254_sub_cuda( - bn254::scalar_t* vec_a, bn254::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bn254::scalar_t* result); - -extern "C" cudaError_t bn254_transpose_matrix_cuda( - const bn254::scalar_t* input, - uint32_t row_size, - uint32_t column_size, - bn254::scalar_t* output, - device_context::DeviceContext& ctx, - bool on_device, - bool is_async); - -extern "C" cudaError_t bn254_bit_reverse_cuda( - const bn254::scalar_t* input, - uint64_t n, - vec_ops::BitReverseConfig& config, - bn254::scalar_t* output); - -extern "C" void bn254_generate_scalars(bn254::scalar_t* scalars, int size); - -extern "C" cudaError_t bn254_scalar_convert_montgomery( - bn254::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - -extern "C" cudaError_t bn254_initialize_domain( - bn254::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode); - -extern "C" cudaError_t bn254_ntt_cuda( - const bn254::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, bn254::scalar_t* output); - -extern "C" cudaError_t bn254_release_domain(device_context::DeviceContext& ctx); - #endif \ No newline at end of file diff --git a/icicle/include/api/bw6_761.h b/icicle/include/api/bw6_761.h index 662af6fd0..153a9e992 100644 --- a/icicle/include/api/bw6_761.h +++ b/icicle/include/api/bw6_761.h @@ -84,6 +84,9 @@ extern "C" cudaError_t bw6_761_mul_cuda( extern "C" cudaError_t bw6_761_add_cuda( bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result); +extern "C" cudaError_t bw6_761_accumulate_cuda( + bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); + extern "C" cudaError_t bw6_761_sub_cuda( bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result); @@ -123,44 +126,4 @@ extern "C" cudaError_t bw6_761_build_poseidon_merkle_tree( poseidon::PoseidonConstants& constants, merkle::TreeBuilderConfig& config); -extern "C" cudaError_t bw6_761_mul_cuda( - bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result); - -extern "C" cudaError_t bw6_761_add_cuda( - bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result); - -extern "C" cudaError_t bw6_761_accumulate_cuda( - bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); - -extern "C" cudaError_t bw6_761_sub_cuda( - bw6_761::scalar_t* vec_a, bw6_761::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, bw6_761::scalar_t* result); - -extern "C" cudaError_t bw6_761_transpose_matrix_cuda( - const bw6_761::scalar_t* input, - uint32_t row_size, - uint32_t column_size, - bw6_761::scalar_t* output, - device_context::DeviceContext& ctx, - bool on_device, - bool is_async); - -extern "C" cudaError_t bw6_761_bit_reverse_cuda( - const bw6_761::scalar_t* input, - uint64_t n, - vec_ops::BitReverseConfig& config, - bw6_761::scalar_t* output); - -extern "C" void bw6_761_generate_scalars(bw6_761::scalar_t* scalars, int size); - -extern "C" cudaError_t bw6_761_scalar_convert_montgomery( - bw6_761::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - -extern "C" cudaError_t bw6_761_initialize_domain( - bw6_761::scalar_t* primitive_root, device_context::DeviceContext& ctx, bool fast_twiddles_mode); - -extern "C" cudaError_t bw6_761_ntt_cuda( - const bw6_761::scalar_t* input, int size, ntt::NTTDir dir, ntt::NTTConfig& config, bw6_761::scalar_t* output); - -extern "C" cudaError_t bw6_761_release_domain(device_context::DeviceContext& ctx); - #endif \ No newline at end of file diff --git a/icicle/include/api/grumpkin.h b/icicle/include/api/grumpkin.h index fdce66984..8bc755575 100644 --- a/icicle/include/api/grumpkin.h +++ b/icicle/include/api/grumpkin.h @@ -49,6 +49,9 @@ extern "C" cudaError_t grumpkin_mul_cuda( extern "C" cudaError_t grumpkin_add_cuda( grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result); +extern "C" cudaError_t grumpkin_accumulate_cuda( + grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); + extern "C" cudaError_t grumpkin_sub_cuda( grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result); @@ -88,36 +91,4 @@ extern "C" cudaError_t grumpkin_build_poseidon_merkle_tree( poseidon::PoseidonConstants& constants, merkle::TreeBuilderConfig& config); -extern "C" cudaError_t grumpkin_mul_cuda( - grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result); - -extern "C" cudaError_t grumpkin_add_cuda( - grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result); - -extern "C" cudaError_t grumpkin_accumulate_cuda( - grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); - -extern "C" cudaError_t grumpkin_sub_cuda( - grumpkin::scalar_t* vec_a, grumpkin::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, grumpkin::scalar_t* result); - -extern "C" cudaError_t grumpkin_transpose_matrix_cuda( - const grumpkin::scalar_t* input, - uint32_t row_size, - uint32_t column_size, - grumpkin::scalar_t* output, - device_context::DeviceContext& ctx, - bool on_device, - bool is_async); - -extern "C" cudaError_t grumpkin_bit_reverse_cuda( - const grumpkin::scalar_t* input, - uint64_t n, - vec_ops::BitReverseConfig& config, - grumpkin::scalar_t* output); - -extern "C" void grumpkin_generate_scalars(grumpkin::scalar_t* scalars, int size); - -extern "C" cudaError_t grumpkin_scalar_convert_montgomery( - grumpkin::scalar_t* d_inout, size_t n, bool is_into, device_context::DeviceContext& ctx); - #endif \ No newline at end of file diff --git a/icicle/include/api/m31.h b/icicle/include/api/m31.h index 6fd9b7ed6..182300704 100644 --- a/icicle/include/api/m31.h +++ b/icicle/include/api/m31.h @@ -23,16 +23,19 @@ extern "C" cudaError_t m31_mul_cuda( extern "C" cudaError_t m31_add_cuda( m31::scalar_t* vec_a, m31::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::scalar_t* result); +extern "C" cudaError_t m31_accumulate_cuda( + m31::scalar_t* vec_a, m31::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config); + extern "C" cudaError_t m31_sub_cuda( m31::scalar_t* vec_a, m31::scalar_t* vec_b, int n, vec_ops::VecOpsConfig& config, m31::scalar_t* result); -// extern "C" cudaError_t m31_transpose_matrix_cuda( -// const m31::scalar_t* input, -// uint32_t row_size, -// uint32_t column_size, -// m31::scalar_t* output, -// device_context::DeviceContext& ctx, -// bool on_device, -// bool is_async); +extern "C" cudaError_t m31_transpose_matrix_cuda( + const m31::scalar_t* input, + uint32_t row_size, + uint32_t column_size, + m31::scalar_t* output, + device_context::DeviceContext& ctx, + bool on_device, + bool is_async); #endif \ No newline at end of file diff --git a/icicle/include/fields/quartic_extension.cuh b/icicle/include/fields/quartic_extension.cuh index cbe7673ca..8fead58c5 100644 --- a/icicle/include/fields/quartic_extension.cuh +++ b/icicle/include/fields/quartic_extension.cuh @@ -49,17 +49,14 @@ public: static constexpr HOST_DEVICE_INLINE ExtensionField to_montgomery(const ExtensionField& xs) { return ExtensionField{ - FF::to_montgomery(xs.real), FF::to_montgomery(xs.im1), - FF::to_montgomery(xs.im2), FF::to_montgomery(xs.im3) - }; + FF::to_montgomery(xs.real), FF::to_montgomery(xs.im1), FF::to_montgomery(xs.im2), FF::to_montgomery(xs.im3)}; } static constexpr HOST_DEVICE_INLINE ExtensionField from_montgomery(const ExtensionField& xs) { return ExtensionField{ - FF::from_montgomery(xs.real), FF::from_montgomery(xs.im1), - FF::from_montgomery(xs.im2), FF::from_montgomery(xs.im3) - }; + FF::from_montgomery(xs.real), FF::from_montgomery(xs.im1), FF::from_montgomery(xs.im2), + FF::from_montgomery(xs.im3)}; } static HOST_INLINE ExtensionField rand_host() diff --git a/icicle/include/fields/stark_fields/m31.cuh b/icicle/include/fields/stark_fields/m31.cuh index af0d0d8e7..283808b0a 100644 --- a/icicle/include/fields/stark_fields/m31.cuh +++ b/icicle/include/fields/stark_fields/m31.cuh @@ -6,37 +6,27 @@ namespace m31 { template - class MersenneField: public Field { + class MersenneField : public Field + { public: - HOST_DEVICE_INLINE MersenneField(const MersenneField& other) : Field(other) {} HOST_DEVICE_INLINE MersenneField(uint32_t x = 0) : Field({x}) {} HOST_DEVICE_INLINE MersenneField(storage x) : Field{x} {} HOST_DEVICE_INLINE MersenneField(const Field& other) : Field(other) {} - static constexpr HOST_DEVICE_INLINE MersenneField zero() { - return MersenneField(CONFIG::zero.limbs[0]); - } + static constexpr HOST_DEVICE_INLINE MersenneField zero() { return MersenneField(CONFIG::zero.limbs[0]); } - static constexpr HOST_DEVICE_INLINE MersenneField one() { - return MersenneField(CONFIG::one.limbs[0]); - } + static constexpr HOST_DEVICE_INLINE MersenneField one() { return MersenneField(CONFIG::one.limbs[0]); } - static constexpr HOST_DEVICE_INLINE MersenneField from(uint32_t value) { - return MersenneField(value); - } - - static HOST_INLINE MersenneField rand_host() { - return MersenneField(Field::rand_host()); - } + static constexpr HOST_DEVICE_INLINE MersenneField from(uint32_t value) { return MersenneField(value); } + static HOST_INLINE MersenneField rand_host() { return MersenneField(Field::rand_host()); } // Define assignment operator - HOST_DEVICE_INLINE MersenneField& operator=(const Field& other) { - if (this != &other) { - Field::operator=(other); - } - return *this; + HOST_DEVICE_INLINE MersenneField& operator=(const Field& other) + { + if (this != &other) { Field::operator=(other); } + return *this; } // HOST_DEVICE_INLINE MersenneField& operator=(const uint32_t& other) { @@ -44,18 +34,18 @@ namespace m31 { // return *this; // } - HOST_DEVICE_INLINE uint32_t get_limb() const { - return this->limbs_storage.limbs[0]; - } + HOST_DEVICE_INLINE uint32_t get_limb() const { return this->limbs_storage.limbs[0]; } struct Wide { uint32_t storage; - static constexpr HOST_DEVICE_INLINE Wide from_field(const MersenneField& xs) { + static constexpr HOST_DEVICE_INLINE Wide from_field(const MersenneField& xs) + { Wide out{}; out.storage = xs.get_limb(); return out; } - static constexpr HOST_DEVICE_INLINE Wide from_number(const uint32_t& xs) { + static constexpr HOST_DEVICE_INLINE Wide from_number(const uint32_t& xs) + { Wide out{}; out.storage = xs; return out; @@ -72,10 +62,7 @@ namespace m31 { uint32_t r = ((uint32_t)((tmp >> 32)) << 1) + (uint32_t)(tmp); return from_number(r); } - friend HOST_DEVICE_INLINE Wide operator-(Wide xs, const Wide& ys) - { - return xs + neg(ys); - } + friend HOST_DEVICE_INLINE Wide operator-(Wide xs, const Wide& ys) { return xs + neg(ys); } friend HOST_DEVICE_INLINE Wide operator*(Wide xs, const Wide& ys) { uint64_t t1 = (uint64_t)xs.storage * ys.storage; @@ -97,12 +84,9 @@ namespace m31 { static constexpr HOST_DEVICE_INLINE uint32_t reduce_limbs(uint32_t t) { uint32_t m = MersenneField::get_modulus().limbs[0]; - if (t > m) - t = (t & m) + (t >> CONFIG::modulus_bit_count); - if (t > m) - t = (t & m) + (t >> CONFIG::modulus_bit_count); - if (t == m) - t = 0; + if (t > m) t = (t & m) + (t >> CONFIG::modulus_bit_count); + if (t > m) t = (t & m) + (t >> CONFIG::modulus_bit_count); + if (t == m) t = 0; return t; } @@ -123,7 +107,8 @@ namespace m31 { } template - static constexpr HOST_DEVICE_INLINE MersenneField reduce(Wide xs) { + static constexpr HOST_DEVICE_INLINE MersenneField reduce(Wide xs) + { uint32_t tmp = ((xs.storage >> 31) & 1) + (xs.storage & MersenneField::get_modulus().limbs[0]); return MersenneField(tmp == MersenneField::get_modulus().limbs[0] ? 0 : tmp); } @@ -135,23 +120,18 @@ namespace m31 { static constexpr HOST_DEVICE_INLINE uint32_t inverse_limbs(const uint32_t& xs) { - - if (xs <= 1) - return xs; + if (xs <= 1) return xs; uint32_t a = 1, b = 0, y = xs, z = MersenneField::get_modulus().limbs[0], e, m = z; - while (1) - { + while (1) { #ifdef __CUDA_ARCH__ e = __ffs(y) - 1; #else e = __builtin_ctz(y); #endif y >>= e; - if(a >= m) - a = reduce_limbs(a); + if (a >= m) a = reduce_limbs(a); a = ((a >> e) | (a << (CONFIG::modulus_bit_count - e))) & m; - if(y == 1) - return a; + if (y == 1) return a; e = a + b; b = a; a = e; @@ -168,20 +148,21 @@ namespace m31 { friend HOST_DEVICE_INLINE MersenneField operator+(MersenneField xs, const MersenneField& ys) { - return MersenneField(MersenneField::reduce_limbs(xs.get_limb() + ys.get_limb())); + return MersenneField(MersenneField::reduce_limbs(xs.get_limb() + ys.get_limb())); } friend HOST_DEVICE_INLINE MersenneField operator-(MersenneField xs, const MersenneField& ys) { - return MersenneField(MersenneField::reduce_limbs(xs.get_limb() + neg_limbs(ys.get_limb()))); + return MersenneField(MersenneField::reduce_limbs(xs.get_limb() + neg_limbs(ys.get_limb()))); } friend HOST_DEVICE_INLINE MersenneField operator*(MersenneField xs, const MersenneField& ys) { - return MersenneField(MersenneField::reduce_mul_limbs((uint64_t)(xs.get_limb()) * ys.get_limb())); + return MersenneField(MersenneField::reduce_mul_limbs((uint64_t)(xs.get_limb()) * ys.get_limb())); } - static constexpr HOST_DEVICE_INLINE Wide mul_wide(const MersenneField& xs, const MersenneField& ys) { + static constexpr HOST_DEVICE_INLINE Wide mul_wide(const MersenneField& xs, const MersenneField& ys) + { return Wide::from_field(xs) * Wide::from_field(ys); } @@ -192,13 +173,13 @@ namespace m31 { return mul_wide(xs, xs); } - static constexpr HOST_DEVICE_INLINE MersenneField sqr(const MersenneField& xs) + static constexpr HOST_DEVICE_INLINE MersenneField sqr(const MersenneField& xs) { return xs * xs; } + + static constexpr HOST_DEVICE_INLINE MersenneField to_montgomery(const MersenneField& xs) { - return xs * xs; + return xs * MersenneField{CONFIG::montgomery_r}; } - static constexpr HOST_DEVICE_INLINE MersenneField to_montgomery(const MersenneField& xs) { return xs * MersenneField{CONFIG::montgomery_r}; } - static constexpr HOST_DEVICE_INLINE MersenneField from_montgomery(const MersenneField& xs) { return xs * MersenneField{CONFIG::montgomery_r_inv}; @@ -236,14 +217,11 @@ namespace m31 { static constexpr storage montgomery_r = {0x00000001}; static constexpr storage montgomery_r_inv = {0x00000001}; - static constexpr storage_array omega = { - {{0x7ffffffe}}}; + static constexpr storage_array omega = {{{0x7ffffffe}}}; - static constexpr storage_array omega_inv = { - {{0x7ffffffe}}}; + static constexpr storage_array omega_inv = {{{0x7ffffffe}}}; - static constexpr storage_array inv = { - {{0x40000000}}}; + static constexpr storage_array inv = {{{0x40000000}}}; // nonresidue to generate the extension field static constexpr uint32_t nonresidue = 11; @@ -260,4 +238,4 @@ namespace m31 { * Extension field of `scalar_t` enabled if `-DEXT_FIELD` env variable is. */ typedef ExtensionField extension_t; -} // namespace babybear +} // namespace m31 diff --git a/icicle/src/ntt/thread_ntt.cu b/icicle/src/ntt/thread_ntt.cu index 8fd2764df..19321a81b 100644 --- a/icicle/src/ntt/thread_ntt.cu +++ b/icicle/src/ntt/thread_ntt.cu @@ -17,11 +17,11 @@ struct stage_metadata { #define STAGE_SIZES_DATA \ { \ {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0}, \ - {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \ - {6, 6, 0, 0, 0}, {4, 5, 4, 0, 0}, {4, 6, 4, 0, 0}, {5, 5, 5, 0, 0}, {6, 4, 6, 0, 0}, {6, 5, 6, 0, 0}, \ - {6, 6, 6, 0, 0}, {6, 5, 4, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \ - {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 4, 5, 6}, {6, 5, 5, 5, 6}, {6, 5, 6, 5, 6}, {6, 6, 5, 6, 6}, \ - {6, 6, 6, 6, 6}, \ + {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \ + {6, 6, 0, 0, 0}, {4, 5, 4, 0, 0}, {4, 6, 4, 0, 0}, {5, 5, 5, 0, 0}, {6, 4, 6, 0, 0}, {6, 5, 6, 0, 0}, \ + {6, 6, 6, 0, 0}, {6, 5, 4, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \ + {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 4, 5, 6}, {6, 5, 5, 5, 6}, {6, 5, 6, 5, 6}, {6, 6, 5, 6, 6}, \ + {6, 6, 6, 6, 6}, \ } uint32_t constexpr STAGE_SIZES_HOST[31][5] = STAGE_SIZES_DATA; __device__ constexpr uint32_t STAGE_SIZES_DEVICE[31][5] = STAGE_SIZES_DATA; @@ -33,11 +33,11 @@ uint32_t constexpr STAGE_PREV_SIZES[31] = {0, 0, 0, 0, 0, 0, 0, 0, 4, 5 #define STAGE_SIZES_DATA_FAST_TW \ { \ {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 0, 0, 0, 0}, {5, 0, 0, 0, 0}, \ - {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \ - {6, 6, 0, 0, 0}, {5, 4, 4, 0, 0}, {5, 4, 5, 0, 0}, {5, 5, 5, 0, 0}, {6, 5, 5, 0, 0}, {6, 5, 6, 0, 0}, \ - {6, 6, 6, 0, 0}, {5, 5, 5, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \ - {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 5, 5, 5}, {6, 5, 5, 5, 6}, {6, 5, 5, 6, 6}, {6, 6, 6, 5, 6}, \ - {6, 6, 6, 6, 6}, \ + {6, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, {4, 4, 0, 0, 0}, {5, 4, 0, 0, 0}, {5, 5, 0, 0, 0}, {6, 5, 0, 0, 0}, \ + {6, 6, 0, 0, 0}, {5, 4, 4, 0, 0}, {5, 4, 5, 0, 0}, {5, 5, 5, 0, 0}, {6, 5, 5, 0, 0}, {6, 5, 6, 0, 0}, \ + {6, 6, 6, 0, 0}, {5, 5, 5, 4, 0}, {5, 5, 5, 5, 0}, {6, 5, 5, 5, 0}, {6, 5, 5, 6, 0}, {6, 6, 6, 5, 0}, \ + {6, 6, 6, 6, 0}, {5, 5, 5, 5, 5}, {6, 5, 5, 5, 5}, {6, 5, 5, 5, 6}, {6, 5, 5, 6, 6}, {6, 6, 6, 5, 6}, \ + {6, 6, 6, 6, 6}, \ } uint32_t constexpr STAGE_SIZES_HOST_FT[31][5] = STAGE_SIZES_DATA_FAST_TW; __device__ uint32_t constexpr STAGE_SIZES_DEVICE_FT[31][5] = STAGE_SIZES_DATA_FAST_TW;