Add FRI folding for M31 #618

Merged · 9 commits · Sep 29, 2024
Changes from all commits
64 changes: 64 additions & 0 deletions icicle/include/fri/fri.cuh
@@ -0,0 +1,64 @@
#pragma once
#ifndef FRI_H
#define FRI_H

#include <cuda_runtime.h>

#include "gpu-utils/device_context.cuh"

namespace fri {

struct FriConfig {
device_context::DeviceContext ctx;
bool are_evals_on_device;
bool are_domain_elements_on_device;
bool are_results_on_device;
bool is_async;
};

/**
 * @brief Folds a layer's evaluations into a degree-d/2 layer using the folding factor alpha.
 *
 * @param eval Pointer to the array of evaluations in the current FRI layer.
 * @param domain_xs Pointer to a subset of line-domain values (one x per pair of evaluations).
 * @param alpha The folding factor used in the FRI protocol.
 * @param folded_eval Pointer to the array where the folded evaluations will be stored.
 * @param n The number of evaluations in the original layer (before folding).
 * @param cfg Configuration selecting the device context, pointer locations, and async behavior.
 *
 * @tparam S The scalar field type used for domain_xs.
 * @tparam E The field type of the evaluations and of alpha.
 *
 * @note The output array 'folded_eval' must have room for n / 2 elements, as folding halves the number of
 * evaluations.
 */
template <typename S, typename E>
cudaError_t fold_line(E* eval, S* domain_xs, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg);

/**
 * @brief Folds a layer of FRI evaluations from a circle domain onto a line domain.
 *
 * This function performs the folding step of the FRI (Fast Reed-Solomon IOP of Proximity) protocol
 * for evaluations on a circle domain: it takes a layer of evaluations over circle points and folds
 * them onto a line using the folding factor alpha.
 *
 * @param eval Pointer to the array of evaluations in the current FRI layer, taken at points on a circle.
 * @param domain_ys Pointer to the array of y-coordinates of the circle-domain points at which eval is
 * taken (one y per pair of evaluations).
 * @param alpha The folding factor used in the FRI protocol.
 * @param folded_eval Pointer to the array where the folded evaluations (now on a line) are accumulated.
 * @param n The number of evaluations in the original layer (before folding).
 * @param cfg Configuration selecting the device context, pointer locations, and async behavior.
 *
 * @tparam S The scalar field type used for domain_ys.
 * @tparam E The field type of the evaluations and of alpha.
 *
 * @note The output array 'folded_eval' must hold n / 2 elements, as folding halves the number of
 * evaluations. It is read as well as written: each folded value is accumulated as
 * folded_eval[i] = folded_eval[i] * alpha^2 + fold(eval[2i], eval[2i+1]).
 * @note This function is specifically designed for folding evaluations from a circle domain to a line domain.
 */
template <typename S, typename E>
cudaError_t fold_circle_into_line(E* eval, S* domain_ys, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg);

} // namespace fri

#endif
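
Not part of the diff, but for orientation — a minimal host-side usage sketch of fold_line with all
buffers resident on the host. It assumes device_context::get_default_device_context() from
gpu-utils/device_context.cuh, and the wrapper name fold_line_on_host is illustrative only:

#include "fri/fri.cuh"

// Fold n line evaluations (host memory) into out[0..n/2) through the device kernel.
template <typename S, typename E>
cudaError_t fold_line_on_host(E* evals, S* domain_xs, E alpha, E* out, uint64_t n)
{
  fri::FriConfig cfg{
    device_context::get_default_device_context(),
    false, // are_evals_on_device: evals live in host memory
    false, // are_domain_elements_on_device
    false, // are_results_on_device: copy the folded values back to out
    false, // is_async: the default stream is synchronized before returning
  };
  return fri::fold_line(evals, domain_xs, alpha, out, n, cfg);
}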
6 changes: 6 additions & 0 deletions icicle/src/fields/CMakeLists.txt
@@ -4,6 +4,7 @@ endif ()

SET(SUPPORTED_FIELDS_WITHOUT_NTT grumpkin;m31)
SET(SUPPORTED_FIELDS_WITHOUT_POSEIDON2 bls12_381;bls12_377;grumpkin;bw6_761;stark252;m31)
SET(SUPPORTED_FIELDS_WITH_FRI m31)

set(TARGET icicle_field)

@@ -42,6 +43,11 @@ if (NOT FIELD IN_LIST SUPPORTED_FIELDS_WITHOUT_NTT)
list(APPEND FIELD_SOURCE ${POLYNOMIAL_SOURCE_FILES}) # requires NTT
endif()

if (FIELD IN_LIST SUPPORTED_FIELDS_WITH_FRI)
list(APPEND FIELD_SOURCE ${SRC}/fri/extern.cu)
list(APPEND FIELD_SOURCE ${SRC}/fri/fri.cu)
endif()

add_library(${TARGET} STATIC ${FIELD_SOURCE})
target_include_directories(${TARGET} PUBLIC ${CMAKE_SOURCE_DIR}/include/)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "ingo_field_${FIELD}")
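
Usage note (not part of the diff): the two FRI translation units below are compiled into the
ingo_field_${FIELD} library only when the build is configured with -DFIELD=m31, since m31 is currently
the only entry in SUPPORTED_FIELDS_WITH_FRI.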
55 changes: 55 additions & 0 deletions icicle/src/fri/extern.cu
@@ -0,0 +1,55 @@
#include "fields/field_config.cuh"
using namespace field_config;

#include "fri.cu"
#include "utils/utils.h"

namespace fri {
/**
 * Extern "C" version of the [fold_line](@ref fold_line) function with the following values of the
 * template parameters (where the field is given by the `-DFIELD` flag during build):
 * - `E` is the extension field type used for the evaluations and alpha
 * - `S` is the scalar field type used for the domain elements
 * @param line_eval Pointer to the array of evaluations on the line
 * @param domain_elements Pointer to the array of domain elements
 * @param alpha The folding factor
 * @param folded_evals Pointer to the array where the folded evaluations will be stored
 * @param n The number of evaluations
 * @param cfg The FRI configuration; if the stream in cfg.ctx is not the default stream, the call runs
 * asynchronously
 * @return `cudaSuccess` if the execution was successful and an error code otherwise.
 */
extern "C" cudaError_t CONCAT_EXPAND(FIELD, fold_line)(
extension_t* line_eval,
scalar_t* domain_elements,
extension_t alpha,
extension_t* folded_evals,
uint64_t n,
FriConfig& cfg)
{
return fri::fold_line(line_eval, domain_elements, alpha, folded_evals, n, cfg);
}

/**
 * Extern "C" version of the [fold_circle_into_line](@ref fold_circle_into_line) function with the
 * following values of the template parameters (where the field is given by the `-DFIELD` flag during build):
 * - `E` is the extension field type used for the evaluations and alpha
 * - `S` is the scalar field type used for the domain elements
 * @param circle_evals Pointer to the array of evaluations on the circle
 * @param domain_elements Pointer to the array of domain elements
 * @param alpha The folding factor
 * @param folded_line_evals Pointer to the array where the folded evaluations are accumulated
 * @param n The number of evaluations
 * @param cfg The FRI configuration; if the stream in cfg.ctx is not the default stream, the call runs
 * asynchronously
 * @return `cudaSuccess` if the execution was successful and an error code otherwise.
 */
extern "C" cudaError_t CONCAT_EXPAND(FIELD, fold_circle_into_line)(
extension_t* circle_evals,
scalar_t* domain_elements,
extension_t alpha,
extension_t* folded_line_evals,
uint64_t n,
FriConfig& cfg)
{
return fri::fold_circle_into_line(circle_evals, domain_elements, alpha, folded_line_evals, n, cfg);
}
} // namespace fri
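
For concreteness (an illustration, not part of the diff): building with -DFIELD=m31,
CONCAT_EXPAND(FIELD, fold_line) should produce an unmangled symbol along these lines, with scalar_t and
extension_t resolved by fields/field_config.cuh; the exact declaration below is hypothetical:

// Hypothetical expanded declaration for FIELD=m31.
extern "C" cudaError_t m31_fold_line(
  extension_t* line_eval,
  scalar_t* domain_elements,
  extension_t alpha,
  extension_t* folded_evals,
  uint64_t n,
  fri::FriConfig& cfg);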
154 changes: 154 additions & 0 deletions icicle/src/fri/fri.cu
@@ -0,0 +1,154 @@
#include <cuda_runtime.h>

#include "fri/fri.cuh"

#include "fields/field.cuh"
#include "gpu-utils/error_handler.cuh"
#include "gpu-utils/device_context.cuh"

namespace fri {

namespace {
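// Inverse butterfly: (v0, v1) <- (v0 + v1, (v0 - v1) * itwid).
// With itwid = x^{-1} this recovers, up to a factor of 2, the even/odd parts
// f_e(x^2) and f_o(x^2) of f from the pair v0 = f(x), v1 = f(-x).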
template <typename S, typename E>
__device__ void ibutterfly(E& v0, E& v1, const S& itwid)
{
E tmp = v0;
v0 = tmp + v1;
v1 = (tmp - v1) * itwid;
}

template <typename S, typename E>
__global__ void fold_line_kernel(E* eval, S* domain_xs, E alpha, E* folded_eval, uint64_t n)
{
  uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  // One thread per output: fold the pair (eval[2*idx], eval[2*idx + 1]).
  // The launch spawns n / 2 threads, so every output index is covered.
  if (idx < n / 2) {
    E f_x = eval[2 * idx];         // f(x), even position
    E f_x_neg = eval[2 * idx + 1]; // f(-x), odd position
    S x_domain = domain_xs[idx];
    ibutterfly(f_x, f_x_neg, S::inverse(x_domain));
    folded_eval[idx] = f_x + alpha * f_x_neg;
  }
}

template <typename S, typename E>
__global__ void fold_circle_into_line_kernel(E* eval, S* domain_ys, E alpha, E alpha_sq, E* folded_eval, uint64_t n)
{
  uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  // One thread per output, as in fold_line_kernel; the result is accumulated
  // into folded_eval rather than overwriting it.
  if (idx < n / 2) {
    E f0_px = eval[2 * idx];
    E f1_px = eval[2 * idx + 1];
    ibutterfly(f0_px, f1_px, S::inverse(domain_ys[idx]));
    E f_prime = f0_px + alpha * f1_px;
    folded_eval[idx] = folded_eval[idx] * alpha_sq + f_prime;
  }
}
} // namespace
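
For reference, this is the arithmetic the two kernels implement, read directly off the code (the
standard FRI even/odd decomposition; the factor of 2 is left unnormalized, to be absorbed by the
caller's convention):

$$ \text{fold\_line}: \quad f'(x^2) = \big(f(x) + f(-x)\big) + \alpha \cdot \frac{f(x) - f(-x)}{x} = 2\big(f_e(x^2) + \alpha\, f_o(x^2)\big) $$

$$ \text{fold\_circle\_into\_line}: \quad \text{folded}[i] \leftarrow \text{folded}[i] \cdot \alpha^2 + \big(f(p) + f(\bar p)\big) + \alpha \cdot \frac{f(p) - f(\bar p)}{y_p} $$

where $\bar p = (x_p, -y_p)$ is the conjugate of the circle point $p$, and the adjacent entries
eval[2i], eval[2i+1] hold $f(x), f(-x)$ (resp. $f(p), f(\bar p)$).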

template <typename S, typename E>
cudaError_t fold_line(E* eval, S* domain_xs, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg)
{
CHK_INIT_IF_RETURN();

cudaStream_t stream = cfg.ctx.stream;
// Allocate and move line domain evals to device if necessary
E* d_eval;
if (!cfg.are_evals_on_device) {
auto data_size = sizeof(E) * n;
CHK_IF_RETURN(cudaMallocAsync(&d_eval, data_size, stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_eval, eval, data_size, cudaMemcpyHostToDevice, stream));
} else {
d_eval = eval;
}

// Allocate and move domain's elements to device if necessary
S* d_domain_xs;
if (!cfg.are_domain_elements_on_device) {
auto data_size = sizeof(S) * n / 2;
CHK_IF_RETURN(cudaMallocAsync(&d_domain_xs, data_size, stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_domain_xs, domain_xs, data_size, cudaMemcpyHostToDevice, stream));
} else {
d_domain_xs = domain_xs;
}

// Allocate folded_eval if pointer is not a device pointer
E* d_folded_eval;
if (!cfg.are_results_on_device) {
CHK_IF_RETURN(cudaMallocAsync(&d_folded_eval, sizeof(E) * n / 2, stream));
} else {
d_folded_eval = folded_eval;
}

uint64_t num_threads = 256;
uint64_t num_blocks = (n / 2 + num_threads - 1) / num_threads;
fold_line_kernel<<<num_blocks, num_threads, 0, stream>>>(d_eval, d_domain_xs, alpha, d_folded_eval, n);

// Move folded_eval back to host if requested
if (!cfg.are_results_on_device) {
CHK_IF_RETURN(cudaMemcpyAsync(folded_eval, d_folded_eval, sizeof(E) * n / 2, cudaMemcpyDeviceToHost, stream));
CHK_IF_RETURN(cudaFreeAsync(d_folded_eval, stream));
}
if (!cfg.are_domain_elements_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_domain_xs, stream)); }
if (!cfg.are_evals_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_eval, stream)); }

// Sync if stream is default stream
if (stream == 0) CHK_IF_RETURN(cudaStreamSynchronize(stream));

return CHK_LAST();
}

template <typename S, typename E>
cudaError_t fold_circle_into_line(E* eval, S* domain_ys, E alpha, E* folded_eval, uint64_t n, FriConfig& cfg)
{
CHK_INIT_IF_RETURN();

cudaStream_t stream = cfg.ctx.stream;
// Allocate and move circle domain evals to device if necessary
E* d_eval;
if (!cfg.are_evals_on_device) {
auto data_size = sizeof(E) * n;
CHK_IF_RETURN(cudaMallocAsync(&d_eval, data_size, stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_eval, eval, data_size, cudaMemcpyHostToDevice, stream));
} else {
d_eval = eval;
}

// Allocate and move domain's elements to device if necessary
S* d_domain_ys;
if (!cfg.are_domain_elements_on_device) {
auto data_size = sizeof(S) * n / 2;
CHK_IF_RETURN(cudaMallocAsync(&d_domain_ys, data_size, stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_domain_ys, domain_ys, data_size, cudaMemcpyHostToDevice, stream));
} else {
d_domain_ys = domain_ys;
}

// Allocate folded_eval on device if the pointer is not a device pointer.
// The kernel accumulates into folded_eval, so its current contents must be
// copied to the device as well.
E* d_folded_eval;
if (!cfg.are_results_on_device) {
auto folded_size = sizeof(E) * n / 2;
CHK_IF_RETURN(cudaMallocAsync(&d_folded_eval, folded_size, stream));
CHK_IF_RETURN(cudaMemcpyAsync(d_folded_eval, folded_eval, folded_size, cudaMemcpyHostToDevice, stream));
} else {
d_folded_eval = folded_eval;
}

E alpha_sq = alpha * alpha;
uint64_t num_threads = 256;
uint64_t num_blocks = (n / 2 + num_threads - 1) / num_threads;
fold_circle_into_line_kernel<<<num_blocks, num_threads, 0, stream>>>(
d_eval, d_domain_ys, alpha, alpha_sq, d_folded_eval, n);

// Move folded_evals back to host if requested
if (!cfg.are_results_on_device) {
CHK_IF_RETURN(cudaMemcpyAsync(folded_eval, d_folded_eval, sizeof(E) * n / 2, cudaMemcpyDeviceToHost, stream));
CHK_IF_RETURN(cudaFreeAsync(d_folded_eval, stream));
}
if (!cfg.are_domain_elements_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_domain_ys, stream)); }
if (!cfg.are_evals_on_device) { CHK_IF_RETURN(cudaFreeAsync(d_eval, stream)); }

// Sync if stream is default stream
if (stream == 0) CHK_IF_RETURN(cudaStreamSynchronize(stream));

return CHK_LAST();
}
} // namespace fri
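
Also not part of the diff — a small CPU oracle for the line fold, useful when testing the device path.
A sketch that mirrors fold_line_kernel's arithmetic; it assumes E and S provide the same operators the
kernel uses, and fold_line_cpu is an illustrative name:

// CPU reference: fold n interleaved evaluations (f(x), f(-x)) into n / 2 values.
template <typename S, typename E>
void fold_line_cpu(const E* eval, const S* domain_xs, E alpha, E* folded_eval, uint64_t n)
{
  for (uint64_t i = 0; i < n / 2; ++i) {
    E v0 = eval[2 * i] + eval[2 * i + 1];                              // 2 * f_e(x^2)
    E v1 = (eval[2 * i] - eval[2 * i + 1]) * S::inverse(domain_xs[i]); // 2 * f_o(x^2)
    folded_eval[i] = v0 + alpha * v1;
  }
}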