From aa448b307d2e14bcb462b41777f2933e14b59fbd Mon Sep 17 00:00:00 2001 From: peekxc Date: Fri, 16 Feb 2024 10:33:08 -0500 Subject: [PATCH] Initial 0.4.0 release, pending CI --- pyproject.toml | 2 +- src/primate/include/_lanczos/lanczos.h | 312 ++++++++++++++++ .../include/_operators/linear_operator.h | 86 +++++ src/primate/include/_operators/operator.h | 22 ++ .../include/_orthogonalize/orthogonalize.h | 123 +++++++ .../_random_generator/_engines/pcg64.h | 55 +++ .../_random_generator/_engines/split_mix64.h | 38 ++ .../_engines/xoshiro_256_star_star.h | 149 ++++++++ .../_random_generator/random_concepts.h | 51 +++ .../include/_random_generator/rne_engines.h | 17 + .../include/_random_generator/threadedrng64.h | 93 +++++ .../_random_generator/vector_generator.h | 83 +++++ src/primate/include/_trace/hutch.h | 332 ++++++++++++++++++ src/primate/include/eigen_core.h | 20 ++ src/primate/include/omp_support.h | 13 + src/primate/trace.py | 22 +- tests/test_trace.py | 6 +- 17 files changed, 1414 insertions(+), 10 deletions(-) create mode 100644 src/primate/include/_lanczos/lanczos.h create mode 100644 src/primate/include/_operators/linear_operator.h create mode 100644 src/primate/include/_operators/operator.h create mode 100644 src/primate/include/_orthogonalize/orthogonalize.h create mode 100644 src/primate/include/_random_generator/_engines/pcg64.h create mode 100644 src/primate/include/_random_generator/_engines/split_mix64.h create mode 100644 src/primate/include/_random_generator/_engines/xoshiro_256_star_star.h create mode 100644 src/primate/include/_random_generator/random_concepts.h create mode 100644 src/primate/include/_random_generator/rne_engines.h create mode 100644 src/primate/include/_random_generator/threadedrng64.h create mode 100644 src/primate/include/_random_generator/vector_generator.h create mode 100644 src/primate/include/_trace/hutch.h create mode 100644 src/primate/include/eigen_core.h create mode 100644 src/primate/include/omp_support.h diff --git a/pyproject.toml b/pyproject.toml index 9f9daef..bed2c97 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = ['meson-python', 'wheel', 'ninja', 'pybind11', 'numpy'] # 'pythran-op [project] name = "scikit-primate" -version = "0.3.6" +version = "0.4.0" readme = "README.md" classifiers = [ "Intended Audience :: Science/Research", diff --git a/src/primate/include/_lanczos/lanczos.h b/src/primate/include/_lanczos/lanczos.h new file mode 100644 index 0000000..5e7af63 --- /dev/null +++ b/src/primate/include/_lanczos/lanczos.h @@ -0,0 +1,312 @@ +#ifndef _LANCZOS_LANCZOS_H +#define _LANCZOS_LANCZOS_H + +#include +#include // function +#include // max +#include "_operators/linear_operator.h" // LinearOperator +#include "_orthogonalize/orthogonalize.h" // orth_vector, mod +// #include "_random_generator/vector_generator.h" // ThreadSafeRBG, generate_array + +#include "omp_support.h" // conditionally enables openmp pragmas +#include "eigen_core.h" +#include +#include + +using std::function; + +template< typename T > +using AdjSolver = Eigen::SelfAdjointEigenSolver< T >; + +// Krylov dimension 'deg' should be at least 1 and at most dimension of the operator +// Precondition: None +constexpr int param_deg(const int deg, const std::pair< size_t, size_t > dim){ + return std::max(1, std::min(deg, int(dim.first))); +} + +// Need to allocate at least 2 Lanczos vectors, should never need more than +// Precondition: deg = param_deg(deg) +constexpr int param_ncv(const int ncv, const int deg, const std::pair< size_t, size_t > 
dim){ + return std::min(std::max(ncv, 2), std::min(deg, int(dim.first))); +} + +// Orth should be strictly less than ncv, as there are only ncv Lanczos vectors in memory +// If negative or larger than the Krylov dimension, orthogonalize against the maximal number of distinct Lanczos vectors +// Precondition: deg = param_deg(deg) and ncv = param_ncv(ncv) +constexpr int param_orth(const int orth, const int deg, const int ncv, const std::pair< size_t, size_t > dim){ + if (orth < 0 || orth > deg){ return std::min(deg, ncv - 1); } + return std::min(orth, ncv - 1); // should only orthogonalize against in-memory Lanczos vectors +} + +// Paige's A27 variant of the Lanczos method +// Computes the first k elements (a,b) := (alpha,beta) of the tridiagonal matrix T(a,b) where T = Q^T A Q +// Precondition: orth < ncv <= deg and ncv >= 2. +template< std::floating_point F, LinearOperator Matrix > +void lanczos_recurrence( + const Matrix& A, // Symmetric linear operator + F* q, // vector to expand the Krylov space K(A, q) + const int deg, // Dimension of the Krylov subspace to capture + const F rtol, // Tolerance of residual error for early-stopping the iteration. + const int orth, // Number of *additional* vectors to orthogonalize against + F* alpha, // Output diagonal elements of T of size A.shape[1]+1 + F* beta, // Output subdiagonal elements of T of size A.shape[1]+1; should be 0; + F* V, // Output matrix for Lanczos vectors (column-major) + const size_t ncv // Number of Lanczos vectors pre-allocated (must be at least 2) +){ + using VectorF = Eigen::Matrix< F, Dynamic, 1 >; + + // Constants ` + const auto A_shape = A.shape(); + const size_t n = A_shape.first; + const size_t m = A_shape.second; + const F residual_tol = std::sqrt(n) * rtol; + + // Setup views + Eigen::Map< DenseMatrix< F > > Q(V, n, ncv); // Lanczos vectors + Eigen::Map< VectorF > v(q, m, 1); // map initial vector (no-op) + const auto Q_ref = Eigen::Ref< const DenseMatrix< F > >(Q); // const view + + // Setup for first iteration + std::array< int, 3 > pos = { int(ncv) - 1, 0, 1 }; // Indices for the recurrence + Q.col(pos[0]) = static_cast< VectorF >(VectorF::Zero(n)); // Ensure previous is 0 + Q.col(0) = v.normalized(); // Load unit-norm v as q0 + beta[0] = 0.0; // Ensure beta_0 is 0 + + for (int j = 0; j < deg; ++j) { + + // Apply the three-term recurrence + auto [p,c,n] = pos; // previous, current, next + A.matvec(Q.col(c).data(), v.data()); // v = A q_c + v -= beta[j] * Q.col(p); // q_n = v - b q_p + alpha[j] = Q.col(c).dot(v); // projection size of < qc, qn > + v -= alpha[j] * Q.col(c); // subtract projected components + + // Re-orthogonalize q_n against previous orth lanczos vectors, up to ncv-1 + if (orth > 0) { + auto qn = Eigen::Ref< Vector< F > >(v); + orth_vector(qn, Q_ref, c, orth, true); + } + + // Early-stop criterion is when K_j(A, v) is near invariant subspace. 
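+    // Here v = A q_c - beta_j q_p - alpha_j q_c (plus any re-orthogonalization corrections), so
+    // beta_{j+1} = ||v|| is the residual of the three-term recurrence. Once it falls below
+    // sqrt(n) * rtol, the next Lanczos vector lies (numerically) in span(Q), i.e. the Krylov
+    // subspace is A-invariant and T(alpha, beta) is already exact, so the iteration stops early.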
+ beta[j+1] = v.norm(); + if (beta[j+1] < residual_tol || (j+1) == deg) { // additional break prevents overriding qn + break; + } + Q.col(n) = v / beta[j+1]; // normalize such that Q stays orthonormal + + // Cyclic left-rotate to update the working column indices + std::rotate(pos.begin(), pos.begin() + 1, pos.end()); + pos[2] = mod(j+2, ncv); + } +} + +enum weight_method { golub_welsch = 0, fttr = 1 }; + +// NOTE: one idea to reduce memory is to use the matching moment idea +// - Take T - \lambda I for some eigenvalue \lambda; then dim(null(T- \lambda I)) = 1, thus +// - we can solve for (T - \lambda I)x = 0 for x repeatedly, for all \lambda, and take x[0] to be the quadrature weight + +// FTTR +// Uses the Lanczos method to obtain Gaussian quadrature estimates of the spectrum of an arbitrary operator +template< std::floating_point F > +auto lanczos_quadrature( + const F* alpha, // Input diagonal elements of T of size k + const F* beta, // Output subdiagonal elements of T of size k, whose non-zeros start at index 1 + const int k, // Size of input / output elements + AdjSolver< DenseMatrix< F > >& solver, // Solver to use. Assumes workspace has been allocated + F* nodes, // Output nodes of the quadrature + F* weights, // Output weights of the quadrature + const weight_method method = golub_welsch // The method of computing the weights +) -> void { + using VectorF = Eigen::Array< F, Dynamic, 1>; + assert(beta[0] == 0.0); + + Eigen::Map< const VectorF > a(alpha, k); // diagonal elements + Eigen::Map< const VectorF > b(beta+1, k-1); // subdiagonal elements (offset by 1!) + + // Golub-Welsch approach: just compute eigen-decomposition from T using QR steps + if (method == golub_welsch){ + // std::cout << " GW "; + solver.computeFromTridiagonal(a, b, Eigen::DecompositionOptions::ComputeEigenvectors); + auto theta = static_cast< VectorF >(solver.eigenvalues()); // Rayleigh-Ritz values == nodes + auto tau = static_cast< VectorF >(solver.eigenvectors().row(0)); + tau *= tau; + std::copy(theta.begin(), theta.end(), nodes); + std::copy(tau.begin(), tau.end(), weights); + } else { + // std::cout << " FTTR "; + // Uses the Foward Three Term Recurrence (FTTR) approach + solver.computeFromTridiagonal(a, b, Eigen::DecompositionOptions::EigenvaluesOnly); + auto theta = static_cast< VectorF >(solver.eigenvalues()); // Rayleigh-Ritz values == nodes + std::copy(theta.begin(), theta.end(), nodes); + + // Compute weights via FTTR + FTTR_weights(theta.data(), alpha, beta, k, weights); + } +}; + +template< std::floating_point F, LinearOperator Matrix > +struct MatrixFunction { + // static const bool owning = false; + using value_type = F; + using VectorF = Eigen::Matrix< F, Dynamic, 1 >; + using ArrayF = Eigen::Array< F, Dynamic, 1 >; + using EigenSolver = Eigen::SelfAdjointEigenSolver< DenseMatrix< F > >; + + // Use non-owning class here rather than copy-constructor to make parallizing easier + // See: https://stackoverflow.com/questions/35770357/storing-const-reference-to-an-object-in-class + // Also see: https://zpjiang.me/2020/01/20/const-reference-vs-pointer/ + const Matrix& op; + // const Matrix op; + + // Fields + // std::function< F(F) > f; + std::function< void(F*, const size_t) > f; + bool native_f = false; + const int deg; + const int ncv; + F rtol; + int orth; + weight_method wgt_method; + std::function< void(F*, const size_t) > transform; + + MatrixFunction(const Matrix& A, const std::function< void(F*, const size_t) > fun, int lanczos_degree, F lanczos_rtol, int _orth, int _ncv, bool native, 
weight_method _method = golub_welsch) + : op(A), f(fun), native_f(native), + deg(param_deg(lanczos_degree, A.shape())), + ncv(param_ncv(_ncv, deg, A.shape())), + rtol(lanczos_rtol), + orth(param_orth(_orth, deg, ncv, A.shape())), + wgt_method(_method) + { + // Pre-allocate all but Q memory needed for Lanczos iterations + alpha = static_cast< ArrayF >(ArrayF::Zero(deg+1)); + beta = static_cast< ArrayF >(ArrayF::Zero(deg+1)); + nodes = static_cast< ArrayF >(ArrayF::Zero(deg)); + weights = static_cast< ArrayF >(ArrayF::Zero(deg)); + solver = EigenSolver(deg); + transform = [](F* v, const size_t N){ return; }; + }; + + MatrixFunction(Matrix&& A, const std::function< void(F*, const size_t) > fun, int lanczos_degree, F lanczos_rtol, int _orth, int _ncv, bool native, weight_method _method = golub_welsch) + : op(std::move(A)), f(fun), native_f(native), + deg(param_deg(lanczos_degree, A.shape())), + ncv(param_ncv(_ncv, deg, A.shape())), + rtol(lanczos_rtol), + orth(param_orth(_orth, deg, ncv, A.shape())), + wgt_method(_method) { + // Pre-allocate all but Q memory needed for Lanczos iterations + alpha = static_cast< ArrayF >(ArrayF::Zero(deg+1)); + beta = static_cast< ArrayF >(ArrayF::Zero(deg+1)); + nodes = static_cast< ArrayF >(ArrayF::Zero(deg)); + weights = static_cast< ArrayF >(ArrayF::Zero(deg)); + solver = EigenSolver(deg); + transform = [](F* v, const size_t N){ return; }; + }; + + // Approximates v |-> f(A)v via a limited degree Lanczos iteration + void matvec(const F* v, F* y) const noexcept { + // if (op == nullptr){ return; } + + // By default, Q is not allocated in constructor, as quad may used less memory + // For all calls after the first matvec(), Eigen promises this is a no-op + // Note we *need* Q to have exactly deg columns for the matvec approx + if (Q.cols() < deg){ Q.resize(op.shape().first, deg); } + + // Inputs / outputs + Eigen::Map< const VectorF > v_map(v, op.shape().second); + Eigen::Map< VectorF > y_map(y, op.shape().first); + + // Lanczos iteration: save v norm + VectorF v_copy = v_map; // save copy of input + transform(v_copy.data(), v_copy.size()); // transform it, if necessary + const F v_scale = v_copy.norm(); // save its norm + + // Apply Lanczos + lanczos_recurrence< F >(op, v_copy.data(), deg, rtol, orth, alpha.data(), beta.data(), Q.data(), deg); + + // Note: Maps are used here to offset the pointers; they should be no-ops anyways + Eigen::Map< ArrayF > a(alpha.data(), deg); // diagonal elements + Eigen::Map< ArrayF > b(beta.data()+1, deg-1); // subdiagonal elements (offset by 1!) 
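+    // With T = tridiag(alpha, beta) and eigendecomposition T = V diag(theta) V^T, the lines below
+    // assemble the degree-`deg` Lanczos approximation f(A)v ~= ||v|| * Q V f(diag(theta)) V^T e_1,
+    // where Q holds the Lanczos vectors produced by the recurrence above.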
+ solver.computeFromTridiagonal(a, b, Eigen::DecompositionOptions::ComputeEigenvectors); + + // Apply the spectral function (in-place) to Rayleigh-Ritz values (nodes) + auto theta = static_cast< ArrayF >(solver.eigenvalues()); + // theta.unaryExpr(f); // this doesn't always work for some reason + // std::transform(theta.begin(), theta.end(), theta.begin(), f); + f(theta.data(), theta.size()); + + // The approximation v |-> f(A)v + const auto V = static_cast< DenseMatrix< F > >(solver.eigenvectors()); // maybe dont cast here + auto v_mod = static_cast< ArrayF >(V.row(0).array()); + v_mod *= theta; + y_map = Q * static_cast< VectorF >(V * v_mod.matrix()); // equivalent to Q V diag(sf(theta)) V^T e_1 + y_map.array() *= v_scale; // re-scale + } + + void matmat(const F* X, F* Y, const size_t k) const noexcept { + // if (op == nullptr){ return; } + Eigen::Map< const DenseMatrix< F > > XM(X, op.shape().second, k); + Eigen::Map< DenseMatrix< F > > YM(Y, op.shape().first, k); + for (size_t j = 0; j < k; ++j){ + matvec(XM.col(j).data(), YM.col(j).data()); + } + } + + // Approximates v^T A v + auto quad(const F* v) const noexcept -> F { + // if (op == nullptr){ return; } + + // Only allocate NCV columns as necessary -- no-op after first resizing + if (Q.cols() < ncv){ Q.resize(op.shape().first, ncv); } + + // Save copy of v + its norm + Eigen::Map< const VectorF > v_map(v, op.shape().second); // no-op + VectorF v_copy = v_map; // save copy; needs to be norm-1 for Lanczos + quad to work + transform(v_copy.data(), v_copy.size()); // transform as needed + const F v_scale = v_copy.norm(); + v_copy.normalize(); + + // Execute lanczos method + Golub-Welsch algorithm + lanczos_recurrence< F >(op, v_copy.data(), deg, rtol, orth, alpha.data(), beta.data(), Q.data(), ncv); + lanczos_quadrature< F >(alpha.data(), beta.data(), deg, solver, nodes.data(), weights.data(), wgt_method); + + // Apply f to the nodes and sum + // std::transform(nodes.begin(), nodes.end(), nodes.begin(), f); + f(nodes.data(), nodes.size()); + + // std::cout << "f(nodes): " << nodes << std::endl; + // std::cout << "quad: " << F((nodes * weights).sum()) << std::endl; + // std::cout << "vscale**2 " << std::pow(v_scale, 2) << std::endl; + return std::pow(v_scale, 2) * (nodes * weights).sum(); + } + + // Returns (rows, columns) + auto shape() const noexcept -> std::pair< size_t, size_t > { + // if (op == nullptr){ return std::make_pair< size_t, size_t >(0, 0); } + return op.shape(); + } + + // private: // Internal state to re-use + mutable DenseMatrix< F > Q; + mutable ArrayF alpha; + mutable ArrayF beta; + mutable ArrayF nodes; + mutable ArrayF weights; + mutable EigenSolver solver; +}; + +// Approximates the action v |-> f(A)v via the Lanczos method +template< std::floating_point F, LinearOperator Matrix > +void matrix_approx( + const Matrix& A, // LinearOperator + std::function< F(F) > sf, // the spectral function + const F* v, // the input vector + const int lanczos_degree, // Polynomial degree of the Krylov expansion + const F lanczos_rtol, // residual tolerance to consider subspace A-invariant + const int orth, // Number of vectors to re-orthogonalize against <= lanczos_degree + F* y // Output vector +){ + MatrixFunction< F, Matrix >(A, sf, lanczos_degree, lanczos_rtol, orth, lanczos_degree).matvec(v, y); +}; + +#endif \ No newline at end of file diff --git a/src/primate/include/_operators/linear_operator.h b/src/primate/include/_operators/linear_operator.h new file mode 100644 index 0000000..c389bc2 --- /dev/null +++ 
b/src/primate/include/_operators/linear_operator.h @@ -0,0 +1,86 @@ + +#ifndef _OPERATORS_H +#define _OPERATORS_H + +#include +#include +#include +#include + +using std::pair; +using std::vector; + +template +struct TypeString; + +template <> struct TypeString< float > { + static constexpr const char* value = "float32"; +}; + +template <> struct TypeString< double > { + static constexpr const char* value = "float64"; +}; + +// Generalizes concepts from: https://ameli.github.io/imate/generated/imate.Matrix.html#imate-matrix +template < typename T, typename F = typename T::value_type > +concept LinearOperator = requires(T op, const F* input, F* output) { + { op.matvec(input, output) }; // y = A x + { op.shape() } -> std::convertible_to< std::pair< size_t, size_t > >; +}; + +template < typename T, typename F = typename T::value_type > +concept AdjointOperator = requires(T op, const F* input, F* output) { + { op.rmatvec(input, output) }; // y = A^T x +} && LinearOperator< T, F >; + +template < typename T, typename F = typename T::value_type > +concept LinearAdditiveOperator = requires(T op, const F* input, const F alpha, F* output) { + { op.matvec_add(input, alpha, output) }; // y = y + \alpha * Ax +} && LinearOperator< T, F >; + +template < typename T, typename F = typename T::value_type > +concept AdjointAdditiveOperator = requires(T op, const F* input, const F alpha, F* output) { + { op.rmatvec_add(input, alpha, output) }; // y = y + \alpha * A^T x +} && AdjointOperator< T, F >; + +template < typename T, typename F = typename T::value_type > +concept Operator = + LinearOperator< T, F > || + AdjointOperator< T, F > || + LinearAdditiveOperator< T, F > || + AdjointAdditiveOperator< T, F > +; + +template < typename T, typename F = typename T::value_type > +concept QuadOperator = requires(T op, const F* input) { + { op.quad(input) } -> std::convertible_to< F >; // v^T A v ; todo: make optional if matvec available? 
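+  // Satisfied by, e.g., MatrixFunction< F, Matrix > in _lanczos/lanczos.h, whose quad() runs a
+  // degree-limited Lanczos recurrence followed by Gaussian quadrature on the resulting tridiagonal T.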
+}; + +template < typename T, typename F = typename T::value_type > +concept SupportsMatrixMult = requires(T op, const F* input, F* output, const int k) { + { op.matmat(input, output, k) }; +}; + +// Represents the operator (A + tB) for parameter T = { t1, t2, ..., tn } +template < typename T, typename F = typename T::value_type > +concept AffineOperator = requires(T op, F t) { + T::relation_known; // -> std::convertible_to< bool >; + { op.set_parameter(t) }; + // { op.get_num_parameters() } -> std::convertible_to< size_t >; +} && Operator< T, F >; + +// TODO: fix +template < typename T > +concept HasOp = requires(T op) { + requires std::is_member_object_pointer::value; +}; + +// Used for hard-type checking +// From: https://stackoverflow.com/questions/44012938/how-to-tell-if-a-type-is-an-instance-of-a-specific-template-class +// template class> +// struct is_instance : public std::false_type {}; + +// template class U> +// struct is_instance, U> : public std::true_type {}; + +#endif \ No newline at end of file diff --git a/src/primate/include/_operators/operator.h b/src/primate/include/_operators/operator.h new file mode 100644 index 0000000..c536376 --- /dev/null +++ b/src/primate/include/_operators/operator.h @@ -0,0 +1,22 @@ +#ifndef _OPERATOR_H +#define _OPERATOR_H + +#include "linear_operator.h" + +// TODO: Use CRTP to make the code DRY +// template< LinearOperator Matrix > +// struct Operator { + + +// Operator() = delete; +// ~Operator() = default; + +// template< typename Lambda > +// void matvec(){ +// static_cast(this)->matvec(); +// } + +// } + + +#endif \ No newline at end of file diff --git a/src/primate/include/_orthogonalize/orthogonalize.h b/src/primate/include/_orthogonalize/orthogonalize.h new file mode 100644 index 0000000..90301b7 --- /dev/null +++ b/src/primate/include/_orthogonalize/orthogonalize.h @@ -0,0 +1,123 @@ +#ifndef _ORTHOGONALIZE_ORTHOGONALIZE_H +#define _ORTHOGONALIZE_ORTHOGONALIZE_H + +#include // std::floating_point +// #include // std::isnan +#include "eigen_core.h" // DenseMatrix, EigenCore +#include +#include +#include + +// Emulate python modulus behavior since C++ '%' is not a true modulus +constexpr inline auto mod(int a, int b) noexcept -> int { + return (b + (a % b)) % b; +} + +// template< std::floating_point F > +// inline auto orth_poly_weight(const F x, const F mu_rec_sqrt, const F* a, const F* b, F* tbl, const int n) noexcept -> F { +// // const auto mu_rec_sqrt = 1.0 / std::sqrt(mu) +// tbl[0] = mu_rec_sqrt; +// tbl[1] = (x - a[0]) * mu_rec_sqrt / b[1]; +// F w = std::pow(tbl[0], 2) + std::pow(tbl[1], 2); +// for (int i = 2; i < n; ++i){ +// std::cout << i << ": (((" << x << " - " << a[i-1] << ") * " << tbl[i-1] << ") - " << b[i-1] << " * " << tbl[i-2] << ") / " << b[i] << std::endl; +// // tbl[i] = (((x - a[i-1]) * tbl[i-1]) - b[i-1] * tbl[i-2]) / b[i]; +// F s = (x - a[i-1]) / b[i]; +// F t = -b[i-1] / b[i]; +// tbl[i] = s * tbl[i-1] + t * tbl[i-2]; +// w += std::pow(tbl[i], 2); +// } +// return 1.0 / w; +// } + +template< std::floating_point F > +inline void poly(F x, const F mu_sqrt_rec, const F* a, const F* b, F* z, const size_t n) noexcept { + // assert(z.size() == a.size()); + z[0] = mu_sqrt_rec; + z[1] = (x - a[0]) * z[0] / b[1]; + for (size_t i = 2; i < n; ++i) { + // F zi = ((x - a[i-1]) * z[i-1] - b[i-1] * z[i-2]) / b[i]; + // Slightly more numerically stable way of doing the above + F s = (x - a[i-1]) / b[i]; + F t = -b[i-1] / b[i]; + z[i] = s * z[i-1] + t * z[i-2]; + // std::cout << "(" << x << ") " << i << ": " << s << " * " 
<< z[i-1] << " + " << t << " * " << z[i-2]; + // std::cout << " -> " << z[i] << std::endl; + } +} + +template< std::floating_point F > +void FTTR_weights(const F* theta, const F* alpha, const F* beta, const size_t k, F* weights) { + // assert(ew.size() == a.size()); + const auto a = Eigen::Map< const Array< F > >(alpha, k); + const auto b = Eigen::Map< const Array< F > >(beta, k); + const auto ew = Eigen::Map< const Array< F > >(theta, k); + const F mu_0 = ew.abs().sum(); + const F mu_sqrt_rec = 1.0 / std::sqrt(mu_0); + // std::cout << std::fixed << std::showpoint; + // std::cout << std::setprecision(15); + // std::cout << "---- ACTUAL --- ( " << sizeof(F) << ")" << std::endl; + // std::cout << "a: " << a.matrix().transpose() << std::endl; + // std::cout << "b: " << b.matrix().transpose() << std::endl; + // std::cout << "ew: " << ew.matrix().transpose() << std::endl; + // std::cout << "mu_0: " << mu_0 << std::endl; + Array< F > p(a.size()); + for (size_t i = 0; i < k; ++i){ + poly(theta[i], mu_sqrt_rec, a.data(), b.data(), p.data(), a.size()); + F weight = 1.0 / p.square().sum(); + weights[i] = weight / mu_0; + // std::cout << i << ": (x: " << theta[i] << ", w: " << weight << ") p: " << p.matrix().transpose() << std::endl; + } +} + + +// Orthogonalizes v with respect to columns in U via modified gram schmidt +// Cyclically projects v onto the columns U[:,i:(i+p)] = u_i, u_{i+1}, ..., u_{i+p}, removing from v the components +// of the vector projections. If any index i, ..., i+p exceeds the number of columns of U, the indices are cycled. +// Both near-zero projections and columns of U with near-zero norm are ignored to avoid collapsing v to the trivial vector. +// Eigen::Ref use based on: https://stackoverflow.com/questions/21132538/correct-usage-of-the-eigenref-class +template< std::floating_point F > +void orth_vector( + Ref< Vector< F > > v, // input/output vector + const Ref< const DenseMatrix< F > >& U, // matrix of vectors to project onto + const int start_idx, // starting column index + const int p, // number of projections + const bool reverse = false // whether to cycle through the columns of U backwards +) { + const int n = (int) U.rows(); + const int m = (int) U.cols(); + const auto tol = 2 * std::numeric_limits< F >::epsilon() * std::sqrt(n); // numerical tolerance for orthogonality + + // Successively subtracting the projection of v onto the first p columns starting at start_idx + // If projection or the target vector is near-zero, ignore and continue, as numerical orthogonality is already met + const int diff = reverse ? -1 : 1; + for (int i = mod(start_idx, m), c = 0; c < p; ++c, i = mod(i + diff, m)){ + const auto u_norm = U.col(i).squaredNorm(); // norm of u_i + const auto s_proj = v.dot(U.col(i)); // < v, u_i > + // should protect against nan and 0 vectors, even is isnan(u_norm) is true + if (u_norm > tol && std::abs(s_proj) > tol){ + v -= (s_proj / u_norm) * U.col(i); + } + } +} + + +// Modified Gram-Schmidt in-place +// G. W. Stewart, "Matrix Algorithms, Volume 1", SIAM, 1998. 
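+// The variant below orthogonalizes column k against the already-orthogonalized columns 0..k-1 one
+// projection at a time (rather than against the original columns all at once, as in classical
+// Gram-Schmidt), which keeps the loss of orthogonality roughly proportional to eps * cond(U)
+// instead of eps * cond(U)^2.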
+template< std::floating_point F > +void modified_gram_schmidt(Eigen::Ref< DenseMatrix< F > > U, [[maybe_unused]] const int s = 0){ + // const int n = static_cast< const int >(U.rows()); + const int m = static_cast< const int >(U.cols()); + // auto i = mod(s, m); + auto R = DenseMatrix< F >(m, m); + for (int k = 0; k < m; ++k){ + for (int i = 0; i < k; ++i){ + R(i,k) = U.col(i).dot(U.col(k)); + U.col(k) -= R(i,k) * U.col(i); + } + R(k,k) = U.col(k).norm(); + U.col(k) /= R(k,k); + } +} + +#endif \ No newline at end of file diff --git a/src/primate/include/_random_generator/_engines/pcg64.h b/src/primate/include/_random_generator/_engines/pcg64.h new file mode 100644 index 0000000..d4f5e6e --- /dev/null +++ b/src/primate/include/_random_generator/_engines/pcg64.h @@ -0,0 +1,55 @@ +// PCG64.h +// Adapted from C-implementation: (c) 2014 M.E. O'Neill / pcg-random.org +// Licensed under Apache License 2.0 (NO WARRANTY, etc. see website) +#ifndef _RANDOM_GENERATOR_PCG64_H_ +#define _RANDOM_GENERATOR_PCG64_H_ + +#include // uint64_t +#include // assert +#include // std::time +#include // std::seed_seq +#include // array + +struct Pcg64 { + using result_type = uint64_t; + static constexpr size_t state_size = 2; // size of the entropy/state size, in units of 32-bits + uint64_t state; + uint64_t inc; + + Pcg64(){ + uint64_t seed = static_cast(std::time(0)) + static_cast(std::clock()); + state = (seed << 32) | seed; + inc = (state << 1u) | 1u; + }; + + void seed(std::seed_seq& seed_s) { + std::array< std::uint32_t, 2 > seeds = { 0, 0 }; + seed_s.generate(seeds.begin(), seeds.end()); + state = static_cast< std::uint64_t >(seeds[0]); + state = state << 32; + state = state | static_cast< std::uint64_t >(seeds[1]); + inc = (state << 1u) | 1u; + }; + + uint32_t _generate32() noexcept { + uint64_t oldstate = state; + // Advance internal state + state = oldstate * 6364136223846793005ULL + (inc|1); + // Calculate output function (XSH RR), uses old state for max ILP + uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; + uint32_t rot = oldstate >> 59u; + return (xorshifted >> rot) | (xorshifted << ((-rot) & 31)); + }; + + uint64_t operator()() noexcept { + uint64_t z = _generate32(); + z = z << 32; + z = z | static_cast< std::uint64_t >(_generate32()); + return z; + } + + static constexpr uint64_t min(){ return std::numeric_limits< uint64_t >::min(); } + static constexpr uint64_t max(){ return std::numeric_limits< uint64_t >::max(); } +}; + +#endif // _RANDOM_GENERATOR_PCG64_H_ diff --git a/src/primate/include/_random_generator/_engines/split_mix64.h b/src/primate/include/_random_generator/_engines/split_mix64.h new file mode 100644 index 0000000..78eb7bd --- /dev/null +++ b/src/primate/include/_random_generator/_engines/split_mix64.h @@ -0,0 +1,38 @@ +#ifndef _RANDOM_GENERATOR_SPLIT_MIX_64_H_ +#define _RANDOM_GENERATOR_SPLIT_MIX_64_H_ + +#include // uint64_t +#include // assert +#include // std::time +#include // std::seed_seq +#include // array + + +// From: https://prng.di.unimi.it/splitmix64.c +struct SplitMix64 { + // static constexpr typename result_type = uint64_t; + using result_type = uint64_t; + static constexpr size_t state_size = 2; // size of the entropy/state size, in units of 32-bits + uint64_t state; + SplitMix64(){ + uint64_t seed = static_cast(std::time(0)) + static_cast(std::clock()); + state = (seed << 32) | seed; + }; + void seed(std::seed_seq& state_) { + std::array< std::uint32_t, 2 > seeds = { 0, 0 }; + state_.generate(seeds.begin(), seeds.end()); + state = static_cast< 
std::uint64_t >(seeds[0]); + state = state << 32; + state = state | static_cast< std::uint64_t >(seeds[1]); + }; + uint64_t operator()() noexcept { + uint64_t z = (state += 0x9e3779b97f4a7c15); + z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9; + z = (z ^ (z >> 27)) * 0x94d049bb133111eb; + return z ^ (z >> 31); + }; + static constexpr uint64_t min(){ return std::numeric_limits< uint64_t >::min(); } + static constexpr uint64_t max(){ return std::numeric_limits< uint64_t >::max(); } +}; + +#endif // _RANDOM_GENERATOR_SPLIT_MIX_64_H_ diff --git a/src/primate/include/_random_generator/_engines/xoshiro_256_star_star.h b/src/primate/include/_random_generator/_engines/xoshiro_256_star_star.h new file mode 100644 index 0000000..e0acb78 --- /dev/null +++ b/src/primate/include/_random_generator/_engines/xoshiro_256_star_star.h @@ -0,0 +1,149 @@ +/* + * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli , Matt Piekenbrock + * SPDX-License-Identifier: BSD-3-Clause + * SPDX-FileType: SOURCE + * + * This program is free software: you can redistribute it and/or modify it + * under the terms of the license found in the LICENSE.txt file in the root + * directory of this source tree. + */ + + +#ifndef _RANDOM_GENERATOR_XOSHIRO_256_STAR_STAR_H_ +#define _RANDOM_GENERATOR_XOSHIRO_256_STAR_STAR_H_ + + +#include +#include // uint64_t, UINT64_C +#include // NULL +#include "./split_mix64.h" // SplitMix64 + +// stdint.h in old compilers (e.g. gcc 4.4.7) does not declare UINT64_C macro. +#ifndef UINT64_C + #define UINT64_C(c) static_cast(c) +#endif + +// ===================== +// Xoshiro 256 Star Star +// ===================== + +/// \class Xoshiro256StarStar +/// +/// \brief Pseudo-random integer generator. This class generates 64-bit +/// integer using Xoshiro256** algorithm. +/// +/// \details The Xoshiro256** algorithm has 256-bit state space, and passes all +/// statistical tests, including the BigCrush. The state of this class +/// is initialized using \c SplitMix64 random generator. +/// +/// A very similar method to Xoshiro256** is Xoshiro256++ which has +/// the very same properties and speed as the Xoshiro256**. An +/// alternative method is Xoshiro256+, which is 15% faster, but it +/// suffers linear dependency of the lower 4 bits. It is usually used +/// for generating floating numbers using the upper 53 bits and +/// discard the lower bits. 
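+///
+///          The class also provides jump() and long_jump() below, which advance the state by
+///          2^128 and 2^192 calls of operator() respectively and can be used to carve out
+///          non-overlapping subsequences when running one generator per thread.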
+/// +/// The Xoshiro256** algorithm is develped by David Blackman and +/// Sebastiano Vigna (2018) and the source code can be found at: +/// https://prng.di.unimi.it/xoshiro256starstar.c +/// +/// \sa SplitMix64 + + +struct Xoshiro256StarStar{ + using result_type = uint64_t; + static constexpr size_t state_size = 8; + static inline uint64_t rotation_left( const uint64_t x, int k){ + return (x << k) | (x >> (64 - k)); + }; + std::array< uint64_t, 4 > state; + + Xoshiro256StarStar(){ + SplitMix64 split_mix_64; + state = { split_mix_64(), split_mix_64(), split_mix_64(), split_mix_64() }; + }; + void seed(std::seed_seq& S){ + std::array< std::uint32_t, 8 > seeds; + S.generate(seeds.begin(), seeds.end()); + state[0] = (static_cast< std::uint64_t >(seeds[0]) << 32) | seeds[1]; + state[1] = (static_cast< std::uint64_t >(seeds[2]) << 32) | seeds[3]; + state[2] = (static_cast< std::uint64_t >(seeds[4]) << 32) | seeds[5]; + state[3] = (static_cast< std::uint64_t >(seeds[6]) << 32) | seeds[7]; + }; + uint64_t operator()(){ + const uint64_t result = rotation_left(state[1] * 5, 7) * 9; + const uint64_t t = state[1] << 17; + state[2] ^= state[0]; + state[3] ^= state[1]; + state[1] ^= state[2]; + state[0] ^= state[3]; + state[2] ^= t; + state[3] = rotation_left(state[3], 45); + return result; + }; + + void jump(){ + static const uint64_t JUMP[] = { + 0x180ec6d33cfd0aba, + 0xd5a61266f0c9392c, + 0xa9582618e03fc9aa, + 0x39abdc4529b1661c + }; + + uint64_t s0 = 0; + uint64_t s1 = 0; + uint64_t s2 = 0; + uint64_t s3 = 0; + + for (unsigned int i = 0; i < sizeof(JUMP) / sizeof(*JUMP); ++i) { + for (int b = 0; b < 64; ++b) { + if (JUMP[i] & UINT64_C(1) << b) { + s0 ^= state[0]; + s1 ^= state[1]; + s2 ^= state[2]; + s3 ^= state[3]; + } + this->operator()(); + } + } + + state[0] = s0; + state[1] = s1; + state[2] = s2; + state[3] = s3; + }; + void long_jump(){ + static const uint64_t LONG_JUMP[] = { + 0x76e15d3efefdcbbf, + 0xc5004e441c522fb3, + 0x77710069854ee241, + 0x39109bb02acbe635 + }; + + uint64_t s0 = 0; + uint64_t s1 = 0; + uint64_t s2 = 0; + uint64_t s3 = 0; + + for (unsigned int i = 0; i < sizeof(LONG_JUMP) / sizeof(*LONG_JUMP); ++i) { + for (int b = 0; b < 64; ++b) { + if (LONG_JUMP[i] & UINT64_C(1) << b) { + s0 ^= state[0]; + s1 ^= state[1]; + s2 ^= state[2]; + s3 ^= state[3]; + } + this->operator()(); + } + } + state[0] = s0; + state[1] = s1; + state[2] = s2; + state[3] = s3; + }; + static constexpr uint64_t min(){ return std::numeric_limits< uint64_t >::min(); } + static constexpr uint64_t max(){ return std::numeric_limits< uint64_t >::max(); } +}; + + +#endif // _RANDOM_GENERATOR_XOSHIRO_256_STAR_STAR_H_ diff --git a/src/primate/include/_random_generator/random_concepts.h b/src/primate/include/_random_generator/random_concepts.h new file mode 100644 index 0000000..faf959e --- /dev/null +++ b/src/primate/include/_random_generator/random_concepts.h @@ -0,0 +1,51 @@ +#ifndef _RANDOM_GENERATOR_RANDOM_CONCEPTS_H +#define _RANDOM_GENERATOR_RANDOM_CONCEPTS_H + +#include +#include // SeedSequence + +// std::initializer_list< std::uint32_t > +template < typename T > +concept LightRandom64Engine = requires(T rne, std::seed_seq& S) { + { rne() } -> std::same_as< std::uint64_t >; // () yields an unsigned 64-bit integer + { rne.seed(S) }; // state is seedeable w/ pointer of integers +} && std::default_initializable< T > && std::uniform_random_bit_generator< T >; + +template < typename T > +concept Stateful64Engine = requires(T rne) { + T::state_size > 0; // has a known positive state size, in terms of number 32 
unsigned integers +} && LightRandom64Engine< T >; + +// See: https://stackoverflow.com/questions/39288595/why-not-just-use-random-device/ +template < typename T > +concept ThreadSafeRBG = requires(T rbg, int tid) { + { rbg.initialize(tid) }; + { rbg.next(tid) } -> std::convertible_to< std::uint_fast64_t >; +}; + +struct Random64EngineConcept { + using result_type = uint64_t; + // virtual const void seed(int) const = 0; + virtual ~Random64EngineConcept() = default; + virtual void seed(std::seed_seq&) = 0; + virtual uint64_t operator()() = 0; + virtual size_t state_size() const = 0; + static constexpr uint64_t min() { return std::numeric_limits< uint64_t >::min(); } + static constexpr uint64_t max() { return std::numeric_limits< uint64_t >::max(); } +}; + +template < LightRandom64Engine T > +struct Random64Engine : public Random64EngineConcept { + using result_type = uint64_t; + T rng; + Random64Engine() : rng(T()) { } + void seed(std::seed_seq& S) override { rng.seed(S); }; + uint64_t operator()() override { return rng(); } + size_t state_size() const override { + if constexpr (Stateful64Engine< T >){ return std::max(T::state_size, size_t(1)); } + return 1; + } + // T* get_rng(){ return &rng; } +}; + +#endif \ No newline at end of file diff --git a/src/primate/include/_random_generator/rne_engines.h b/src/primate/include/_random_generator/rne_engines.h new file mode 100644 index 0000000..689ab3a --- /dev/null +++ b/src/primate/include/_random_generator/rne_engines.h @@ -0,0 +1,17 @@ +#ifndef _RANDOM_GENERATOR_RNE_ENGINES_H +#define _RANDOM_GENERATOR_RNE_ENGINES_H + +#include // mt +// #include "_engines/pcg_random.h" // pcg64 -- this adds a non-trivial amount to compile time +#include "_engines/pcg64.h" +#include "_engines/split_mix64.h" // SplitMix64 +#include "_engines/xoshiro_256_star_star.h" // Xoshiro256 + +// -- REMOVED due to seeding and other issues +// From: https://www.pcg-random.org/posts/cpp-seeding-surprises.html +// This is technically a biased generator, but with 64-bits of entropy it may be ok +// for many applications and it should be blazing fast besides +// using knuth_lcg = std::linear_congruential_engine< uint64_t, 6364136223846793005U, 1442695040888963407U, 0U>; + + +#endif \ No newline at end of file diff --git a/src/primate/include/_random_generator/threadedrng64.h b/src/primate/include/_random_generator/threadedrng64.h new file mode 100644 index 0000000..03bac25 --- /dev/null +++ b/src/primate/include/_random_generator/threadedrng64.h @@ -0,0 +1,93 @@ +#ifndef _RANDOM_GENERATOR_THREADEDRNG64_H_ +#define _RANDOM_GENERATOR_THREADEDRNG64_H_ + +#include // uint64_t +#include // assert +#include // NULL +#include // uniform_random_bit_generator, mt19937_64 +#include // std::function +#include "random_concepts.h" // LightRandom64Engine, Random64Engine +#include "rne_engines.h" // all the engines + +enum RbEngine { sx = 0, xs = 1, pcg = 2, mt = 3 }; + +// Thread-safe random number generator +// This class constructs n_thread copies of a given *random number engine* type (a state machine with a transition + output function) using +// different seed sequences (seed_seq's) generated from the std::random_device, such that one can safely call ThreadedRNG64.next(tid) with the +// give thread id (tid) and obtain a uniformly random unsigned integer with at least 64-bits. 
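+// Usage sketch (illustrative only): one engine per OpenMP thread, each seeded independently, e.g.
+//   ThreadedRNG64 rng(omp_get_max_threads(), /* engine id, see RbEngine above */ 1, /* seed */ 42);
+//   #pragma omp parallel
+//   { std::uint64_t bits = rng.next(omp_get_thread_num()); }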
+// NOTE: RandomNumberEngine == UniformRandomBitGenerator + has .seed(), default constructible, and other things +// template< LightRandom64Engine RNE = std::mt19937_64 > +struct ThreadedRNG64 { + static constexpr size_t num_bits = 64; + int num_threads; + const RbEngine engine_id; + std::vector< Random64EngineConcept* > generators; + ThreadedRNG64(int engine = 0) : engine_id(static_cast< RbEngine >(engine)) { + // std::uniform_random_bit_generator RBG = std::random_device; + int num_threads_ = 1; + initialize(num_threads_); + }; + explicit ThreadedRNG64(int num_threads_, int engine, int seed = -1) : engine_id(static_cast< RbEngine >(engine)) { + initialize(num_threads_, seed); + }; + ~ThreadedRNG64(){ + for (int i = 0; i < num_threads; ++i){ + delete generators[i]; // todo: see if this is causing heap corruption + } + } + auto next(int thread_id) -> std::uint64_t { + return generators[thread_id]->operator()(); + } + + void initialize(int num_threads_, int seed = -1){ + // assert(num_threads_ > 0); + if (num_threads_ == 0){ return; } + num_threads = num_threads_; + generators = std::vector< Random64EngineConcept* >(num_threads, nullptr); + + // Make new generators + for (size_t i = 0; i < generators.size(); ++i){ + switch(engine_id){ + case sx: + static_assert(std::uniform_random_bit_generator< Random64Engine< SplitMix64 > >, "Wrapper RNG engine constraints not met"); + generators[i] = new Random64Engine< SplitMix64 >(); + break; + case xs: + static_assert(std::uniform_random_bit_generator< Random64Engine< Xoshiro256StarStar > >, "Wrapper RNG engine constraints not met"); + generators[i] = new Random64Engine< Xoshiro256StarStar >(); + break; + case pcg: { + static_assert(std::uniform_random_bit_generator< Random64Engine< Pcg64 > >, "Wrapper RNG engine constraints not met"); + generators[i] = new Random64Engine< Pcg64 >(); + break; + } + case mt: + generators[i] = new Random64Engine< std::mt19937_64 >(); + break; + } + } + + // Seeds generators with sources of entropy from a RBG (e.g. 
random_device) + // This seeds the entire state vector of the corresponding RNE's state size / entropy source using rd + auto rdev = std::random_device(); + auto mt = std::mt19937(seed); + std::function< std::uint_fast32_t() > rd; + if (seed < 0){ + rd = [&rdev](){ return rdev(); }; + } else { + rd = [&mt](){ return mt(); }; + } + + // Saturate the full state of the generators with entropy + for (int i = 0; i < num_threads; ++i) { + const auto ssize = generators[i]->state_size(); + std::vector< uint32_t > seed_data(ssize, 0); + std::generate_n(seed_data.begin(), ssize, rd); // generate evenly-distributed 32-bit seeds + std::seed_seq seed_gen(std::begin(seed_data), std::end(seed_data)); + generators[i]->seed(seed_gen); + } + }; +}; + + +#endif // _RANDOM_GENERATOR_THREADEDRNG64_H_ diff --git a/src/primate/include/_random_generator/vector_generator.h b/src/primate/include/_random_generator/vector_generator.h new file mode 100644 index 0000000..392d957 --- /dev/null +++ b/src/primate/include/_random_generator/vector_generator.h @@ -0,0 +1,83 @@ +#ifndef _RANDOM_GENERATOR_RANDOM_ARRAY_GENERATOR_H_ +#define _RANDOM_GENERATOR_RANDOM_ARRAY_GENERATOR_H_ + +#include // uniform_random_bit_generator, normal_distribution +#include // uint64_t +#include // isnan +#include // bitset +#include "omp_support.h" // conditionally enables openmp pragmas +#include "random_concepts.h" // ThreadSafeRBG +// #include "threadedrng64.h" +// #include "rne_engines.h" // all the engines + +using IndexType = unsigned int; +using LongIndexType = unsigned long; +static constexpr long num_bits = 64; + +enum Distribution { rademacher = 0, normal = 1, rayleigh = 2 }; + +// SIMD-vectorized rademacher vector generation; makes N // 64 calls to random bit generator +template< std::floating_point F, ThreadSafeRBG RBG > +void generate_rademacher(const long n, RBG& random_bit_generator, const int thread_id, F* array, F& arr_norm){ + const auto N = static_cast< size_t >(n / RBG::num_bits); + for (size_t i = 0; i < N; ++i) { + std::bitset< RBG::num_bits > ubits { random_bit_generator.next(thread_id) }; + #pragma omp simd + for (size_t j = 0; j < RBG::num_bits; ++j) { + array[i*RBG::num_bits + j] = 2 * int(ubits[j]) - 1; + } + } + // This loop should have less than 64 iterations. + std::bitset< RBG::num_bits > ubits { random_bit_generator.next(thread_id) }; + for (size_t j = N * RBG::num_bits, i = 0; j < size_t(n); ++j, ++i){ + array[j] = (2 * int(ubits[i]) - 1); + } + arr_norm = static_cast< F >(std::sqrt(n)); +} + +// Zero mean, unit variance gaussian +template< std::floating_point F, ThreadSafeRBG RBG > +void generate_normal(const unsigned long n, RBG& bit_generator, const int thread_id, F* array, F& arr_norm){ + static std::normal_distribution d { 0.0, 1.0 }; + // const auto N = static_cast< unsigned long >(n / RBG::num_bits); + auto& gen = *bit_generator.generators[thread_id]; + // #pragma omp simd + for (auto i = static_cast< unsigned long >(0); i < n; ++i){ + array[i] = d(gen); + } + // // This loop should have less than num_bits iterations. 
+ // for (auto j = static_cast< unsigned long >(N * RBG::num_bits), i = static_cast< unsigned long >(0); j < n; ++j, ++i){ + // array[j] = d(gen); + // } + + arr_norm = 0.0; + // #pragma omp simd reduction(+:arr_norm) + for (unsigned long i = 0; i < n; ++i){ + arr_norm += std::pow(array[i], 2); + } + arr_norm = std::sqrt(arr_norm); +} + +// Generates an isotropic random vector for a given thread id +template< std::floating_point F, ThreadSafeRBG RBG > +void generate_isotropic(const Distribution dist_id, const long n, RBG& rbg, const int thread_id, F* array, F& arr_norm){ + switch(dist_id){ + case rademacher: + generate_rademacher(n, rbg, thread_id, array, arr_norm); + break; + case normal: + generate_normal(n, rbg, thread_id, array, arr_norm); + break; + case rayleigh: + // generate_rayleigh(n, bit_generator, thread_id, array, arr_norm); + break; + } + + // Ensure the array has unit norm + const F arr_norm_inv = 1.0 / arr_norm; + #pragma omp simd + for (auto i = static_cast< long >(0); i < n; ++i){ + array[i] *= arr_norm_inv; + } +} +#endif \ No newline at end of file diff --git a/src/primate/include/_trace/hutch.h b/src/primate/include/_trace/hutch.h new file mode 100644 index 0000000..4d19f2e --- /dev/null +++ b/src/primate/include/_trace/hutch.h @@ -0,0 +1,332 @@ +#ifndef _TRACE_HUTCH_H +#define _TRACE_HUTCH_H + +#include +#include // function +#include // max +#include // constants, isnan +#include "omp_support.h" // conditionally enables openmp pragmas +// #include "_operators/linear_operator.h" // LinearOperator +// #include "_orthogonalize/orthogonalize.h" // orth_vector, mod +#include "_random_generator/vector_generator.h" // ThreadSafeRBG, generate_array, Distribution +#include "_lanczos/lanczos.h" +#include "eigen_core.h" +#include +#include +#include + +using us = std::chrono::microseconds; +using dur_seconds = std::chrono::duration< double >; +using hr_clock = std::chrono::high_resolution_clock; + +template < int Iterate = 3 > +double erf_inv(double x) noexcept { + // Strategy: solve f(y) = x - erf(y) = 0 for given x with Newton's method. + // f'(y) = -erf'(y) = -2/sqrt(pi) e^(-y^2) + // Has quadratic convergence, achieving machine precision with ~three iterations. + if constexpr(Iterate == 0){ + // Specialization to get initial estimate; accurate to about 1e-3. + // Based on https://stackoverflow.com/questions/27229371/inverse-error-function-in-c + const double a = std::log((1 - x) * (1 + x)); + const double b = std::fma(0.5, a, 4.120666747961526); + const double c = 6.47272819164 * a; + return std::copysign(std::sqrt(-b + std::sqrt(std::fma(b, b, -c))), x); + } else { + const double x0 = erf_inv< Iterate - 1 >(x); // compile-time recurse + const double fx0 = x - std::erf(x0); + const double pi = std::acos(-1); + double fpx0 = -2.0 / std::sqrt(pi) * std::exp(-x0 * x0); + return x0 - fx0 / fpx0; // = x1 + } +} + +// Work-around to avoid copying for the multi-threaded build +template< LinearOperator Matrix > +auto get_matrix(const Matrix& A){ + // const auto M = is_instance< Matrix, MatrixFunction >{} ? + if constexpr(HasOp< Matrix >){ + return MatrixFunction(A.op, A.f, A.deg, A.rtol, A.orth, A.ncv, A.wgt_method); + } else { + return A; + } +} + +auto get_num_threads(int nt) -> int { + const auto max_threads = omp_get_max_threads(); + nt = nt <= 0 ? 
max_threads : nt; + return std::max(1, int(nt)); +} + +// std::function< bool(int) > +template< std::floating_point F, LinearOperator Matrix, ThreadSafeRBG RBG, typename Lambda, typename Lambda2 > +void monte_carlo_quad( + const Matrix& A, // Any linear operator supporting .matvec() and .shape() methods + const Lambda& f_cb, // Thread-safe callback function with signature f(int i, F sample, F* q) + const Lambda2& stop_check, // Function to check for convergence or early-stopping (takes no arguments) + const int nv, // Number of sample vectors to generate + const Distribution dist, // Isotropic distribution used to generate random vectors + RBG& rng, // Random bit generator + const int num_threads, // Number of threads used to parallelize the computation + const int seed, // Seed for random number generator for determinism + size_t& wall_time // Wall clock time +){ + using VectorF = Eigen::Matrix< F, Dynamic, 1 >; + // using ArrayF = Eigen::Array< F, Dynamic, 1 >; + + // Constants + const auto A_shape = A.shape(); + const size_t n = A_shape.first; + const size_t m = A_shape.second; + + // Set the number of threads + initialize multi-threaded RNG + const auto nt = get_num_threads(num_threads); + omp_set_num_threads(nt); + rng.initialize(nt, seed); + + // Using square-root of max possible chunk size for parallel schedules + unsigned int chunk_size = std::max(int(sqrt(nv / nt)), 1); + + // Monte-Carlo ensemble sampling + int i; + volatile bool stop_flag = false; // early-stop flag for convergence checking + const auto t_start = hr_clock::now(); + #pragma omp parallel shared(A, stop_flag) + { + int tid = omp_get_thread_num(); // thread-id + + auto q_norm = static_cast< F >(0.0); + auto q = static_cast< VectorF >(VectorF::Zero(m)); + + // Parameterize the matrix function + // TODO: this is not a great way to avoid copying matrices / matrix functions, but is needed as + // the former is read-only and can be shared amongst threads, but the latter needs thread-specific storage + const auto M = get_matrix(A); + + #pragma omp for schedule(dynamic, chunk_size) + for (i = 0; i < nv; ++i){ + if (stop_flag){ continue; } + + // Generate isotropic vector (w/ unit norm) + generate_isotropic< F >(dist, m, rng, tid, q.data(), q_norm); + + // Apply quadratic form, using quad() if available + F sample = 0.0; + if constexpr (QuadOperator< Matrix >){ + sample = M.quad(q.data()); // x^T A x + // std::cout << "quad output: " << sample << std::endl; + } else { + auto y = static_cast< VectorF >(VectorF::Zero(n)); + A.matvec(q.data(), y.data()); + sample = y.dot(q); + } + + // Run the user-supplied function (parallel section!) 
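+      // Each index i is visited by exactly one thread under the omp-for schedule, so callbacks that only
+      // write to per-index storage (e.g. estimates[i] in hutch's save_sample below) need no extra locking;
+      // only the stop_check call is serialized via the critical section that follows.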
+ f_cb(i, sample, q.data()); + + // If supplied, check early-stopping condition + #pragma omp critical + { + stop_flag = stop_check(i); + } + } // omp for + } // omp parallel + const auto t_end = hr_clock::now(); + wall_time = duration_cast< us >(dur_seconds(t_end - t_start)).count(); +} + +template< std::floating_point F, LinearOperator Matrix, ThreadSafeRBG RBG > +auto hutch( + const Matrix& A, + RBG& rbg, const int nv, const int dist, const int engine_id, const int seed, + const F atol, const F rtol, + const int num_threads, + const bool use_CLT, + const F* t_scores, + const F z, + F* estimates, + size_t& wall_time +) -> F { + using ArrayF = Eigen::Array< F, Dynamic, 1 >; + + // Save the sample estimates + const auto save_sample = [&estimates](int i, F sample, [[maybe_unused]] F* q){ + estimates[i] = sample; + }; + + // Type-erased function since the call is cheap + // std::function< bool (int) > early_stop; // apparently this will cause problems! + if (atol == 0.0 && rtol == 0.0){ + const auto early_stop = [](int i) -> bool { return false; }; + monte_carlo_quad< F >(A, save_sample, early_stop, nv, static_cast< Distribution >(dist), rbg, num_threads, seed, wall_time); + Eigen::Map< ArrayF > est(estimates, nv); + est *= A.shape().first; + // std::cout << "est sum: " << est.sum() << ", nv: " << nv << std::endl; + F mu_est = est.sum() / nv; + return mu_est; + } else if (use_CLT){ + // Parameterize when to stop using either the CLT over the given confidence level or + // This runs in critical section of the SLQ, so we can depend sequential execution (but i will vary!) + // See: https://math.stackexchange.com/questions/102978/incremental-computation-of-standard-deviation + F mu_est = 0.0, vr_est = 0.0; + F mu_pre = 0.0, vr_pre = 0.0; + int n_samples = 0; // number of estimates computed + // const auto z = std::sqrt(2.0) * erf_inv< 3 >(double(0.95)); + const auto early_stop = [&estimates, &mu_est, &vr_est, &mu_pre, &vr_pre, &n_samples, &t_scores, z, atol, rtol](int i) -> bool { + if (std::isnan(estimates[i])){ return false; } + ++n_samples; + const F denom = (1.0 / F(n_samples)); + const F L = n_samples > 2 ? F(n_samples-2) / F(n_samples-1) : 0.0; + mu_est = denom * (estimates[i] + (n_samples - 1) * mu_pre); + mu_pre = n_samples == 1 ? mu_est : mu_pre; + vr_est = L * vr_pre + denom * std::pow(estimates[i] - mu_pre, 2); // update sample variance + mu_pre = mu_est; + vr_pre = vr_est; + if (n_samples < 3){ + return false; + } else { + const auto sd_est = std::sqrt(vr_est); + const auto score = n_samples < 30 ? 
t_scores[n_samples] : z; + const auto margin_of_error = score * sd_est / std::sqrt(F(n_samples)); // todo: remove sqrt's + // std::cout << "n: " << n << ", mu: " << mu_est << ", ci: [" << mu_est - margin_of_error << ", " << mu_est + margin_of_error << "]"; + // std::cout << "margin/atol: " << margin_of_error << ", " << atol << "\n"; + return margin_of_error <= atol || std::abs(sd_est / mu_est) <= rtol; + } + }; + monte_carlo_quad< F >(A, save_sample, early_stop, nv, static_cast< Distribution >(dist), rbg, num_threads, seed, wall_time); + Eigen::Map< ArrayF > est(estimates, nv); + est *= A.shape().first; + return mu_est * A.shape().first; + } else { + // Use traditional iteration checking, akin to scipy.integrate.quadrature + F mu_est = 0.0, mu_pre = 0.0; + int n_samples = 0; + const auto early_stop = [&estimates, &n_samples, &mu_est, &mu_pre, atol, rtol](int ii) -> bool { // &estimates, &n_samples, &mu_est, &mu_pre, atol, rtol + if (std::isnan(estimates[ii])){ return false; } + ++n_samples; + const F denom = (1.0 / F(n_samples)); + mu_est = denom * (estimates[ii] + (n_samples - 1) * mu_pre); + const bool atol_check = std::abs(mu_est - mu_pre) <= atol; + const bool rtol_check = (std::abs(mu_est - mu_pre) / mu_est) <= rtol; + mu_pre = mu_est; + // std::cout << std::fixed << std::showpoint; + // std::cout << std::setprecision(10); + // std::cout << "n: " << n_samples << ", mu: " << mu_est << ", atol: " << std::abs(mu_est - mu_pre) << ", rtol: " << (std::abs(mu_est - mu_pre) / mu_est) << std::endl; + return atol_check || rtol_check; + }; + monte_carlo_quad< F >(A, save_sample, early_stop, nv, static_cast< Distribution >(dist), rbg, num_threads, seed, wall_time); + Eigen::Map< ArrayF > est(estimates, nv); + est *= A.shape().first; + return mu_est * A.shape().first; + } + // Don't pull down monte_carlo_quad; the early-stop defs need to be local scope for some reason! 
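+  // Stopping-rule summary: atol == rtol == 0 runs all nv samples; use_CLT stops once the t/z-score
+  // confidence-interval half-width drops below atol or the relative standard deviation drops below rtol;
+  // otherwise the quadrature-style rule stops when successive running means differ by less than atol
+  // (absolutely) or rtol (relatively).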
+} + + +// Stochastic Lanczos quadrature method +// std::function +template< std::floating_point F, LinearOperator Matrix, ThreadSafeRBG RBG, typename Lambda > +void slq ( + const Matrix& A, // Any linear operator supporting .matvec() and .shape() methods + const Lambda& f, // Thread-safe function with signature f(int i, F* nodes, F* weights) + const std::function< bool(int) >& stop_check, // Function to check for convergence or early-stopping (takes no arguments) + const int nv, // Number of sample vectors to generate + const Distribution dist, // Isotropic distribution used to generate random vectors + RBG& rng, // Random bit generator + const int lanczos_degree, // Polynomial degree of the Krylov expansion + const F lanczos_rtol, // residual tolerance to consider subspace A-invariant + const int orth, // Number of vectors to re-orthogonalize against <= lanczos_degree + const int ncv, // Number of Lanczos vectors to keep in memory (per-thread) + const int num_threads, // Number of threads used to parallelize the computation + const int seed // Seed for random number generator for determinism +){ + using ArrayF = Eigen::Array< F, Dynamic, 1 >; + using EigenSolver = Eigen::SelfAdjointEigenSolver< DenseMatrix< F > >; + if (ncv < 2){ throw std::invalid_argument("Invalid number of lanczos vectors supplied; must be >= 2."); } + if (ncv < orth+2){ throw std::invalid_argument("Invalid number of lanczos vectors supplied; must be >= 2+orth."); } + + // Constants + const auto A_shape = A.shape(); + const size_t n = A_shape.first; + const size_t m = A_shape.second; + + // Set the number of threads + initialize multi-threaded RNG + const auto nt = num_threads <= 0 ? omp_get_max_threads() : num_threads; + omp_set_num_threads(nt); + rng.initialize(nt, seed); + + // Using square-root of max possible chunk size for parallel schedules + unsigned int chunk_size = std::max(int(sqrt(nv / nt)), 1); + + // Monte-Carlo ensemble sampling + int i; + volatile bool stop_flag = false; // early-stop flag for convergence checking + #pragma omp parallel shared(stop_flag) + { + int tid = omp_get_thread_num(); // thread-id + + // Pre-allocate memory needed for Lanczos iterations + auto q_norm = static_cast< F >(0.0); + auto q = static_cast< ArrayF >(ArrayF::Zero(m)); + auto Q = static_cast< DenseMatrix< F > >(DenseMatrix< F >::Zero(n, ncv)); + auto alpha = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree+1)); + auto beta = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree+1)); + auto nodes = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree)); + auto weights = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree)); + auto solver = EigenSolver(lanczos_degree); + + // Run in parallel + #pragma omp for schedule(dynamic, chunk_size) + for (i = 0; i < nv; ++i){ + if (stop_flag){ continue; } + + // Generate isotropic vector (w/ unit norm) + generate_isotropic< F >(dist, m, rng, tid, q.data(), q_norm); + + // Perform a lanczos iteration (populates alpha, beta) + lanczos_recurrence< F >(A, q.data(), lanczos_degree, lanczos_rtol, orth, alpha.data(), beta.data(), Q.data(), ncv); + + // Obtain nodes + weights via quadrature algorithm + lanczos_quadrature< F >(alpha.data(), beta.data(), lanczos_degree, solver, nodes.data(), weights.data()); + + // Run the user-supplied function (parallel section!) 
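+      // The nodes/weights form a Gaussian quadrature rule for the spectral measure induced by (A, q):
+      // sum_j weights[j] * fn(nodes[j]) estimates q^T fn(A) q for a spectral function fn, which is what
+      // sl_quadrature below records per sample vector.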
+ f(i, q.data(), Q.data(), nodes.data(), weights.data()); + + // If supplied, check early-stopping condition + #pragma omp critical + { + stop_flag = stop_check(i); + } + } + } +} + +template< std::floating_point F, LinearOperator Matrix, ThreadSafeRBG RBG > +void sl_quadrature( + const Matrix& mat, + RBG& rbg, const int nv, const int dist, const int engine_id, const int seed, + const int lanczos_degree, const F lanczos_rtol, const int orth, const int ncv, + const int num_threads, + F* quad_nw +){ + using VectorF = Eigen::Array< F, Dynamic, 1>; + + // Parameterize the quadrature function + Eigen::Map< DenseMatrix< F >> quad_nw_map(quad_nw, lanczos_degree * nv, 2); + const auto quad_f = [lanczos_degree, &quad_nw_map](int i, [[maybe_unused]] F* q, [[maybe_unused]] F* Q, F* nodes, F* weights){ + // printf("iter %0d, tid %0d, q[0] = %.4g, nodes[0] = %.4g\n", i, omp_get_thread_num(), q[0], nodes[0]); + Eigen::Map< VectorF > nodes_v(nodes, lanczos_degree, 1); // no-op + Eigen::Map< VectorF > weights_v(weights, lanczos_degree, 1); // no-op + quad_nw_map.block(i*lanczos_degree, 0, lanczos_degree, 1) = nodes_v; + quad_nw_map.block(i*lanczos_degree, 1, lanczos_degree, 1) = weights_v; + }; + + // Parameterize when to stop (run in critical section) + constexpr auto early_stop = [](int i) -> bool { + return false; + }; + + // Execute the stochastic Lanczos quadrature + slq< F >(mat, quad_f, early_stop, nv, static_cast< Distribution >(dist), rbg, lanczos_degree, lanczos_rtol, orth, ncv, num_threads, seed); +} + +#endif \ No newline at end of file diff --git a/src/primate/include/eigen_core.h b/src/primate/include/eigen_core.h new file mode 100644 index 0000000..4482667 --- /dev/null +++ b/src/primate/include/eigen_core.h @@ -0,0 +1,20 @@ +#ifndef __EIGEN_CORE_H +#define __EIGEN_CORE_H + +#include + +using Eigen::Dynamic; +using Eigen::Ref; +using Eigen::MatrixXf; +using Eigen::MatrixXd; + +template< typename F > +using Vector = Eigen::Matrix< F, Dynamic, 1 >; + +template< typename F > +using Array = Eigen::Array< F, Dynamic, 1 >; + +template< typename F > +using DenseMatrix = Eigen::Matrix< F, Dynamic, Dynamic >; + +#endif \ No newline at end of file diff --git a/src/primate/include/omp_support.h b/src/primate/include/omp_support.h new file mode 100644 index 0000000..ffee824 --- /dev/null +++ b/src/primate/include/omp_support.h @@ -0,0 +1,13 @@ +#ifndef _OMP_SUPPORT_H +#define _OMP_SUPPORT_H + +#ifdef OMP_MULTITHREADED + #include // omp_set_num_threads, omp_get_thread_num +#else + #define omp_get_thread_num() 0 + #define omp_set_num_threads(x) 0 + #define omp_get_max_threads() 1 +#endif + + +#endif \ No newline at end of file diff --git a/src/primate/trace.py b/src/primate/trace.py index b454fcc..ef378f2 100644 --- a/src/primate/trace.py +++ b/src/primate/trace.py @@ -272,7 +272,7 @@ def hutchpp( return 0 ## Convert to a matrix function, if not already - A = matrix_function(A, fun=fun) + A = matrix_function(A, fun=fun, **kwargs) if fun is not None else A ## Setup constants verbose, info = kwargs.get('verbose', False), kwargs.get('info', False) @@ -280,7 +280,7 @@ def hutchpp( nb = (N // 3) if nb == "auto" else nb # number of samples to dedicate to deflation m = (N // 3) if maxiter == "auto" else maxiter # residual samples; default rule uses Hutch++ result f_dtype = (A @ np.zeros(A.shape[1])).dtype if not hasattr(A, "dtype") else A.dtype - info_dict = {} + info_dict = { } ## Sketch Y / Q - use numpy for now, but consider parallelizing MGS later WB = np.random.choice([-1.0, +1.0], size=(N, 
nb)).astype(f_dtype) @@ -296,14 +296,22 @@ def hutchpp( defl_ests = (Q.T @ (A @ Q)).diagonal() tr_defl = np.sum(defl_ests) else: - tr_defl, defl_ests = A.quad_sum(Q) + tr_defl, defl_ests = _trace.quad_sum(A, Q) if fun is None else A.quad_sum(Q) ## Estimate trace of the residual via Girard-Hutchinson on the ## complement of the deflated subspaces, (I - Q @ Q.T)y - A.deflate(Q) - kwargs['info'] = True - kwargs['verbose'] = False - tr_resi, info_dict = hutch(A, **kwargs) + if fun is None or mode == 'full': + ## Full mode == form the full (m x m) matrix and take the diagonal + ## Not memory efficient, but is potentially vectorized, so suitable for relatively small m + WM = np.random.choice([-1.0, +1.0], size=(N, m)).astype(f_dtype) + G = WM - Q @ (Q.T @ WM) + tr_resi = (1 / m) * (G.T @ (A @ G)).trace() + else: + A.deflate(Q) + kwargs['info'] = True + kwargs['verbose'] = False + tr_resi, ID = hutch(A, maxiter=maxiter, **kwargs) + info_dict.update(ID) # if mode == 'full': # ## Full mode == form the full (m x m) matrix and take the diagonal # ## Not memory efficient, but is potentially vectorized, so suitable for relatively small m diff --git a/tests/test_trace.py b/tests/test_trace.py index 3505558..e9f65b3 100644 --- a/tests/test_trace.py +++ b/tests/test_trace.py @@ -196,11 +196,11 @@ def test_quad_sum(): n = 100 A = symmetric(n) Q = np.linalg.qr(A)[0] - test_quads = _trace.quad_sum(A, Q) + _, test_quads = _trace.quad_sum(A, Q) true_quads = (Q.T @ (A @ Q)).diagonal() assert np.allclose(test_quads, true_quads) assert np.isclose(np.sum(test_quads), np.sum(true_quads)) - test_quads = _trace.quad_sum(A, Q[:,:10]) + _, test_quads = _trace.quad_sum(A, Q[:,:10]) true_quads = (Q[:,:10].T @ (A @ Q[:,:10])).diagonal() assert np.allclose(test_quads, true_quads) assert np.isclose(np.sum(test_quads), np.sum(true_quads)) @@ -216,6 +216,8 @@ def test_hutch_pp(): test_trace = hutchpp(A, mode="full", seed=1) assert np.isclose(test_trace, true_trace, atol=1.0) + tr_true = np.sum(np.exp(np.linalg.eigh(A)[0])) + hutchpp(A, fun=np.exp, nb=100, mode="reduced", seed=1) # import timeit # timeit.timeit(lambda: hutchpp(A, mode="full"), number=1000) # timeit.timeit(lambda: hutchpp(A, mode="reduced"), number=1000)