From aa448b307d2e14bcb462b41777f2933e14b59fbd Mon Sep 17 00:00:00 2001 From: peekxc Date: Fri, 16 Feb 2024 10:33:08 -0500 Subject: [PATCH] Initial 0.4.0 release, pending CI --- pyproject.toml | 2 +- src/primate/include/_lanczos/lanczos.h | 312 ++++++++++++++++ .../include/_operators/linear_operator.h | 86 +++++ src/primate/include/_operators/operator.h | 22 ++ .../include/_orthogonalize/orthogonalize.h | 123 +++++++ .../_random_generator/_engines/pcg64.h | 55 +++ .../_random_generator/_engines/split_mix64.h | 38 ++ .../_engines/xoshiro_256_star_star.h | 149 ++++++++ .../_random_generator/random_concepts.h | 51 +++ .../include/_random_generator/rne_engines.h | 17 + .../include/_random_generator/threadedrng64.h | 93 +++++ .../_random_generator/vector_generator.h | 83 +++++ src/primate/include/_trace/hutch.h | 332 ++++++++++++++++++ src/primate/include/eigen_core.h | 20 ++ src/primate/include/omp_support.h | 13 + src/primate/trace.py | 22 +- tests/test_trace.py | 6 +- 17 files changed, 1414 insertions(+), 10 deletions(-) create mode 100644 src/primate/include/_lanczos/lanczos.h create mode 100644 src/primate/include/_operators/linear_operator.h create mode 100644 src/primate/include/_operators/operator.h create mode 100644 src/primate/include/_orthogonalize/orthogonalize.h create mode 100644 src/primate/include/_random_generator/_engines/pcg64.h create mode 100644 src/primate/include/_random_generator/_engines/split_mix64.h create mode 100644 src/primate/include/_random_generator/_engines/xoshiro_256_star_star.h create mode 100644 src/primate/include/_random_generator/random_concepts.h create mode 100644 src/primate/include/_random_generator/rne_engines.h create mode 100644 src/primate/include/_random_generator/threadedrng64.h create mode 100644 src/primate/include/_random_generator/vector_generator.h create mode 100644 src/primate/include/_trace/hutch.h create mode 100644 src/primate/include/eigen_core.h create mode 100644 src/primate/include/omp_support.h diff --git a/pyproject.toml b/pyproject.toml index 9f9daef..bed2c97 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = ['meson-python', 'wheel', 'ninja', 'pybind11', 'numpy'] # 'pythran-op [project] name = "scikit-primate" -version = "0.3.6" +version = "0.4.0" readme = "README.md" classifiers = [ "Intended Audience :: Science/Research", diff --git a/src/primate/include/_lanczos/lanczos.h b/src/primate/include/_lanczos/lanczos.h new file mode 100644 index 0000000..5e7af63 --- /dev/null +++ b/src/primate/include/_lanczos/lanczos.h @@ -0,0 +1,312 @@ +#ifndef _LANCZOS_LANCZOS_H +#define _LANCZOS_LANCZOS_H + +#include +#include // function +#include // max +#include "_operators/linear_operator.h" // LinearOperator +#include "_orthogonalize/orthogonalize.h" // orth_vector, mod +// #include "_random_generator/vector_generator.h" // ThreadSafeRBG, generate_array + +#include "omp_support.h" // conditionally enables openmp pragmas +#include "eigen_core.h" +#include +#include + +using std::function; + +template< typename T > +using AdjSolver = Eigen::SelfAdjointEigenSolver< T >; + +// Krylov dimension 'deg' should be at least 1 and at most dimension of the operator +// Precondition: None +constexpr int param_deg(const int deg, const std::pair< size_t, size_t > dim){ + return std::max(1, std::min(deg, int(dim.first))); +} + +// Need to allocate at least 2 Lanczos vectors, should never need more than +// Precondition: deg = param_deg(deg) +constexpr int param_ncv(const int ncv, const int deg, const std::pair< size_t, size_t > 
dim){ + return std::min(std::max(ncv, 2), std::min(deg, int(dim.first))); +} + +// Orth should be strictly less than ncv, as there are only ncv Lanczos vectors in memory +// If negative or larger than the Krylov dimension, orthogonalize against the maximal number of distinct Lanczos vectors +// Precondition: deg = param_deg(deg) and ncv = param_ncv(ncv) +constexpr int param_orth(const int orth, const int deg, const int ncv, const std::pair< size_t, size_t > dim){ + if (orth < 0 || orth > deg){ return std::min(deg, ncv - 1); } + return std::min(orth, ncv - 1); // should only orthogonalize against in-memory Lanczos vectors +} + +// Paige's A27 variant of the Lanczos method +// Computes the first k elements (a,b) := (alpha,beta) of the tridiagonal matrix T(a,b) where T = Q^T A Q +// Precondition: orth < ncv <= deg and ncv >= 2. +template< std::floating_point F, LinearOperator Matrix > +void lanczos_recurrence( + const Matrix& A, // Symmetric linear operator + F* q, // vector to expand the Krylov space K(A, q) + const int deg, // Dimension of the Krylov subspace to capture + const F rtol, // Tolerance of residual error for early-stopping the iteration. + const int orth, // Number of *additional* vectors to orthogonalize against + F* alpha, // Output diagonal elements of T of size A.shape[1]+1 + F* beta, // Output subdiagonal elements of T of size A.shape[1]+1; should be 0; + F* V, // Output matrix for Lanczos vectors (column-major) + const size_t ncv // Number of Lanczos vectors pre-allocated (must be at least 2) +){ + using VectorF = Eigen::Matrix< F, Dynamic, 1 >; + + // Constants ` + const auto A_shape = A.shape(); + const size_t n = A_shape.first; + const size_t m = A_shape.second; + const F residual_tol = std::sqrt(n) * rtol; + + // Setup views + Eigen::Map< DenseMatrix< F > > Q(V, n, ncv); // Lanczos vectors + Eigen::Map< VectorF > v(q, m, 1); // map initial vector (no-op) + const auto Q_ref = Eigen::Ref< const DenseMatrix< F > >(Q); // const view + + // Setup for first iteration + std::array< int, 3 > pos = { int(ncv) - 1, 0, 1 }; // Indices for the recurrence + Q.col(pos[0]) = static_cast< VectorF >(VectorF::Zero(n)); // Ensure previous is 0 + Q.col(0) = v.normalized(); // Load unit-norm v as q0 + beta[0] = 0.0; // Ensure beta_0 is 0 + + for (int j = 0; j < deg; ++j) { + + // Apply the three-term recurrence + auto [p,c,n] = pos; // previous, current, next + A.matvec(Q.col(c).data(), v.data()); // v = A q_c + v -= beta[j] * Q.col(p); // q_n = v - b q_p + alpha[j] = Q.col(c).dot(v); // projection size of < qc, qn > + v -= alpha[j] * Q.col(c); // subtract projected components + + // Re-orthogonalize q_n against previous orth lanczos vectors, up to ncv-1 + if (orth > 0) { + auto qn = Eigen::Ref< Vector< F > >(v); + orth_vector(qn, Q_ref, c, orth, true); + } + + // Early-stop criterion is when K_j(A, v) is near invariant subspace. 
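+    // Here v = A q_c - beta_j q_p - alpha_j q_c (plus any re-orthogonalization corrections), so
+    // beta_{j+1} = ||v|| is the residual of the three-term recurrence. Once it falls below
+    // sqrt(n) * rtol, the next Lanczos vector lies (numerically) in span(Q), i.e. the Krylov
+    // subspace is A-invariant and T(alpha, beta) is already exact, so the iteration stops early.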
+ beta[j+1] = v.norm(); + if (beta[j+1] < residual_tol || (j+1) == deg) { // additional break prevents overriding qn + break; + } + Q.col(n) = v / beta[j+1]; // normalize such that Q stays orthonormal + + // Cyclic left-rotate to update the working column indices + std::rotate(pos.begin(), pos.begin() + 1, pos.end()); + pos[2] = mod(j+2, ncv); + } +} + +enum weight_method { golub_welsch = 0, fttr = 1 }; + +// NOTE: one idea to reduce memory is to use the matching moment idea +// - Take T - \lambda I for some eigenvalue \lambda; then dim(null(T- \lambda I)) = 1, thus +// - we can solve for (T - \lambda I)x = 0 for x repeatedly, for all \lambda, and take x[0] to be the quadrature weight + +// FTTR +// Uses the Lanczos method to obtain Gaussian quadrature estimates of the spectrum of an arbitrary operator +template< std::floating_point F > +auto lanczos_quadrature( + const F* alpha, // Input diagonal elements of T of size k + const F* beta, // Output subdiagonal elements of T of size k, whose non-zeros start at index 1 + const int k, // Size of input / output elements + AdjSolver< DenseMatrix< F > >& solver, // Solver to use. Assumes workspace has been allocated + F* nodes, // Output nodes of the quadrature + F* weights, // Output weights of the quadrature + const weight_method method = golub_welsch // The method of computing the weights +) -> void { + using VectorF = Eigen::Array< F, Dynamic, 1>; + assert(beta[0] == 0.0); + + Eigen::Map< const VectorF > a(alpha, k); // diagonal elements + Eigen::Map< const VectorF > b(beta+1, k-1); // subdiagonal elements (offset by 1!) + + // Golub-Welsch approach: just compute eigen-decomposition from T using QR steps + if (method == golub_welsch){ + // std::cout << " GW "; + solver.computeFromTridiagonal(a, b, Eigen::DecompositionOptions::ComputeEigenvectors); + auto theta = static_cast< VectorF >(solver.eigenvalues()); // Rayleigh-Ritz values == nodes + auto tau = static_cast< VectorF >(solver.eigenvectors().row(0)); + tau *= tau; + std::copy(theta.begin(), theta.end(), nodes); + std::copy(tau.begin(), tau.end(), weights); + } else { + // std::cout << " FTTR "; + // Uses the Foward Three Term Recurrence (FTTR) approach + solver.computeFromTridiagonal(a, b, Eigen::DecompositionOptions::EigenvaluesOnly); + auto theta = static_cast< VectorF >(solver.eigenvalues()); // Rayleigh-Ritz values == nodes + std::copy(theta.begin(), theta.end(), nodes); + + // Compute weights via FTTR + FTTR_weights(theta.data(), alpha, beta, k, weights); + } +}; + +template< std::floating_point F, LinearOperator Matrix > +struct MatrixFunction { + // static const bool owning = false; + using value_type = F; + using VectorF = Eigen::Matrix< F, Dynamic, 1 >; + using ArrayF = Eigen::Array< F, Dynamic, 1 >; + using EigenSolver = Eigen::SelfAdjointEigenSolver< DenseMatrix< F > >; + + // Use non-owning class here rather than copy-constructor to make parallizing easier + // See: https://stackoverflow.com/questions/35770357/storing-const-reference-to-an-object-in-class + // Also see: https://zpjiang.me/2020/01/20/const-reference-vs-pointer/ + const Matrix& op; + // const Matrix op; + + // Fields + // std::function< F(F) > f; + std::function< void(F*, const size_t) > f; + bool native_f = false; + const int deg; + const int ncv; + F rtol; + int orth; + weight_method wgt_method; + std::function< void(F*, const size_t) > transform; + + MatrixFunction(const Matrix& A, const std::function< void(F*, const size_t) > fun, int lanczos_degree, F lanczos_rtol, int _orth, int _ncv, bool native, 
weight_method _method = golub_welsch) + : op(A), f(fun), native_f(native), + deg(param_deg(lanczos_degree, A.shape())), + ncv(param_ncv(_ncv, deg, A.shape())), + rtol(lanczos_rtol), + orth(param_orth(_orth, deg, ncv, A.shape())), + wgt_method(_method) + { + // Pre-allocate all but Q memory needed for Lanczos iterations + alpha = static_cast< ArrayF >(ArrayF::Zero(deg+1)); + beta = static_cast< ArrayF >(ArrayF::Zero(deg+1)); + nodes = static_cast< ArrayF >(ArrayF::Zero(deg)); + weights = static_cast< ArrayF >(ArrayF::Zero(deg)); + solver = EigenSolver(deg); + transform = [](F* v, const size_t N){ return; }; + }; + + MatrixFunction(Matrix&& A, const std::function< void(F*, const size_t) > fun, int lanczos_degree, F lanczos_rtol, int _orth, int _ncv, bool native, weight_method _method = golub_welsch) + : op(std::move(A)), f(fun), native_f(native), + deg(param_deg(lanczos_degree, A.shape())), + ncv(param_ncv(_ncv, deg, A.shape())), + rtol(lanczos_rtol), + orth(param_orth(_orth, deg, ncv, A.shape())), + wgt_method(_method) { + // Pre-allocate all but Q memory needed for Lanczos iterations + alpha = static_cast< ArrayF >(ArrayF::Zero(deg+1)); + beta = static_cast< ArrayF >(ArrayF::Zero(deg+1)); + nodes = static_cast< ArrayF >(ArrayF::Zero(deg)); + weights = static_cast< ArrayF >(ArrayF::Zero(deg)); + solver = EigenSolver(deg); + transform = [](F* v, const size_t N){ return; }; + }; + + // Approximates v |-> f(A)v via a limited degree Lanczos iteration + void matvec(const F* v, F* y) const noexcept { + // if (op == nullptr){ return; } + + // By default, Q is not allocated in constructor, as quad may used less memory + // For all calls after the first matvec(), Eigen promises this is a no-op + // Note we *need* Q to have exactly deg columns for the matvec approx + if (Q.cols() < deg){ Q.resize(op.shape().first, deg); } + + // Inputs / outputs + Eigen::Map< const VectorF > v_map(v, op.shape().second); + Eigen::Map< VectorF > y_map(y, op.shape().first); + + // Lanczos iteration: save v norm + VectorF v_copy = v_map; // save copy of input + transform(v_copy.data(), v_copy.size()); // transform it, if necessary + const F v_scale = v_copy.norm(); // save its norm + + // Apply Lanczos + lanczos_recurrence< F >(op, v_copy.data(), deg, rtol, orth, alpha.data(), beta.data(), Q.data(), deg); + + // Note: Maps are used here to offset the pointers; they should be no-ops anyways + Eigen::Map< ArrayF > a(alpha.data(), deg); // diagonal elements + Eigen::Map< ArrayF > b(beta.data()+1, deg-1); // subdiagonal elements (offset by 1!) 
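+    // With T = tridiag(alpha, beta) and eigendecomposition T = V diag(theta) V^T, the lines below
+    // assemble the degree-`deg` Lanczos approximation f(A)v ~= ||v|| * Q V f(diag(theta)) V^T e_1,
+    // where Q holds the Lanczos vectors produced by the recurrence above.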
+ solver.computeFromTridiagonal(a, b, Eigen::DecompositionOptions::ComputeEigenvectors); + + // Apply the spectral function (in-place) to Rayleigh-Ritz values (nodes) + auto theta = static_cast< ArrayF >(solver.eigenvalues()); + // theta.unaryExpr(f); // this doesn't always work for some reason + // std::transform(theta.begin(), theta.end(), theta.begin(), f); + f(theta.data(), theta.size()); + + // The approximation v |-> f(A)v + const auto V = static_cast< DenseMatrix< F > >(solver.eigenvectors()); // maybe dont cast here + auto v_mod = static_cast< ArrayF >(V.row(0).array()); + v_mod *= theta; + y_map = Q * static_cast< VectorF >(V * v_mod.matrix()); // equivalent to Q V diag(sf(theta)) V^T e_1 + y_map.array() *= v_scale; // re-scale + } + + void matmat(const F* X, F* Y, const size_t k) const noexcept { + // if (op == nullptr){ return; } + Eigen::Map< const DenseMatrix< F > > XM(X, op.shape().second, k); + Eigen::Map< DenseMatrix< F > > YM(Y, op.shape().first, k); + for (size_t j = 0; j < k; ++j){ + matvec(XM.col(j).data(), YM.col(j).data()); + } + } + + // Approximates v^T A v + auto quad(const F* v) const noexcept -> F { + // if (op == nullptr){ return; } + + // Only allocate NCV columns as necessary -- no-op after first resizing + if (Q.cols() < ncv){ Q.resize(op.shape().first, ncv); } + + // Save copy of v + its norm + Eigen::Map< const VectorF > v_map(v, op.shape().second); // no-op + VectorF v_copy = v_map; // save copy; needs to be norm-1 for Lanczos + quad to work + transform(v_copy.data(), v_copy.size()); // transform as needed + const F v_scale = v_copy.norm(); + v_copy.normalize(); + + // Execute lanczos method + Golub-Welsch algorithm + lanczos_recurrence< F >(op, v_copy.data(), deg, rtol, orth, alpha.data(), beta.data(), Q.data(), ncv); + lanczos_quadrature< F >(alpha.data(), beta.data(), deg, solver, nodes.data(), weights.data(), wgt_method); + + // Apply f to the nodes and sum + // std::transform(nodes.begin(), nodes.end(), nodes.begin(), f); + f(nodes.data(), nodes.size()); + + // std::cout << "f(nodes): " << nodes << std::endl; + // std::cout << "quad: " << F((nodes * weights).sum()) << std::endl; + // std::cout << "vscale**2 " << std::pow(v_scale, 2) << std::endl; + return std::pow(v_scale, 2) * (nodes * weights).sum(); + } + + // Returns (rows, columns) + auto shape() const noexcept -> std::pair< size_t, size_t > { + // if (op == nullptr){ return std::make_pair< size_t, size_t >(0, 0); } + return op.shape(); + } + + // private: // Internal state to re-use + mutable DenseMatrix< F > Q; + mutable ArrayF alpha; + mutable ArrayF beta; + mutable ArrayF nodes; + mutable ArrayF weights; + mutable EigenSolver solver; +}; + +// Approximates the action v |-> f(A)v via the Lanczos method +template< std::floating_point F, LinearOperator Matrix > +void matrix_approx( + const Matrix& A, // LinearOperator + std::function< F(F) > sf, // the spectral function + const F* v, // the input vector + const int lanczos_degree, // Polynomial degree of the Krylov expansion + const F lanczos_rtol, // residual tolerance to consider subspace A-invariant + const int orth, // Number of vectors to re-orthogonalize against <= lanczos_degree + F* y // Output vector +){ + MatrixFunction< F, Matrix >(A, sf, lanczos_degree, lanczos_rtol, orth, lanczos_degree).matvec(v, y); +}; + +#endif \ No newline at end of file diff --git a/src/primate/include/_operators/linear_operator.h b/src/primate/include/_operators/linear_operator.h new file mode 100644 index 0000000..c389bc2 --- /dev/null +++ 
b/src/primate/include/_operators/linear_operator.h @@ -0,0 +1,86 @@ + +#ifndef _OPERATORS_H +#define _OPERATORS_H + +#include +#include +#include +#include + +using std::pair; +using std::vector; + +template +struct TypeString; + +template <> struct TypeString< float > { + static constexpr const char* value = "float32"; +}; + +template <> struct TypeString< double > { + static constexpr const char* value = "float64"; +}; + +// Generalizes concepts from: https://ameli.github.io/imate/generated/imate.Matrix.html#imate-matrix +template < typename T, typename F = typename T::value_type > +concept LinearOperator = requires(T op, const F* input, F* output) { + { op.matvec(input, output) }; // y = A x + { op.shape() } -> std::convertible_to< std::pair< size_t, size_t > >; +}; + +template < typename T, typename F = typename T::value_type > +concept AdjointOperator = requires(T op, const F* input, F* output) { + { op.rmatvec(input, output) }; // y = A^T x +} && LinearOperator< T, F >; + +template < typename T, typename F = typename T::value_type > +concept LinearAdditiveOperator = requires(T op, const F* input, const F alpha, F* output) { + { op.matvec_add(input, alpha, output) }; // y = y + \alpha * Ax +} && LinearOperator< T, F >; + +template < typename T, typename F = typename T::value_type > +concept AdjointAdditiveOperator = requires(T op, const F* input, const F alpha, F* output) { + { op.rmatvec_add(input, alpha, output) }; // y = y + \alpha * A^T x +} && AdjointOperator< T, F >; + +template < typename T, typename F = typename T::value_type > +concept Operator = + LinearOperator< T, F > || + AdjointOperator< T, F > || + LinearAdditiveOperator< T, F > || + AdjointAdditiveOperator< T, F > +; + +template < typename T, typename F = typename T::value_type > +concept QuadOperator = requires(T op, const F* input) { + { op.quad(input) } -> std::convertible_to< F >; // v^T A v ; todo: make optional if matvec available? 
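+  // Satisfied by, e.g., MatrixFunction< F, Matrix > in _lanczos/lanczos.h, whose quad() runs a
+  // degree-limited Lanczos recurrence followed by Gaussian quadrature on the resulting tridiagonal T.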
+}; + +template < typename T, typename F = typename T::value_type > +concept SupportsMatrixMult = requires(T op, const F* input, F* output, const int k) { + { op.matmat(input, output, k) }; +}; + +// Represents the operator (A + tB) for parameter T = { t1, t2, ..., tn } +template < typename T, typename F = typename T::value_type > +concept AffineOperator = requires(T op, F t) { + T::relation_known; // -> std::convertible_to< bool >; + { op.set_parameter(t) }; + // { op.get_num_parameters() } -> std::convertible_to< size_t >; +} && Operator< T, F >; + +// TODO: fix +template < typename T > +concept HasOp = requires(T op) { + requires std::is_member_object_pointer::value; +}; + +// Used for hard-type checking +// From: https://stackoverflow.com/questions/44012938/how-to-tell-if-a-type-is-an-instance-of-a-specific-template-class +// template class> +// struct is_instance : public std::false_type {}; + +// template class U> +// struct is_instance, U> : public std::true_type {}; + +#endif \ No newline at end of file diff --git a/src/primate/include/_operators/operator.h b/src/primate/include/_operators/operator.h new file mode 100644 index 0000000..c536376 --- /dev/null +++ b/src/primate/include/_operators/operator.h @@ -0,0 +1,22 @@ +#ifndef _OPERATOR_H +#define _OPERATOR_H + +#include "linear_operator.h" + +// TODO: Use CRTP to make the code DRY +// template< LinearOperator Matrix > +// struct Operator { + + +// Operator() = delete; +// ~Operator() = default; + +// template< typename Lambda > +// void matvec(){ +// static_cast(this)->matvec(); +// } + +// } + + +#endif \ No newline at end of file diff --git a/src/primate/include/_orthogonalize/orthogonalize.h b/src/primate/include/_orthogonalize/orthogonalize.h new file mode 100644 index 0000000..90301b7 --- /dev/null +++ b/src/primate/include/_orthogonalize/orthogonalize.h @@ -0,0 +1,123 @@ +#ifndef _ORTHOGONALIZE_ORTHOGONALIZE_H +#define _ORTHOGONALIZE_ORTHOGONALIZE_H + +#include // std::floating_point +// #include // std::isnan +#include "eigen_core.h" // DenseMatrix, EigenCore +#include +#include +#include + +// Emulate python modulus behavior since C++ '%' is not a true modulus +constexpr inline auto mod(int a, int b) noexcept -> int { + return (b + (a % b)) % b; +} + +// template< std::floating_point F > +// inline auto orth_poly_weight(const F x, const F mu_rec_sqrt, const F* a, const F* b, F* tbl, const int n) noexcept -> F { +// // const auto mu_rec_sqrt = 1.0 / std::sqrt(mu) +// tbl[0] = mu_rec_sqrt; +// tbl[1] = (x - a[0]) * mu_rec_sqrt / b[1]; +// F w = std::pow(tbl[0], 2) + std::pow(tbl[1], 2); +// for (int i = 2; i < n; ++i){ +// std::cout << i << ": (((" << x << " - " << a[i-1] << ") * " << tbl[i-1] << ") - " << b[i-1] << " * " << tbl[i-2] << ") / " << b[i] << std::endl; +// // tbl[i] = (((x - a[i-1]) * tbl[i-1]) - b[i-1] * tbl[i-2]) / b[i]; +// F s = (x - a[i-1]) / b[i]; +// F t = -b[i-1] / b[i]; +// tbl[i] = s * tbl[i-1] + t * tbl[i-2]; +// w += std::pow(tbl[i], 2); +// } +// return 1.0 / w; +// } + +template< std::floating_point F > +inline void poly(F x, const F mu_sqrt_rec, const F* a, const F* b, F* z, const size_t n) noexcept { + // assert(z.size() == a.size()); + z[0] = mu_sqrt_rec; + z[1] = (x - a[0]) * z[0] / b[1]; + for (size_t i = 2; i < n; ++i) { + // F zi = ((x - a[i-1]) * z[i-1] - b[i-1] * z[i-2]) / b[i]; + // Slightly more numerically stable way of doing the above + F s = (x - a[i-1]) / b[i]; + F t = -b[i-1] / b[i]; + z[i] = s * z[i-1] + t * z[i-2]; + // std::cout << "(" << x << ") " << i << ": " << s << " * " 
<< z[i-1] << " + " << t << " * " << z[i-2]; + // std::cout << " -> " << z[i] << std::endl; + } +} + +template< std::floating_point F > +void FTTR_weights(const F* theta, const F* alpha, const F* beta, const size_t k, F* weights) { + // assert(ew.size() == a.size()); + const auto a = Eigen::Map< const Array< F > >(alpha, k); + const auto b = Eigen::Map< const Array< F > >(beta, k); + const auto ew = Eigen::Map< const Array< F > >(theta, k); + const F mu_0 = ew.abs().sum(); + const F mu_sqrt_rec = 1.0 / std::sqrt(mu_0); + // std::cout << std::fixed << std::showpoint; + // std::cout << std::setprecision(15); + // std::cout << "---- ACTUAL --- ( " << sizeof(F) << ")" << std::endl; + // std::cout << "a: " << a.matrix().transpose() << std::endl; + // std::cout << "b: " << b.matrix().transpose() << std::endl; + // std::cout << "ew: " << ew.matrix().transpose() << std::endl; + // std::cout << "mu_0: " << mu_0 << std::endl; + Array< F > p(a.size()); + for (size_t i = 0; i < k; ++i){ + poly(theta[i], mu_sqrt_rec, a.data(), b.data(), p.data(), a.size()); + F weight = 1.0 / p.square().sum(); + weights[i] = weight / mu_0; + // std::cout << i << ": (x: " << theta[i] << ", w: " << weight << ") p: " << p.matrix().transpose() << std::endl; + } +} + + +// Orthogonalizes v with respect to columns in U via modified gram schmidt +// Cyclically projects v onto the columns U[:,i:(i+p)] = u_i, u_{i+1}, ..., u_{i+p}, removing from v the components +// of the vector projections. If any index i, ..., i+p exceeds the number of columns of U, the indices are cycled. +// Both near-zero projections and columns of U with near-zero norm are ignored to avoid collapsing v to the trivial vector. +// Eigen::Ref use based on: https://stackoverflow.com/questions/21132538/correct-usage-of-the-eigenref-class +template< std::floating_point F > +void orth_vector( + Ref< Vector< F > > v, // input/output vector + const Ref< const DenseMatrix< F > >& U, // matrix of vectors to project onto + const int start_idx, // starting column index + const int p, // number of projections + const bool reverse = false // whether to cycle through the columns of U backwards +) { + const int n = (int) U.rows(); + const int m = (int) U.cols(); + const auto tol = 2 * std::numeric_limits< F >::epsilon() * std::sqrt(n); // numerical tolerance for orthogonality + + // Successively subtracting the projection of v onto the first p columns starting at start_idx + // If projection or the target vector is near-zero, ignore and continue, as numerical orthogonality is already met + const int diff = reverse ? -1 : 1; + for (int i = mod(start_idx, m), c = 0; c < p; ++c, i = mod(i + diff, m)){ + const auto u_norm = U.col(i).squaredNorm(); // norm of u_i + const auto s_proj = v.dot(U.col(i)); // < v, u_i > + // should protect against nan and 0 vectors, even is isnan(u_norm) is true + if (u_norm > tol && std::abs(s_proj) > tol){ + v -= (s_proj / u_norm) * U.col(i); + } + } +} + + +// Modified Gram-Schmidt in-place +// G. W. Stewart, "Matrix Algorithms, Volume 1", SIAM, 1998. 
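+// The variant below orthogonalizes column k against the already-orthogonalized columns 0..k-1 one
+// projection at a time (rather than against the original columns all at once, as in classical
+// Gram-Schmidt), which keeps the loss of orthogonality roughly proportional to eps * cond(U)
+// instead of eps * cond(U)^2.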
+template< std::floating_point F > +void modified_gram_schmidt(Eigen::Ref< DenseMatrix< F > > U, [[maybe_unused]] const int s = 0){ + // const int n = static_cast< const int >(U.rows()); + const int m = static_cast< const int >(U.cols()); + // auto i = mod(s, m); + auto R = DenseMatrix< F >(m, m); + for (int k = 0; k < m; ++k){ + for (int i = 0; i < k; ++i){ + R(i,k) = U.col(i).dot(U.col(k)); + U.col(k) -= R(i,k) * U.col(i); + } + R(k,k) = U.col(k).norm(); + U.col(k) /= R(k,k); + } +} + +#endif \ No newline at end of file diff --git a/src/primate/include/_random_generator/_engines/pcg64.h b/src/primate/include/_random_generator/_engines/pcg64.h new file mode 100644 index 0000000..d4f5e6e --- /dev/null +++ b/src/primate/include/_random_generator/_engines/pcg64.h @@ -0,0 +1,55 @@ +// PCG64.h +// Adapted from C-implementation: (c) 2014 M.E. O'Neill / pcg-random.org +// Licensed under Apache License 2.0 (NO WARRANTY, etc. see website) +#ifndef _RANDOM_GENERATOR_PCG64_H_ +#define _RANDOM_GENERATOR_PCG64_H_ + +#include // uint64_t +#include // assert +#include // std::time +#include // std::seed_seq +#include // array + +struct Pcg64 { + using result_type = uint64_t; + static constexpr size_t state_size = 2; // size of the entropy/state size, in units of 32-bits + uint64_t state; + uint64_t inc; + + Pcg64(){ + uint64_t seed = static_cast(std::time(0)) + static_cast(std::clock()); + state = (seed << 32) | seed; + inc = (state << 1u) | 1u; + }; + + void seed(std::seed_seq& seed_s) { + std::array< std::uint32_t, 2 > seeds = { 0, 0 }; + seed_s.generate(seeds.begin(), seeds.end()); + state = static_cast< std::uint64_t >(seeds[0]); + state = state << 32; + state = state | static_cast< std::uint64_t >(seeds[1]); + inc = (state << 1u) | 1u; + }; + + uint32_t _generate32() noexcept { + uint64_t oldstate = state; + // Advance internal state + state = oldstate * 6364136223846793005ULL + (inc|1); + // Calculate output function (XSH RR), uses old state for max ILP + uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u; + uint32_t rot = oldstate >> 59u; + return (xorshifted >> rot) | (xorshifted << ((-rot) & 31)); + }; + + uint64_t operator()() noexcept { + uint64_t z = _generate32(); + z = z << 32; + z = z | static_cast< std::uint64_t >(_generate32()); + return z; + } + + static constexpr uint64_t min(){ return std::numeric_limits< uint64_t >::min(); } + static constexpr uint64_t max(){ return std::numeric_limits< uint64_t >::max(); } +}; + +#endif // _RANDOM_GENERATOR_PCG64_H_ diff --git a/src/primate/include/_random_generator/_engines/split_mix64.h b/src/primate/include/_random_generator/_engines/split_mix64.h new file mode 100644 index 0000000..78eb7bd --- /dev/null +++ b/src/primate/include/_random_generator/_engines/split_mix64.h @@ -0,0 +1,38 @@ +#ifndef _RANDOM_GENERATOR_SPLIT_MIX_64_H_ +#define _RANDOM_GENERATOR_SPLIT_MIX_64_H_ + +#include // uint64_t +#include // assert +#include // std::time +#include // std::seed_seq +#include // array + + +// From: https://prng.di.unimi.it/splitmix64.c +struct SplitMix64 { + // static constexpr typename result_type = uint64_t; + using result_type = uint64_t; + static constexpr size_t state_size = 2; // size of the entropy/state size, in units of 32-bits + uint64_t state; + SplitMix64(){ + uint64_t seed = static_cast(std::time(0)) + static_cast(std::clock()); + state = (seed << 32) | seed; + }; + void seed(std::seed_seq& state_) { + std::array< std::uint32_t, 2 > seeds = { 0, 0 }; + state_.generate(seeds.begin(), seeds.end()); + state = static_cast< 
std::uint64_t >(seeds[0]); + state = state << 32; + state = state | static_cast< std::uint64_t >(seeds[1]); + }; + uint64_t operator()() noexcept { + uint64_t z = (state += 0x9e3779b97f4a7c15); + z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9; + z = (z ^ (z >> 27)) * 0x94d049bb133111eb; + return z ^ (z >> 31); + }; + static constexpr uint64_t min(){ return std::numeric_limits< uint64_t >::min(); } + static constexpr uint64_t max(){ return std::numeric_limits< uint64_t >::max(); } +}; + +#endif // _RANDOM_GENERATOR_SPLIT_MIX_64_H_ diff --git a/src/primate/include/_random_generator/_engines/xoshiro_256_star_star.h b/src/primate/include/_random_generator/_engines/xoshiro_256_star_star.h new file mode 100644 index 0000000..e0acb78 --- /dev/null +++ b/src/primate/include/_random_generator/_engines/xoshiro_256_star_star.h @@ -0,0 +1,149 @@ +/* + * SPDX-FileCopyrightText: Copyright 2021, Siavash Ameli , Matt Piekenbrock + * SPDX-License-Identifier: BSD-3-Clause + * SPDX-FileType: SOURCE + * + * This program is free software: you can redistribute it and/or modify it + * under the terms of the license found in the LICENSE.txt file in the root + * directory of this source tree. + */ + + +#ifndef _RANDOM_GENERATOR_XOSHIRO_256_STAR_STAR_H_ +#define _RANDOM_GENERATOR_XOSHIRO_256_STAR_STAR_H_ + + +#include +#include // uint64_t, UINT64_C +#include // NULL +#include "./split_mix64.h" // SplitMix64 + +// stdint.h in old compilers (e.g. gcc 4.4.7) does not declare UINT64_C macro. +#ifndef UINT64_C + #define UINT64_C(c) static_cast(c) +#endif + +// ===================== +// Xoshiro 256 Star Star +// ===================== + +/// \class Xoshiro256StarStar +/// +/// \brief Pseudo-random integer generator. This class generates 64-bit +/// integer using Xoshiro256** algorithm. +/// +/// \details The Xoshiro256** algorithm has 256-bit state space, and passes all +/// statistical tests, including the BigCrush. The state of this class +/// is initialized using \c SplitMix64 random generator. +/// +/// A very similar method to Xoshiro256** is Xoshiro256++ which has +/// the very same properties and speed as the Xoshiro256**. An +/// alternative method is Xoshiro256+, which is 15% faster, but it +/// suffers linear dependency of the lower 4 bits. It is usually used +/// for generating floating numbers using the upper 53 bits and +/// discard the lower bits. 
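+///
+///          The class also provides jump() and long_jump() below, which advance the state by
+///          2^128 and 2^192 calls of operator() respectively and can be used to carve out
+///          non-overlapping subsequences when running one generator per thread.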
+/// +/// The Xoshiro256** algorithm is develped by David Blackman and +/// Sebastiano Vigna (2018) and the source code can be found at: +/// https://prng.di.unimi.it/xoshiro256starstar.c +/// +/// \sa SplitMix64 + + +struct Xoshiro256StarStar{ + using result_type = uint64_t; + static constexpr size_t state_size = 8; + static inline uint64_t rotation_left( const uint64_t x, int k){ + return (x << k) | (x >> (64 - k)); + }; + std::array< uint64_t, 4 > state; + + Xoshiro256StarStar(){ + SplitMix64 split_mix_64; + state = { split_mix_64(), split_mix_64(), split_mix_64(), split_mix_64() }; + }; + void seed(std::seed_seq& S){ + std::array< std::uint32_t, 8 > seeds; + S.generate(seeds.begin(), seeds.end()); + state[0] = (static_cast< std::uint64_t >(seeds[0]) << 32) | seeds[1]; + state[1] = (static_cast< std::uint64_t >(seeds[2]) << 32) | seeds[3]; + state[2] = (static_cast< std::uint64_t >(seeds[4]) << 32) | seeds[5]; + state[3] = (static_cast< std::uint64_t >(seeds[6]) << 32) | seeds[7]; + }; + uint64_t operator()(){ + const uint64_t result = rotation_left(state[1] * 5, 7) * 9; + const uint64_t t = state[1] << 17; + state[2] ^= state[0]; + state[3] ^= state[1]; + state[1] ^= state[2]; + state[0] ^= state[3]; + state[2] ^= t; + state[3] = rotation_left(state[3], 45); + return result; + }; + + void jump(){ + static const uint64_t JUMP[] = { + 0x180ec6d33cfd0aba, + 0xd5a61266f0c9392c, + 0xa9582618e03fc9aa, + 0x39abdc4529b1661c + }; + + uint64_t s0 = 0; + uint64_t s1 = 0; + uint64_t s2 = 0; + uint64_t s3 = 0; + + for (unsigned int i = 0; i < sizeof(JUMP) / sizeof(*JUMP); ++i) { + for (int b = 0; b < 64; ++b) { + if (JUMP[i] & UINT64_C(1) << b) { + s0 ^= state[0]; + s1 ^= state[1]; + s2 ^= state[2]; + s3 ^= state[3]; + } + this->operator()(); + } + } + + state[0] = s0; + state[1] = s1; + state[2] = s2; + state[3] = s3; + }; + void long_jump(){ + static const uint64_t LONG_JUMP[] = { + 0x76e15d3efefdcbbf, + 0xc5004e441c522fb3, + 0x77710069854ee241, + 0x39109bb02acbe635 + }; + + uint64_t s0 = 0; + uint64_t s1 = 0; + uint64_t s2 = 0; + uint64_t s3 = 0; + + for (unsigned int i = 0; i < sizeof(LONG_JUMP) / sizeof(*LONG_JUMP); ++i) { + for (int b = 0; b < 64; ++b) { + if (LONG_JUMP[i] & UINT64_C(1) << b) { + s0 ^= state[0]; + s1 ^= state[1]; + s2 ^= state[2]; + s3 ^= state[3]; + } + this->operator()(); + } + } + state[0] = s0; + state[1] = s1; + state[2] = s2; + state[3] = s3; + }; + static constexpr uint64_t min(){ return std::numeric_limits< uint64_t >::min(); } + static constexpr uint64_t max(){ return std::numeric_limits< uint64_t >::max(); } +}; + + +#endif // _RANDOM_GENERATOR_XOSHIRO_256_STAR_STAR_H_ diff --git a/src/primate/include/_random_generator/random_concepts.h b/src/primate/include/_random_generator/random_concepts.h new file mode 100644 index 0000000..faf959e --- /dev/null +++ b/src/primate/include/_random_generator/random_concepts.h @@ -0,0 +1,51 @@ +#ifndef _RANDOM_GENERATOR_RANDOM_CONCEPTS_H +#define _RANDOM_GENERATOR_RANDOM_CONCEPTS_H + +#include +#include // SeedSequence + +// std::initializer_list< std::uint32_t > +template < typename T > +concept LightRandom64Engine = requires(T rne, std::seed_seq& S) { + { rne() } -> std::same_as< std::uint64_t >; // () yields an unsigned 64-bit integer + { rne.seed(S) }; // state is seedeable w/ pointer of integers +} && std::default_initializable< T > && std::uniform_random_bit_generator< T >; + +template < typename T > +concept Stateful64Engine = requires(T rne) { + T::state_size > 0; // has a known positive state size, in terms of number 32 
unsigned integers +} && LightRandom64Engine< T >; + +// See: https://stackoverflow.com/questions/39288595/why-not-just-use-random-device/ +template < typename T > +concept ThreadSafeRBG = requires(T rbg, int tid) { + { rbg.initialize(tid) }; + { rbg.next(tid) } -> std::convertible_to< std::uint_fast64_t >; +}; + +struct Random64EngineConcept { + using result_type = uint64_t; + // virtual const void seed(int) const = 0; + virtual ~Random64EngineConcept() = default; + virtual void seed(std::seed_seq&) = 0; + virtual uint64_t operator()() = 0; + virtual size_t state_size() const = 0; + static constexpr uint64_t min() { return std::numeric_limits< uint64_t >::min(); } + static constexpr uint64_t max() { return std::numeric_limits< uint64_t >::max(); } +}; + +template < LightRandom64Engine T > +struct Random64Engine : public Random64EngineConcept { + using result_type = uint64_t; + T rng; + Random64Engine() : rng(T()) { } + void seed(std::seed_seq& S) override { rng.seed(S); }; + uint64_t operator()() override { return rng(); } + size_t state_size() const override { + if constexpr (Stateful64Engine< T >){ return std::max(T::state_size, size_t(1)); } + return 1; + } + // T* get_rng(){ return &rng; } +}; + +#endif \ No newline at end of file diff --git a/src/primate/include/_random_generator/rne_engines.h b/src/primate/include/_random_generator/rne_engines.h new file mode 100644 index 0000000..689ab3a --- /dev/null +++ b/src/primate/include/_random_generator/rne_engines.h @@ -0,0 +1,17 @@ +#ifndef _RANDOM_GENERATOR_RNE_ENGINES_H +#define _RANDOM_GENERATOR_RNE_ENGINES_H + +#include // mt +// #include "_engines/pcg_random.h" // pcg64 -- this adds a non-trivial amount to compile time +#include "_engines/pcg64.h" +#include "_engines/split_mix64.h" // SplitMix64 +#include "_engines/xoshiro_256_star_star.h" // Xoshiro256 + +// -- REMOVED due to seeding and other issues +// From: https://www.pcg-random.org/posts/cpp-seeding-surprises.html +// This is technically a biased generator, but with 64-bits of entropy it may be ok +// for many applications and it should be blazing fast besides +// using knuth_lcg = std::linear_congruential_engine< uint64_t, 6364136223846793005U, 1442695040888963407U, 0U>; + + +#endif \ No newline at end of file diff --git a/src/primate/include/_random_generator/threadedrng64.h b/src/primate/include/_random_generator/threadedrng64.h new file mode 100644 index 0000000..03bac25 --- /dev/null +++ b/src/primate/include/_random_generator/threadedrng64.h @@ -0,0 +1,93 @@ +#ifndef _RANDOM_GENERATOR_THREADEDRNG64_H_ +#define _RANDOM_GENERATOR_THREADEDRNG64_H_ + +#include // uint64_t +#include // assert +#include // NULL +#include // uniform_random_bit_generator, mt19937_64 +#include // std::function +#include "random_concepts.h" // LightRandom64Engine, Random64Engine +#include "rne_engines.h" // all the engines + +enum RbEngine { sx = 0, xs = 1, pcg = 2, mt = 3 }; + +// Thread-safe random number generator +// This class constructs n_thread copies of a given *random number engine* type (a state machine with a transition + output function) using +// different seed sequences (seed_seq's) generated from the std::random_device, such that one can safely call ThreadedRNG64.next(tid) with the +// give thread id (tid) and obtain a uniformly random unsigned integer with at least 64-bits. 
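+// Usage sketch (illustrative only): one engine per OpenMP thread, each seeded independently, e.g.
+//   ThreadedRNG64 rng(omp_get_max_threads(), /* engine id, see RbEngine above */ 1, /* seed */ 42);
+//   #pragma omp parallel
+//   { std::uint64_t bits = rng.next(omp_get_thread_num()); }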
+// NOTE: RandomNumberEngine == UniformRandomBitGenerator + has .seed(), default constructible, and other things +// template< LightRandom64Engine RNE = std::mt19937_64 > +struct ThreadedRNG64 { + static constexpr size_t num_bits = 64; + int num_threads; + const RbEngine engine_id; + std::vector< Random64EngineConcept* > generators; + ThreadedRNG64(int engine = 0) : engine_id(static_cast< RbEngine >(engine)) { + // std::uniform_random_bit_generator RBG = std::random_device; + int num_threads_ = 1; + initialize(num_threads_); + }; + explicit ThreadedRNG64(int num_threads_, int engine, int seed = -1) : engine_id(static_cast< RbEngine >(engine)) { + initialize(num_threads_, seed); + }; + ~ThreadedRNG64(){ + for (int i = 0; i < num_threads; ++i){ + delete generators[i]; // todo: see if this is causing heap corruption + } + } + auto next(int thread_id) -> std::uint64_t { + return generators[thread_id]->operator()(); + } + + void initialize(int num_threads_, int seed = -1){ + // assert(num_threads_ > 0); + if (num_threads_ == 0){ return; } + num_threads = num_threads_; + generators = std::vector< Random64EngineConcept* >(num_threads, nullptr); + + // Make new generators + for (size_t i = 0; i < generators.size(); ++i){ + switch(engine_id){ + case sx: + static_assert(std::uniform_random_bit_generator< Random64Engine< SplitMix64 > >, "Wrapper RNG engine constraints not met"); + generators[i] = new Random64Engine< SplitMix64 >(); + break; + case xs: + static_assert(std::uniform_random_bit_generator< Random64Engine< Xoshiro256StarStar > >, "Wrapper RNG engine constraints not met"); + generators[i] = new Random64Engine< Xoshiro256StarStar >(); + break; + case pcg: { + static_assert(std::uniform_random_bit_generator< Random64Engine< Pcg64 > >, "Wrapper RNG engine constraints not met"); + generators[i] = new Random64Engine< Pcg64 >(); + break; + } + case mt: + generators[i] = new Random64Engine< std::mt19937_64 >(); + break; + } + } + + // Seeds generators with sources of entropy from a RBG (e.g. 
random_device) + // This seeds the entire state vector of the corresponding RNE's state size / entropy source using rd + auto rdev = std::random_device(); + auto mt = std::mt19937(seed); + std::function< std::uint_fast32_t() > rd; + if (seed < 0){ + rd = [&rdev](){ return rdev(); }; + } else { + rd = [&mt](){ return mt(); }; + } + + // Saturate the full state of the generators with entropy + for (int i = 0; i < num_threads; ++i) { + const auto ssize = generators[i]->state_size(); + std::vector< uint32_t > seed_data(ssize, 0); + std::generate_n(seed_data.begin(), ssize, rd); // generate evenly-distributed 32-bit seeds + std::seed_seq seed_gen(std::begin(seed_data), std::end(seed_data)); + generators[i]->seed(seed_gen); + } + }; +}; + + +#endif // _RANDOM_GENERATOR_THREADEDRNG64_H_ diff --git a/src/primate/include/_random_generator/vector_generator.h b/src/primate/include/_random_generator/vector_generator.h new file mode 100644 index 0000000..392d957 --- /dev/null +++ b/src/primate/include/_random_generator/vector_generator.h @@ -0,0 +1,83 @@ +#ifndef _RANDOM_GENERATOR_RANDOM_ARRAY_GENERATOR_H_ +#define _RANDOM_GENERATOR_RANDOM_ARRAY_GENERATOR_H_ + +#include // uniform_random_bit_generator, normal_distribution +#include // uint64_t +#include // isnan +#include // bitset +#include "omp_support.h" // conditionally enables openmp pragmas +#include "random_concepts.h" // ThreadSafeRBG +// #include "threadedrng64.h" +// #include "rne_engines.h" // all the engines + +using IndexType = unsigned int; +using LongIndexType = unsigned long; +static constexpr long num_bits = 64; + +enum Distribution { rademacher = 0, normal = 1, rayleigh = 2 }; + +// SIMD-vectorized rademacher vector generation; makes N // 64 calls to random bit generator +template< std::floating_point F, ThreadSafeRBG RBG > +void generate_rademacher(const long n, RBG& random_bit_generator, const int thread_id, F* array, F& arr_norm){ + const auto N = static_cast< size_t >(n / RBG::num_bits); + for (size_t i = 0; i < N; ++i) { + std::bitset< RBG::num_bits > ubits { random_bit_generator.next(thread_id) }; + #pragma omp simd + for (size_t j = 0; j < RBG::num_bits; ++j) { + array[i*RBG::num_bits + j] = 2 * int(ubits[j]) - 1; + } + } + // This loop should have less than 64 iterations. + std::bitset< RBG::num_bits > ubits { random_bit_generator.next(thread_id) }; + for (size_t j = N * RBG::num_bits, i = 0; j < size_t(n); ++j, ++i){ + array[j] = (2 * int(ubits[i]) - 1); + } + arr_norm = static_cast< F >(std::sqrt(n)); +} + +// Zero mean, unit variance gaussian +template< std::floating_point F, ThreadSafeRBG RBG > +void generate_normal(const unsigned long n, RBG& bit_generator, const int thread_id, F* array, F& arr_norm){ + static std::normal_distribution d { 0.0, 1.0 }; + // const auto N = static_cast< unsigned long >(n / RBG::num_bits); + auto& gen = *bit_generator.generators[thread_id]; + // #pragma omp simd + for (auto i = static_cast< unsigned long >(0); i < n; ++i){ + array[i] = d(gen); + } + // // This loop should have less than num_bits iterations. 
+ // for (auto j = static_cast< unsigned long >(N * RBG::num_bits), i = static_cast< unsigned long >(0); j < n; ++j, ++i){ + // array[j] = d(gen); + // } + + arr_norm = 0.0; + // #pragma omp simd reduction(+:arr_norm) + for (unsigned long i = 0; i < n; ++i){ + arr_norm += std::pow(array[i], 2); + } + arr_norm = std::sqrt(arr_norm); +} + +// Generates an isotropic random vector for a given thread id +template< std::floating_point F, ThreadSafeRBG RBG > +void generate_isotropic(const Distribution dist_id, const long n, RBG& rbg, const int thread_id, F* array, F& arr_norm){ + switch(dist_id){ + case rademacher: + generate_rademacher(n, rbg, thread_id, array, arr_norm); + break; + case normal: + generate_normal(n, rbg, thread_id, array, arr_norm); + break; + case rayleigh: + // generate_rayleigh(n, bit_generator, thread_id, array, arr_norm); + break; + } + + // Ensure the array has unit norm + const F arr_norm_inv = 1.0 / arr_norm; + #pragma omp simd + for (auto i = static_cast< long >(0); i < n; ++i){ + array[i] *= arr_norm_inv; + } +} +#endif \ No newline at end of file diff --git a/src/primate/include/_trace/hutch.h b/src/primate/include/_trace/hutch.h new file mode 100644 index 0000000..4d19f2e --- /dev/null +++ b/src/primate/include/_trace/hutch.h @@ -0,0 +1,332 @@ +#ifndef _TRACE_HUTCH_H +#define _TRACE_HUTCH_H + +#include +#include // function +#include // max +#include // constants, isnan +#include "omp_support.h" // conditionally enables openmp pragmas +// #include "_operators/linear_operator.h" // LinearOperator +// #include "_orthogonalize/orthogonalize.h" // orth_vector, mod +#include "_random_generator/vector_generator.h" // ThreadSafeRBG, generate_array, Distribution +#include "_lanczos/lanczos.h" +#include "eigen_core.h" +#include +#include +#include + +using us = std::chrono::microseconds; +using dur_seconds = std::chrono::duration< double >; +using hr_clock = std::chrono::high_resolution_clock; + +template < int Iterate = 3 > +double erf_inv(double x) noexcept { + // Strategy: solve f(y) = x - erf(y) = 0 for given x with Newton's method. + // f'(y) = -erf'(y) = -2/sqrt(pi) e^(-y^2) + // Has quadratic convergence, achieving machine precision with ~three iterations. + if constexpr(Iterate == 0){ + // Specialization to get initial estimate; accurate to about 1e-3. + // Based on https://stackoverflow.com/questions/27229371/inverse-error-function-in-c + const double a = std::log((1 - x) * (1 + x)); + const double b = std::fma(0.5, a, 4.120666747961526); + const double c = 6.47272819164 * a; + return std::copysign(std::sqrt(-b + std::sqrt(std::fma(b, b, -c))), x); + } else { + const double x0 = erf_inv< Iterate - 1 >(x); // compile-time recurse + const double fx0 = x - std::erf(x0); + const double pi = std::acos(-1); + double fpx0 = -2.0 / std::sqrt(pi) * std::exp(-x0 * x0); + return x0 - fx0 / fpx0; // = x1 + } +} + +// Work-around to avoid copying for the multi-threaded build +template< LinearOperator Matrix > +auto get_matrix(const Matrix& A){ + // const auto M = is_instance< Matrix, MatrixFunction >{} ? + if constexpr(HasOp< Matrix >){ + return MatrixFunction(A.op, A.f, A.deg, A.rtol, A.orth, A.ncv, A.wgt_method); + } else { + return A; + } +} + +auto get_num_threads(int nt) -> int { + const auto max_threads = omp_get_max_threads(); + nt = nt <= 0 ? 
max_threads : nt; + return std::max(1, int(nt)); +} + +// std::function< bool(int) > +template< std::floating_point F, LinearOperator Matrix, ThreadSafeRBG RBG, typename Lambda, typename Lambda2 > +void monte_carlo_quad( + const Matrix& A, // Any linear operator supporting .matvec() and .shape() methods + const Lambda& f_cb, // Thread-safe callback function with signature f(int i, F sample, F* q) + const Lambda2& stop_check, // Function to check for convergence or early-stopping (takes no arguments) + const int nv, // Number of sample vectors to generate + const Distribution dist, // Isotropic distribution used to generate random vectors + RBG& rng, // Random bit generator + const int num_threads, // Number of threads used to parallelize the computation + const int seed, // Seed for random number generator for determinism + size_t& wall_time // Wall clock time +){ + using VectorF = Eigen::Matrix< F, Dynamic, 1 >; + // using ArrayF = Eigen::Array< F, Dynamic, 1 >; + + // Constants + const auto A_shape = A.shape(); + const size_t n = A_shape.first; + const size_t m = A_shape.second; + + // Set the number of threads + initialize multi-threaded RNG + const auto nt = get_num_threads(num_threads); + omp_set_num_threads(nt); + rng.initialize(nt, seed); + + // Using square-root of max possible chunk size for parallel schedules + unsigned int chunk_size = std::max(int(sqrt(nv / nt)), 1); + + // Monte-Carlo ensemble sampling + int i; + volatile bool stop_flag = false; // early-stop flag for convergence checking + const auto t_start = hr_clock::now(); + #pragma omp parallel shared(A, stop_flag) + { + int tid = omp_get_thread_num(); // thread-id + + auto q_norm = static_cast< F >(0.0); + auto q = static_cast< VectorF >(VectorF::Zero(m)); + + // Parameterize the matrix function + // TODO: this is not a great way to avoid copying matrices / matrix functions, but is needed as + // the former is read-only and can be shared amongst threads, but the latter needs thread-specific storage + const auto M = get_matrix(A); + + #pragma omp for schedule(dynamic, chunk_size) + for (i = 0; i < nv; ++i){ + if (stop_flag){ continue; } + + // Generate isotropic vector (w/ unit norm) + generate_isotropic< F >(dist, m, rng, tid, q.data(), q_norm); + + // Apply quadratic form, using quad() if available + F sample = 0.0; + if constexpr (QuadOperator< Matrix >){ + sample = M.quad(q.data()); // x^T A x + // std::cout << "quad output: " << sample << std::endl; + } else { + auto y = static_cast< VectorF >(VectorF::Zero(n)); + A.matvec(q.data(), y.data()); + sample = y.dot(q); + } + + // Run the user-supplied function (parallel section!) 
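+      // Each index i is visited by exactly one thread under the omp-for schedule, so callbacks that only
+      // write to per-index storage (e.g. estimates[i] in hutch's save_sample below) need no extra locking;
+      // only the stop_check call is serialized via the critical section that follows.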
+ f_cb(i, sample, q.data()); + + // If supplied, check early-stopping condition + #pragma omp critical + { + stop_flag = stop_check(i); + } + } // omp for + } // omp parallel + const auto t_end = hr_clock::now(); + wall_time = duration_cast< us >(dur_seconds(t_end - t_start)).count(); +} + +template< std::floating_point F, LinearOperator Matrix, ThreadSafeRBG RBG > +auto hutch( + const Matrix& A, + RBG& rbg, const int nv, const int dist, const int engine_id, const int seed, + const F atol, const F rtol, + const int num_threads, + const bool use_CLT, + const F* t_scores, + const F z, + F* estimates, + size_t& wall_time +) -> F { + using ArrayF = Eigen::Array< F, Dynamic, 1 >; + + // Save the sample estimates + const auto save_sample = [&estimates](int i, F sample, [[maybe_unused]] F* q){ + estimates[i] = sample; + }; + + // Type-erased function since the call is cheap + // std::function< bool (int) > early_stop; // apparently this will cause problems! + if (atol == 0.0 && rtol == 0.0){ + const auto early_stop = [](int i) -> bool { return false; }; + monte_carlo_quad< F >(A, save_sample, early_stop, nv, static_cast< Distribution >(dist), rbg, num_threads, seed, wall_time); + Eigen::Map< ArrayF > est(estimates, nv); + est *= A.shape().first; + // std::cout << "est sum: " << est.sum() << ", nv: " << nv << std::endl; + F mu_est = est.sum() / nv; + return mu_est; + } else if (use_CLT){ + // Parameterize when to stop using either the CLT over the given confidence level or + // This runs in critical section of the SLQ, so we can depend sequential execution (but i will vary!) + // See: https://math.stackexchange.com/questions/102978/incremental-computation-of-standard-deviation + F mu_est = 0.0, vr_est = 0.0; + F mu_pre = 0.0, vr_pre = 0.0; + int n_samples = 0; // number of estimates computed + // const auto z = std::sqrt(2.0) * erf_inv< 3 >(double(0.95)); + const auto early_stop = [&estimates, &mu_est, &vr_est, &mu_pre, &vr_pre, &n_samples, &t_scores, z, atol, rtol](int i) -> bool { + if (std::isnan(estimates[i])){ return false; } + ++n_samples; + const F denom = (1.0 / F(n_samples)); + const F L = n_samples > 2 ? F(n_samples-2) / F(n_samples-1) : 0.0; + mu_est = denom * (estimates[i] + (n_samples - 1) * mu_pre); + mu_pre = n_samples == 1 ? mu_est : mu_pre; + vr_est = L * vr_pre + denom * std::pow(estimates[i] - mu_pre, 2); // update sample variance + mu_pre = mu_est; + vr_pre = vr_est; + if (n_samples < 3){ + return false; + } else { + const auto sd_est = std::sqrt(vr_est); + const auto score = n_samples < 30 ? 
t_scores[n_samples] : z; + const auto margin_of_error = score * sd_est / std::sqrt(F(n_samples)); // todo: remove sqrt's + // std::cout << "n: " << n << ", mu: " << mu_est << ", ci: [" << mu_est - margin_of_error << ", " << mu_est + margin_of_error << "]"; + // std::cout << "margin/atol: " << margin_of_error << ", " << atol << "\n"; + return margin_of_error <= atol || std::abs(sd_est / mu_est) <= rtol; + } + }; + monte_carlo_quad< F >(A, save_sample, early_stop, nv, static_cast< Distribution >(dist), rbg, num_threads, seed, wall_time); + Eigen::Map< ArrayF > est(estimates, nv); + est *= A.shape().first; + return mu_est * A.shape().first; + } else { + // Use traditional iteration checking, akin to scipy.integrate.quadrature + F mu_est = 0.0, mu_pre = 0.0; + int n_samples = 0; + const auto early_stop = [&estimates, &n_samples, &mu_est, &mu_pre, atol, rtol](int ii) -> bool { // &estimates, &n_samples, &mu_est, &mu_pre, atol, rtol + if (std::isnan(estimates[ii])){ return false; } + ++n_samples; + const F denom = (1.0 / F(n_samples)); + mu_est = denom * (estimates[ii] + (n_samples - 1) * mu_pre); + const bool atol_check = std::abs(mu_est - mu_pre) <= atol; + const bool rtol_check = (std::abs(mu_est - mu_pre) / mu_est) <= rtol; + mu_pre = mu_est; + // std::cout << std::fixed << std::showpoint; + // std::cout << std::setprecision(10); + // std::cout << "n: " << n_samples << ", mu: " << mu_est << ", atol: " << std::abs(mu_est - mu_pre) << ", rtol: " << (std::abs(mu_est - mu_pre) / mu_est) << std::endl; + return atol_check || rtol_check; + }; + monte_carlo_quad< F >(A, save_sample, early_stop, nv, static_cast< Distribution >(dist), rbg, num_threads, seed, wall_time); + Eigen::Map< ArrayF > est(estimates, nv); + est *= A.shape().first; + return mu_est * A.shape().first; + } + // Don't pull down monte_carlo_quad; the early-stop defs need to be local scope for some reason! 
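+  // Stopping-rule summary: atol == rtol == 0 runs all nv samples; use_CLT stops once the t/z-score
+  // confidence-interval half-width drops below atol or the relative standard deviation drops below rtol;
+  // otherwise the quadrature-style rule stops when successive running means differ by less than atol
+  // (absolutely) or rtol (relatively).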
+} + + +// Stochastic Lanczos quadrature method +// std::function +template< std::floating_point F, LinearOperator Matrix, ThreadSafeRBG RBG, typename Lambda > +void slq ( + const Matrix& A, // Any linear operator supporting .matvec() and .shape() methods + const Lambda& f, // Thread-safe function with signature f(int i, F* nodes, F* weights) + const std::function< bool(int) >& stop_check, // Function to check for convergence or early-stopping (takes no arguments) + const int nv, // Number of sample vectors to generate + const Distribution dist, // Isotropic distribution used to generate random vectors + RBG& rng, // Random bit generator + const int lanczos_degree, // Polynomial degree of the Krylov expansion + const F lanczos_rtol, // residual tolerance to consider subspace A-invariant + const int orth, // Number of vectors to re-orthogonalize against <= lanczos_degree + const int ncv, // Number of Lanczos vectors to keep in memory (per-thread) + const int num_threads, // Number of threads used to parallelize the computation + const int seed // Seed for random number generator for determinism +){ + using ArrayF = Eigen::Array< F, Dynamic, 1 >; + using EigenSolver = Eigen::SelfAdjointEigenSolver< DenseMatrix< F > >; + if (ncv < 2){ throw std::invalid_argument("Invalid number of lanczos vectors supplied; must be >= 2."); } + if (ncv < orth+2){ throw std::invalid_argument("Invalid number of lanczos vectors supplied; must be >= 2+orth."); } + + // Constants + const auto A_shape = A.shape(); + const size_t n = A_shape.first; + const size_t m = A_shape.second; + + // Set the number of threads + initialize multi-threaded RNG + const auto nt = num_threads <= 0 ? omp_get_max_threads() : num_threads; + omp_set_num_threads(nt); + rng.initialize(nt, seed); + + // Using square-root of max possible chunk size for parallel schedules + unsigned int chunk_size = std::max(int(sqrt(nv / nt)), 1); + + // Monte-Carlo ensemble sampling + int i; + volatile bool stop_flag = false; // early-stop flag for convergence checking + #pragma omp parallel shared(stop_flag) + { + int tid = omp_get_thread_num(); // thread-id + + // Pre-allocate memory needed for Lanczos iterations + auto q_norm = static_cast< F >(0.0); + auto q = static_cast< ArrayF >(ArrayF::Zero(m)); + auto Q = static_cast< DenseMatrix< F > >(DenseMatrix< F >::Zero(n, ncv)); + auto alpha = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree+1)); + auto beta = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree+1)); + auto nodes = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree)); + auto weights = static_cast< ArrayF >(ArrayF::Zero(lanczos_degree)); + auto solver = EigenSolver(lanczos_degree); + + // Run in parallel + #pragma omp for schedule(dynamic, chunk_size) + for (i = 0; i < nv; ++i){ + if (stop_flag){ continue; } + + // Generate isotropic vector (w/ unit norm) + generate_isotropic< F >(dist, m, rng, tid, q.data(), q_norm); + + // Perform a lanczos iteration (populates alpha, beta) + lanczos_recurrence< F >(A, q.data(), lanczos_degree, lanczos_rtol, orth, alpha.data(), beta.data(), Q.data(), ncv); + + // Obtain nodes + weights via quadrature algorithm + lanczos_quadrature< F >(alpha.data(), beta.data(), lanczos_degree, solver, nodes.data(), weights.data()); + + // Run the user-supplied function (parallel section!) 
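+      // The nodes/weights form a Gaussian quadrature rule for the spectral measure induced by (A, q):
+      // sum_j weights[j] * fn(nodes[j]) estimates q^T fn(A) q for a spectral function fn, which is what
+      // sl_quadrature below records per sample vector.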
+ f(i, q.data(), Q.data(), nodes.data(), weights.data()); + + // If supplied, check early-stopping condition + #pragma omp critical + { + stop_flag = stop_check(i); + } + } + } +} + +template< std::floating_point F, LinearOperator Matrix, ThreadSafeRBG RBG > +void sl_quadrature( + const Matrix& mat, + RBG& rbg, const int nv, const int dist, const int engine_id, const int seed, + const int lanczos_degree, const F lanczos_rtol, const int orth, const int ncv, + const int num_threads, + F* quad_nw +){ + using VectorF = Eigen::Array< F, Dynamic, 1>; + + // Parameterize the quadrature function + Eigen::Map< DenseMatrix< F >> quad_nw_map(quad_nw, lanczos_degree * nv, 2); + const auto quad_f = [lanczos_degree, &quad_nw_map](int i, [[maybe_unused]] F* q, [[maybe_unused]] F* Q, F* nodes, F* weights){ + // printf("iter %0d, tid %0d, q[0] = %.4g, nodes[0] = %.4g\n", i, omp_get_thread_num(), q[0], nodes[0]); + Eigen::Map< VectorF > nodes_v(nodes, lanczos_degree, 1); // no-op + Eigen::Map< VectorF > weights_v(weights, lanczos_degree, 1); // no-op + quad_nw_map.block(i*lanczos_degree, 0, lanczos_degree, 1) = nodes_v; + quad_nw_map.block(i*lanczos_degree, 1, lanczos_degree, 1) = weights_v; + }; + + // Parameterize when to stop (run in critical section) + constexpr auto early_stop = [](int i) -> bool { + return false; + }; + + // Execute the stochastic Lanczos quadrature + slq< F >(mat, quad_f, early_stop, nv, static_cast< Distribution >(dist), rbg, lanczos_degree, lanczos_rtol, orth, ncv, num_threads, seed); +} + +#endif \ No newline at end of file diff --git a/src/primate/include/eigen_core.h b/src/primate/include/eigen_core.h new file mode 100644 index 0000000..4482667 --- /dev/null +++ b/src/primate/include/eigen_core.h @@ -0,0 +1,20 @@ +#ifndef __EIGEN_CORE_H +#define __EIGEN_CORE_H + +#include + +using Eigen::Dynamic; +using Eigen::Ref; +using Eigen::MatrixXf; +using Eigen::MatrixXd; + +template< typename F > +using Vector = Eigen::Matrix< F, Dynamic, 1 >; + +template< typename F > +using Array = Eigen::Array< F, Dynamic, 1 >; + +template< typename F > +using DenseMatrix = Eigen::Matrix< F, Dynamic, Dynamic >; + +#endif \ No newline at end of file diff --git a/src/primate/include/omp_support.h b/src/primate/include/omp_support.h new file mode 100644 index 0000000..ffee824 --- /dev/null +++ b/src/primate/include/omp_support.h @@ -0,0 +1,13 @@ +#ifndef _OMP_SUPPORT_H +#define _OMP_SUPPORT_H + +#ifdef OMP_MULTITHREADED + #include // omp_set_num_threads, omp_get_thread_num +#else + #define omp_get_thread_num() 0 + #define omp_set_num_threads(x) 0 + #define omp_get_max_threads() 1 +#endif + + +#endif \ No newline at end of file diff --git a/src/primate/trace.py b/src/primate/trace.py index b454fcc..ef378f2 100644 --- a/src/primate/trace.py +++ b/src/primate/trace.py @@ -272,7 +272,7 @@ def hutchpp( return 0 ## Convert to a matrix function, if not already - A = matrix_function(A, fun=fun) + A = matrix_function(A, fun=fun, **kwargs) if fun is not None else A ## Setup constants verbose, info = kwargs.get('verbose', False), kwargs.get('info', False) @@ -280,7 +280,7 @@ def hutchpp( nb = (N // 3) if nb == "auto" else nb # number of samples to dedicate to deflation m = (N // 3) if maxiter == "auto" else maxiter # residual samples; default rule uses Hutch++ result f_dtype = (A @ np.zeros(A.shape[1])).dtype if not hasattr(A, "dtype") else A.dtype - info_dict = {} + info_dict = { } ## Sketch Y / Q - use numpy for now, but consider parallelizing MGS later WB = np.random.choice([-1.0, +1.0], size=(N, 
nb)).astype(f_dtype) @@ -296,14 +296,22 @@ def hutchpp( defl_ests = (Q.T @ (A @ Q)).diagonal() tr_defl = np.sum(defl_ests) else: - tr_defl, defl_ests = A.quad_sum(Q) + tr_defl, defl_ests = _trace.quad_sum(A, Q) if fun is None else A.quad_sum(Q) ## Estimate trace of the residual via Girard-Hutchinson on the ## complement of the deflated subspaces, (I - Q @ Q.T)y - A.deflate(Q) - kwargs['info'] = True - kwargs['verbose'] = False - tr_resi, info_dict = hutch(A, **kwargs) + if fun is None or mode == 'full': + ## Full mode == form the full (m x m) matrix and take the diagonal + ## Not memory efficient, but is potentially vectorized, so suitable for relatively small m + WM = np.random.choice([-1.0, +1.0], size=(N, m)).astype(f_dtype) + G = WM - Q @ (Q.T @ WM) + tr_resi = (1 / m) * (G.T @ (A @ G)).trace() + else: + A.deflate(Q) + kwargs['info'] = True + kwargs['verbose'] = False + tr_resi, ID = hutch(A, maxiter=maxiter, **kwargs) + info_dict.update(ID) # if mode == 'full': # ## Full mode == form the full (m x m) matrix and take the diagonal # ## Not memory efficient, but is potentially vectorized, so suitable for relatively small m diff --git a/tests/test_trace.py b/tests/test_trace.py index 3505558..e9f65b3 100644 --- a/tests/test_trace.py +++ b/tests/test_trace.py @@ -196,11 +196,11 @@ def test_quad_sum(): n = 100 A = symmetric(n) Q = np.linalg.qr(A)[0] - test_quads = _trace.quad_sum(A, Q) + _, test_quads = _trace.quad_sum(A, Q) true_quads = (Q.T @ (A @ Q)).diagonal() assert np.allclose(test_quads, true_quads) assert np.isclose(np.sum(test_quads), np.sum(true_quads)) - test_quads = _trace.quad_sum(A, Q[:,:10]) + _, test_quads = _trace.quad_sum(A, Q[:,:10]) true_quads = (Q[:,:10].T @ (A @ Q[:,:10])).diagonal() assert np.allclose(test_quads, true_quads) assert np.isclose(np.sum(test_quads), np.sum(true_quads)) @@ -216,6 +216,8 @@ def test_hutch_pp(): test_trace = hutchpp(A, mode="full", seed=1) assert np.isclose(test_trace, true_trace, atol=1.0) + tr_true = np.sum(np.exp(np.linalg.eigh(A)[0])) + hutchpp(A, fun=np.exp, nb=100, mode="reduced", seed=1) # import timeit # timeit.timeit(lambda: hutchpp(A, mode="full"), number=1000) # timeit.timeit(lambda: hutchpp(A, mode="reduced"), number=1000)