From 2a8bc42797a530946cc0f98000c6f9378436e2a7 Mon Sep 17 00:00:00 2001 From: peekxc Date: Sun, 31 Dec 2023 13:39:58 -0500 Subject: [PATCH] Finally figured FTTR was not bugged, but rather is simply unstable. Minor updates to docs --- docs/src/basic/install.qmd | 8 +-- docs/src/basic/integration.qmd | 24 ++------ docs/src/theory/intro.qmd | 10 ++-- include/_lanczos/lanczos.h | 18 +++--- include/_orthogonalize/orthogonalize.h | 66 ++++++++++++++++++---- meson.build | 28 ++++----- src/primate/_lanczos.cpp | 15 ++--- src/primate/_orthogonalize.cpp | 45 ++++++++++++++- tests/sanity.py | 56 +++++++++--------- tests/test_lanczos.py | 78 ++++++++++++++++++++++++++ 10 files changed, 250 insertions(+), 98 deletions(-) diff --git a/docs/src/basic/install.qmd b/docs/src/basic/install.qmd index 3fbaa6d..1a09978 100644 --- a/docs/src/basic/install.qmd +++ b/docs/src/basic/install.qmd @@ -2,16 +2,16 @@ title: "Installation" --- -`primate` is a standard [PEP-517](https://peps.python.org/pep-0517/) package, and thus can be installed via `pip`: +`primate` is a standard [PEP-517](https://peps.python.org/pep-0517/) package that can be installed via `pip`: ```python python -m pip install scikit-primate ``` -Assuming your platform is natively supported, no compilation is needed; see [platform support](#platform-support) for details. +Assuming your platform is supported, no compilation is needed---see [platform support](#platform-support) for details. :::{.callout-note} -Like many packages registered on PyPI, the distribution name "`scikit-primate`" differs from the importable package name "`primate`" (also see [#3471](https://github.com/pypi/support/issues/3471)). Once installed, all exported package modules are available through the `primate` module. +Like many packages registered on PyPI, the _distribution_ "`scikit-primate`" differs from the importable _package_ "`primate`" (also see [#3471](https://github.com/pypi/support/issues/3471)). Additionally, `primate` does not rely on organizational prefixes that some [scikits](https://projects.scipy.org/scikits.html) use (e.g. `scikit-learn` -> `sklearn`). ::: @@ -40,7 +40,7 @@ If your platform isn't on this table but you would like it to be supported, feel ### Compiling from source -To install the package from its source distribution, a C++20 compiler is required; the current builds are all built with some variant of [clang](https://clang.llvm.org/), preferably version 15.0 or higher. For platform- and compiler-specific settings, consult the build scripts and CI configuration files. +A C++20 compiler is required to install the package from its source distribution. Current builds all compile with some variant of [clang](https://clang.llvm.org/) (version 15.0+). For platform- and compiler-specific settings, consult the [build scripts](https://github.com/peekxc/primate/blob/main/meson.build) and [CI configuration files](https://github.com/peekxc/primate/tree/main/.github/workflows). 
### C++ Installation diff --git a/docs/src/basic/integration.qmd b/docs/src/basic/integration.qmd index 6f3bdda..0d84463 100644 --- a/docs/src/basic/integration.qmd +++ b/docs/src/basic/integration.qmd @@ -2,9 +2,9 @@ title: "Integration" --- -`primate` supports a variety of matrix-types of the box, including numpy `ndarray`'s, compressed [sparse matrices](https://docs.scipy.org/doc/scipy/reference/sparse.html) (a lá SciPy), and [LinearOperators](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.LinearOperator.html)---the latter enables the use of _matrix free_ operators. +`primate` supports a variety of matrix types out of the box, including numpy `ndarray`s, compressed [sparse matrices](https://docs.scipy.org/doc/scipy/reference/sparse.html) (à la SciPy), and [LinearOperators](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.LinearOperator.html). -Outside of the natively types above, the basic requirements for any operator `A` to be used with e.g. the `Lanczos` method in `primate` are: +More generally, the basic requirements for any operator `A` to be used with e.g. the `Lanczos` method are: 1. A method `A.matvec(input: ndarray) -> ndarray` implementing $v \mapsto Av$ 2. An attribute `A.shape -> tuple[int, int]` giving the output/input dimensions of $A$ @@ -12,7 +12,7 @@ Outside of the natively types above, the basic requirements for any operator `A` -Here's an example of a simple operator representing a Diagonal matrix, which inherits a `.matvec()` method by following the subclassing rules of [SciPy's LinearOperator](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.LinearOperator.html): +Here's an example of a simple operator representing a diagonal matrix: ```python import numpy as np @@ -31,23 +31,11 @@ class DiagonalOp(LinearOperator): out = self.diag * np.ravel(x) return out.reshape(x.shape) ``` - - +Note that by following the subclassing rules of [SciPy's LinearOperator](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.LinearOperator.html), this class inherits a `.matvec()` method and thus satisfies the constraints above. ## C++ usage -Similarly, to get started calling any matrix-free function provided by `primate` on the C++ side, such `hutch` or `lanczos`, simply pass any type with `.shape()` and `.matvec()` member functions: +Similarly, to call any C++ function, such as `hutch` or `lanczos`, simply pass any type with `.shape()` and `.matvec()` member functions: ```cpp class LinOp { @@ -65,4 +53,4 @@ class LinOp { It's up to you to ensure `shape()` yields the correct size; `primate` will supply vectors to `input` of size `.shape().second` (number of columns) and guarantees the pointer to the `output` will be at least `shape().first` (number of rows), no more. -To read more about how semantics extend to the C++ side as well via _C++20 concepts_---see the [C++ integration guide](../advanced/cpp_integration.qmd). If you're using pybind11 and you want to extend `primate`'s Python API to work natively with linear operator implemented in C++, see the [pybind11 integration guide](../advanced/pybind11_integration.qmd). \ No newline at end of file +To read more about how semantics extend to the C++ side via _C++20 concepts_, see the [C++ integration guide](../advanced/cpp_integration.qmd). If you're using pybind11 and you want to extend `primate`'s Python API to work natively with linear operators implemented in C++, see the [pybind11 integration guide](../advanced/pybind11_integration.qmd).
\ No newline at end of file diff --git a/docs/src/theory/intro.qmd b/docs/src/theory/intro.qmd index ac567c5..27081df 100644 --- a/docs/src/theory/intro.qmd +++ b/docs/src/theory/intro.qmd @@ -10,11 +10,11 @@ Many quantities of common interest can be expressed in as traces of matrix funct $$ \begin{align*} -f &= \mathrm{id} \quad &\Longleftrightarrow& \quad &\mathrm{tr}(A) \\ -f &= f^{-1} \quad &\Longleftrightarrow& \quad &\mathrm{tr}(A^{-1}) \\ -f &= \log \quad &\Longleftrightarrow& \quad &\mathrm{logdet}(A) \\ -f &= \mathrm{exp} \quad &\Longleftrightarrow& \quad &\mathrm{exp}(A) \\ -f &= \mathrm{sign} \quad &\Longleftrightarrow& \quad &\mathrm{rank}(A) \\ +f(\lambda) &= \lambda \quad &\Longleftrightarrow& \quad &\mathrm{tr}(A) \\ +f(\lambda) &= \lambda^{-1} \quad &\Longleftrightarrow& \quad &\mathrm{tr}(A^{-1}) \\ +f(\lambda) &= \log(\lambda) \quad &\Longleftrightarrow& \quad &\mathrm{logdet}(A) \\ +f(\lambda) &= \mathrm{exp}(\lambda) \quad &\Longleftrightarrow& \quad &\mathrm{tr}(\mathrm{exp}(A)) \\ +f(\lambda) &= \mathrm{sign}(\lambda) \quad &\Longleftrightarrow& \quad &\mathrm{rank}(A) \\ &\vdots \quad &\hphantom{\Longleftrightarrow}& \quad & \vdots & \end{align*} $$ diff --git a/include/_lanczos/lanczos.h b/include/_lanczos/lanczos.h index cc724a8..cdd0432 100644 --- a/include/_lanczos/lanczos.h +++ b/include/_lanczos/lanczos.h @@ -49,7 +49,7 @@ void lanczos_recurrence( const F rtol, // Tolerance of residual error for early-stopping the iteration. const int orth, // Number of *additional* vectors to orthogonalize against F* alpha, // Output diagonal elements of T of size A.shape[1]+1 - F* beta, // Output subdiagonal elements of T of size A.shape[1]+1 + F* beta, // Output subdiagonal elements of T of size A.shape[1]+1; beta[0] should be 0 F* V, // Output matrix for Lanczos vectors (column-major) const size_t ncv // Number of Lanczos vectors pre-allocated (must be at least 2) ){ @@ -69,7 +69,8 @@ void lanczos_recurrence( // Setup for first iteration std::array< int, 3 > pos = { int(ncv) - 1, 0, 1 }; // Indices for the recurrence Q.col(pos[0]) = static_cast< VectorF >(VectorF::Zero(n)); // Ensure previous is 0 - Q.col(0) = v.normalized(); // load normalized v0 into Q + Q.col(0) = v.normalized(); // Load unit-norm v as q0 + beta[0] = 0.0; // Ensure beta_0 is 0 for (int j = 0; j < deg; ++j) { @@ -118,8 +119,8 @@ auto lanczos_quadrature( const weight_method method = golub_welsch // The method of computing the weights ) -> void { using VectorF = Eigen::Array< F, Dynamic, 1>; - - // Use Eigen to obtain eigenvalues + eigenvectors of tridiagonal + assert(beta[0] == 0.0); + Eigen::Map< const VectorF > a(alpha, k); // diagonal elements Eigen::Map< const VectorF > b(beta+1, k-1); // subdiagonal elements (offset by 1!)
@@ -138,14 +139,9 @@ auto lanczos_quadrature( solver.computeFromTridiagonal(a, b, Eigen::DecompositionOptions::EigenvaluesOnly); auto theta = static_cast< VectorF >(solver.eigenvalues()); // Rayleigh-Ritz values == nodes std::copy(theta.begin(), theta.end(), nodes); - + // Compute weights via FTTR - const F mu_0 = theta.abs().sum(); - const F mu_rec_sqrt = 1.0 / std::sqrt(mu_0); - auto tbl = std::vector< F >(k, 0.0); - for (int i = 0; i < k; ++i){ - weights[i] = orth_poly_weight(theta[i], mu_rec_sqrt, alpha, beta, tbl.data(), k) / mu_0; - } + FTTR_weights(theta.data(), alpha, beta, k, weights); } }; diff --git a/include/_orthogonalize/orthogonalize.h b/include/_orthogonalize/orthogonalize.h index bc207ad..90301b7 100644 --- a/include/_orthogonalize/orthogonalize.h +++ b/include/_orthogonalize/orthogonalize.h @@ -5,26 +5,72 @@ // #include // std::isnan #include "eigen_core.h" // DenseMatrix, EigenCore #include - +#include +#include // Emulate python modulus behavior since C++ '%' is not a true modulus constexpr inline auto mod(int a, int b) noexcept -> int { return (b + (a % b)) % b; } +// template< std::floating_point F > +// inline auto orth_poly_weight(const F x, const F mu_rec_sqrt, const F* a, const F* b, F* tbl, const int n) noexcept -> F { +// // const auto mu_rec_sqrt = 1.0 / std::sqrt(mu) +// tbl[0] = mu_rec_sqrt; +// tbl[1] = (x - a[0]) * mu_rec_sqrt / b[1]; +// F w = std::pow(tbl[0], 2) + std::pow(tbl[1], 2); +// for (int i = 2; i < n; ++i){ +// std::cout << i << ": (((" << x << " - " << a[i-1] << ") * " << tbl[i-1] << ") - " << b[i-1] << " * " << tbl[i-2] << ") / " << b[i] << std::endl; +// // tbl[i] = (((x - a[i-1]) * tbl[i-1]) - b[i-1] * tbl[i-2]) / b[i]; +// F s = (x - a[i-1]) / b[i]; +// F t = -b[i-1] / b[i]; +// tbl[i] = s * tbl[i-1] + t * tbl[i-2]; +// w += std::pow(tbl[i], 2); +// } +// return 1.0 / w; +// } + template< std::floating_point F > -inline auto orth_poly_weight(const F x, const F mu_rec_sqrt, const F* a, const F* b, F* tbl, const int n) noexcept -> F { - // const auto mu_rec_sqrt = 1.0 / std::sqrt(mu) - tbl[0] = mu_rec_sqrt; - tbl[1] = (x - a[0]) * mu_rec_sqrt / b[1]; - F w = std::pow(tbl[0], 2) + std::pow(tbl[1], 2); - for (int i = 2; i < n; ++i){ - tbl[i] = (((x - a[i-1]) * tbl[i-1]) - b[i-1] * tbl[i-2]) / b[i]; - w += std::pow(tbl[i], 2); +inline void poly(F x, const F mu_sqrt_rec, const F* a, const F* b, F* z, const size_t n) noexcept { + // assert(z.size() == a.size()); + z[0] = mu_sqrt_rec; + z[1] = (x - a[0]) * z[0] / b[1]; + for (size_t i = 2; i < n; ++i) { + // F zi = ((x - a[i-1]) * z[i-1] - b[i-1] * z[i-2]) / b[i]; + // Slightly more numerically stable way of doing the above + F s = (x - a[i-1]) / b[i]; + F t = -b[i-1] / b[i]; + z[i] = s * z[i-1] + t * z[i-2]; + // std::cout << "(" << x << ") " << i << ": " << s << " * " << z[i-1] << " + " << t << " * " << z[i-2]; + // std::cout << " -> " << z[i] << std::endl; } - return 1.0 / w; } +template< std::floating_point F > +void FTTR_weights(const F* theta, const F* alpha, const F* beta, const size_t k, F* weights) { + // assert(ew.size() == a.size()); + const auto a = Eigen::Map< const Array< F > >(alpha, k); + const auto b = Eigen::Map< const Array< F > >(beta, k); + const auto ew = Eigen::Map< const Array< F > >(theta, k); + const F mu_0 = ew.abs().sum(); + const F mu_sqrt_rec = 1.0 / std::sqrt(mu_0); + // std::cout << std::fixed << std::showpoint; + // std::cout << std::setprecision(15); + // std::cout << "---- ACTUAL --- ( " << sizeof(F) << ")" << std::endl; + // std::cout << "a: " << 
a.matrix().transpose() << std::endl; + // std::cout << "b: " << b.matrix().transpose() << std::endl; + // std::cout << "ew: " << ew.matrix().transpose() << std::endl; + // std::cout << "mu_0: " << mu_0 << std::endl; + Array< F > p(a.size()); + for (size_t i = 0; i < k; ++i){ + poly(theta[i], mu_sqrt_rec, a.data(), b.data(), p.data(), a.size()); + F weight = 1.0 / p.square().sum(); + weights[i] = weight / mu_0; + // std::cout << i << ": (x: " << theta[i] << ", w: " << weight << ") p: " << p.matrix().transpose() << std::endl; + } +} + + // Orthogonalizes v with respect to columns in U via modified gram schmidt // Cyclically projects v onto the columns U[:,i:(i+p)] = u_i, u_{i+1}, ..., u_{i+p}, removing from v the components // of the vector projections. If any index i, ..., i+p exceeds the number of columns of U, the indices are cycled. diff --git a/meson.build b/meson.build index ae8e7be..19943bc 100644 --- a/meson.build +++ b/meson.build @@ -51,24 +51,24 @@ _cpp_args = compiler.get_supported_arguments( ) ## Debug only -# _cpp_args += compiler.get_supported_arguments( -# '-O1', -# # '-fsanitize=address', -# # '-fno-omit-frame-pointer', -# '-DNDEBUG', -# '-Wall' -# ) - -## Release only _cpp_args += compiler.get_supported_arguments( - # '-flto=thin', - # '-flto', - '-O3', + '-O2', + # '-fsanitize=address', + # '-fno-omit-frame-pointer', '-DNDEBUG', - '-Wl,-s', # strip - # '-march=native' # either this or lto seems to not work on multiple builds + '-Wall' ) +## Release only +# _cpp_args += compiler.get_supported_arguments( +# # '-flto=thin', +# # '-flto', +# '-O3', +# '-DNDEBUG', +# '-Wl,-s', # strip +# # '-march=native' # either this or lto seems to not work on multiple builds +# ) + ## Require C++20 for concepts concept_code = ''' #include diff --git a/src/primate/_lanczos.cpp b/src/primate/_lanczos.cpp index a68fc3c..af157b4 100644 --- a/src/primate/_lanczos.cpp +++ b/src/primate/_lanczos.cpp @@ -78,13 +78,6 @@ void _lanczos_wrapper(py::module& m){ alpha.mutable_data(), beta.mutable_data(), Q.mutable_data(), ncv ); }); - m.def("quadrature", [](py_array< F > a, py_array< F > b, const int k, const int method = 0) -> py_array< F > { - auto output = DenseMatrix< F >(k, 2); // [nodes, weights] - auto solver = Eigen::SelfAdjointEigenSolver< DenseMatrix< F > >(k); - const auto wm = static_cast< weight_method >(method); - lanczos_quadrature(a.data(), b.data(), k, solver, output.col(0).data(), output.col(1).data(), wm); - return py::cast(output); - }); // m.def("function_approx", []( // const Matrix& A, // py_array< F > v, @@ -122,6 +115,14 @@ PYBIND11_MODULE(_lanczos, m) { _lanczos_wrapper< float, py::object, PyLinearOperator< float > >(m); _lanczos_wrapper< double, py::object, PyLinearOperator< double > >(m); + + m.def("quadrature", [](py_array< double > a, py_array< double > b, const int k, const int method = 0) -> py_array< double > { + auto output = DenseMatrix< double >(k, 2); // [nodes, weights] + auto solver = Eigen::SelfAdjointEigenSolver< DenseMatrix< double > >(k); + const auto wm = static_cast< weight_method >(method); + lanczos_quadrature(a.data(), b.data(), k, solver, output.col(0).data(), output.col(1).data(), wm); + return py::cast(output); + }); }; diff --git a/src/primate/_orthogonalize.cpp b/src/primate/_orthogonalize.cpp index d054b13..e6c0e81 100644 --- a/src/primate/_orthogonalize.cpp +++ b/src/primate/_orthogonalize.cpp @@ -2,6 +2,10 @@ #include #include #include "_orthogonalize/orthogonalize.h" +#include +#include +#include +#include namespace py = pybind11; @@ -9,6 +13,8 @@ 
namespace py = pybind11; template< std::floating_point F > using py_array = py::array_t< F, py::array::f_style | py::array::forcecast >; +using std::vector; + // template< std::floating_point F > // auto orth_poly(F x, int i, F mu_sqrt, const F* a, const F* b, const int n) noexcept -> F { // if (i < 0){ @@ -28,11 +34,48 @@ using py_array = py::array_t< F, py::array::f_style | py::array::forcecast >; // } +// Compute weights via FTTR +// Debug update: Algorithm is correct, just unstable for lower precision types! +// Returns a vector p(x) of orthogonal polynomials representing the recurrence +// Jp(x) = x p(x) - \gamma_n p_n(x) e_n +// where J is the Jacobi / tridiagonal matrix +// Parameters: +// x: value to evaluate polynomial at. Typically an eigenvalue. +// mu_0: sum of the eigenvalues of J. +// a: diagonal values of J. +// b: off-diagonal values of J +// z: output vector. +// Weights should be populated with mu_0 * U[0,:]**2 +template< std::floating_point F > +auto fttr(const Array< F >& ew, const Array< F >& a, const Array< F >& b) -> Array< F > { + assert(ew.size() == a.size()); + const F* theta = ew.data(); + const size_t k = ew.size(); + const F mu_0 = ew.abs().sum(); + const F mu_sqrt_rec = 1.0 / std::sqrt(mu_0); + // std::cout << std::fixed << std::showpoint; + // std::cout << std::setprecision(15); + // std::cout << "---- TEST --- ( " << sizeof(F) << ")" << std::endl; + // std::cout << "a: " << a.matrix().transpose() << std::endl; + // std::cout << "b: " << b.matrix().transpose() << std::endl; + // std::cout << "ew: " << ew.matrix().transpose() << std::endl; + // std::cout << "mu_0: " << mu_0 << std::endl; + Array< F > p(a.size()); + Array< F > weights(a.size()); + for (size_t i = 0; i < k; ++i){ + poly(theta[i], mu_sqrt_rec, a.data(), b.data(), p.data(), a.size()); + F weight = 1.0 / p.square().sum(); + // std::cout << i << ": (x: " << theta[i] << ", w: " << weight << ") p: " << p.matrix().transpose() << std::endl; + weights[i] = weight / mu_0; + } + return weights; +} + template< std::floating_point F > void _orthogonalize(py::module &m){ m.def("mgs", &modified_gram_schmidt< F >); m.def("orth_vector", &orth_vector< F >); - // m.def("orth_poly", &orth_poly< F >); + m.def("fttr", &fttr< F >); } PYBIND11_MODULE(_orthogonalize, m) { diff --git a/tests/sanity.py b/tests/sanity.py index a668c20..1379d4e 100644 --- a/tests/sanity.py +++ b/tests/sanity.py @@ -76,20 +76,20 @@ def approx_matvec(A, v: np.ndarray, k: int = None, f: Callable = None): y = np.linalg.norm(v) * (Q @ V @ (V[0,:] * rw)) return y -def orthogonal_polynomial_value(x, k, theta, gamma): - # Initialize the polynomials p_{-1} and p_0 - p_minus1 = 0 - p_0 = 1.0 / np.sqrt(1) # Since k_0 = 1 / sqrt(mu_0) and mu_0 = 1 +# def orthogonal_polynomial_value(x, k, theta, gamma): +# # Initialize the polynomials p_{-1} and p_0 +# p_minus1 = 0 +# p_0 = 1.0 / np.sqrt(1) # Since k_0 = 1 / sqrt(mu_0) and mu_0 = 1 - # Use recurrence relation to compute p_k for k > 0 - p_k = 0.0 - for ell in range(1, k + 1): - p_k = ((x - theta[ell - 1]) * p_0 - gamma[ell - 1] * p_minus1) / gamma[ell] +# # Use recurrence relation to compute p_k for k > 0 +# p_k = 0.0 +# for ell in range(1, k + 1): +# p_k = ((x - theta[ell - 1]) * p_0 - gamma[ell - 1] * p_minus1) / gamma[ell] - # Update values for next iteration - p_minus1 = p_0 - p_0 = p_k - return p_k +# # Update values for next iteration +# p_minus1 = p_0 +# p_0 = p_k +# return p_k # from scipy.sparse import spdiags # alpha = np.array([1,1,1]) @@ -123,22 +123,22 @@ def orthogonal_polynomial_value(x, 
k, theta, gamma): # b = np.append(0, beta[:-1]) # mu_0 = np.sum(np.abs(ew)) -# def orth_poly(x: float, i: int, mu: float, a: np.ndarray, b: np.ndarray): -# # print(f"x: {x:.2f}, i: {i})") -# z = 0 -# if i < 0: -# z = 0 -# elif i == 0: -# z = 1 / np.sqrt(mu) -# elif i == 1: -# z = (x - a[0]) * (1 / np.sqrt(mu)) / b[1] -# elif i < len(a): -# z = (x - a[i-1]) * orth_poly(x, i - 1, mu, a, b) -# z -= b[i-1] * orth_poly(x, i - 2, mu, a, b) -# z /= b[i] -# else: -# z = 0 -# return z +def orth_poly(x: float, i: int, mu: float, a: np.ndarray, b: np.ndarray): + # print(f"x: {x:.2f}, i: {i})") + z = 0 + if i < 0: + z = 0 + elif i == 0: + z = 1 / np.sqrt(mu) + elif i == 1: + z = (x - a[0]) * (1 / np.sqrt(mu)) / b[1] + elif i < len(a): + z = (x - a[i-1]) * orth_poly(x, i - 1, mu, a, b) + z -= b[i-1] * orth_poly(x, i - 2, mu, a, b) + z /= b[i] + else: + z = 0 + return z # mu_0 * np.ravel(ev[0,:])**2 diff --git a/tests/test_lanczos.py b/tests/test_lanczos.py index 10c6746..6fe852a 100644 --- a/tests/test_lanczos.py +++ b/tests/test_lanczos.py @@ -119,6 +119,83 @@ def test_quadrature(): assert np.allclose(nw_true[0], nw_test[:,0], atol=1e-6) assert np.allclose(nw_true[1], nw_test[:,1], atol=1e-6) +def test_fttr_basic(): + from scipy.sparse import spdiags + alpha = np.array([1,1,1]) + beta = np.array([1,1,0]) + T = spdiags(data=[beta, alpha, np.roll(beta, 1)], diags=(-1, 0, +1), m=3, n=3).todense() + ew, ev = np.linalg.eigh(T) + + a = alpha + b = np.append(0, beta[:-1]) + mu_0 = np.sum(np.abs(ew)) + p0 = lambda x: 1 / np.sqrt(mu_0) + p1 = lambda x: (x - a[0])*p0(x) / b[1] + p2 = lambda x: ((x - a[1])*p1(x) - b[1] * p0(x)) / b[2] + p = lambda x: np.array([p0(x), p1(x), p2(x)]) + weights_fttr = np.reciprocal([np.sum(p(lam) ** 2) for lam in ew]) + + ## Forward three-term recurrence relation (fttr) + assert np.allclose(weights_fttr, mu_0 * np.ravel(ev[0,:])**2) + +def test_fttr_2(): + from sanity import orth_poly + from scipy.sparse import spdiags + from primate.diagonalize import lanczos, _lanczos + np.random.seed(1234) + from scipy.linalg import toeplitz + n = 5 + alpha = np.random.uniform(size=n, low=0, high=1) + beta = np.append(np.random.uniform(size=n-1, low=0, high=1), 0) + T = spdiags(data=[beta, alpha, np.roll(beta, 1)], diags=(-1, 0, +1), m=n, n=n).todense() + ew, ev = np.linalg.eigh(T) + + ## Deduced as the FTTR algorithm from Theorem 1 of "Computing Gaussian quadrature rules with high relative accuracy" + a = alpha + b = np.append(0, beta[:-1]) # first must be zero + mu_0 = np.sum(np.abs(ew)) + p = lambda x: np.array([orth_poly(x, i, mu_0, a, b) for i in range(n)]) + + weights_fttr = np.reciprocal([np.sum(p(lam) ** 2) for lam in ew]) + assert np.allclose(weights_fttr, mu_0 * np.ravel(ev[0,:])**2) + +def test_fttr3(): + from scipy.linalg import toeplitz + from scipy.sparse import spdiags + from primate.diagonalize import lanczos, _lanczos + np.random.seed(1234) + n = 8 + A = toeplitz([0,1,2,3,4,5,6,7]).astype(np.float64) # symmetric(n) + v0 = np.random.uniform(size=A.shape[1]) + alpha, beta = lanczos(A, v0=v0, deg=n, orth=n-1) + a, b = alpha, np.append([0], beta) + T = spdiags(data=[np.roll(b,-1), a, b], diags=(-1, 0, +1), m=n, n=n).todense() + # T = np.array([[4, 3, 0], [3, 1, 1], [0, 1, -1]]) + ew, ev = np.linalg.eigh(T) + a, b = np.diag(T, 0), np.append([0], np.diag(T, 1)) + mu_0 = np.sum(np.abs(ew)) + + # echo = lambda i, x, a1, z1, b1, z0, b2: print(f"{i}: (({x} - {a1}) * {z1} - {b1} * {z0}) / {b2}") + # def p(x, mu_0, a, b, verbose: bool = True): + # z0 = 1 / np.sqrt(mu_0) + # z1 = (x - a[0])
* z0 / b[1] + # if verbose: + # echo(1, x, a[0], z0, b[1], 0, 0) + # z = [z0,z1] + # for i in range(2, len(a)): + # zi = ((x - a[i-1]) * z[i-1] - b[i-1] * z[i-2]) / b[i] + # z += [zi] + # if verbose: + # echo(i, x, a[i-1], z[i-1], b[i-1], z[i-2], b[i]) + # return np.array(z) + + from primate.ortho import _orthogonalize + fttr_weights_base = _orthogonalize.fttr(ew, a, b) + fttr_weights_run = _lanczos.quadrature(a,b,len(a),1)[:,1] + fttr_weights_true = np.ravel(ev[0,:])**2 + assert np.allclose(fttr_weights_base, fttr_weights_run) + assert np.allclose(fttr_weights_base, fttr_weights_true) + def test_quadrature_methods(): from primate.diagonalize import _lanczos n = 3 @@ -138,6 +215,7 @@ def test_quadrature_methods(): quad2 = np.sum(_lanczos.quadrature(a, b, n, 1).prod(axis=1)) assert np.isclose(quad1, quad2, atol=quad1*0.05) + # def test_quadrature_toeplitz(): # from primate.diagonalize import lanczos, _lanczos # from scipy.linalg import toeplitz
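
For reference, the weight computation this patch implements in `poly`/`FTTR_weights` and exercises in `test_fttr_basic`/`test_fttr3` boils down to the short NumPy sketch below. The helper name `fttr_weights` is illustrative only (it is not part of the package API); it follows the tests' convention that `a` holds the diagonal of the tridiagonal (Jacobi) matrix and `b` its subdiagonal prepended with a zero, and its output should agree with the squared first components of the eigenvectors, i.e. the Golub-Welsch weights.

```python
import numpy as np

def fttr_weights(theta: np.ndarray, a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Quadrature weights via the forward three-term recurrence (FTTR).

    Illustrative helper, not part of primate's API. Assumes k >= 2.
    theta : eigenvalues (Rayleigh-Ritz values) of the k x k Jacobi matrix T
    a     : diagonal of T, length k
    b     : subdiagonal of T prepended with 0 (so b[0] == 0), length k
    """
    k = len(a)
    mu_0 = np.sum(np.abs(theta))   # normalization constant used by the patch
    p = np.zeros(k)                # p[j] = p_j(x), the j-th orthogonal polynomial
    weights = np.zeros(k)
    for i, x in enumerate(theta):
        p[0] = 1.0 / np.sqrt(mu_0)
        p[1] = (x - a[0]) * p[0] / b[1]
        for j in range(2, k):
            # p_j(x) = ((x - a_{j-1}) p_{j-1}(x) - b_{j-1} p_{j-2}(x)) / b_j
            p[j] = ((x - a[j - 1]) * p[j - 1] - b[j - 1] * p[j - 2]) / b[j]
        weights[i] = (1.0 / np.sum(p ** 2)) / mu_0
    return weights

## Sanity check mirroring the new tests: FTTR weights vs. squared first
## eigenvector components of a random symmetric tridiagonal matrix.
rng = np.random.default_rng(1234)
k = 8
alpha = rng.uniform(size=k)
beta = rng.uniform(size=k - 1)
T = np.diag(alpha) + np.diag(beta, -1) + np.diag(beta, +1)
ew, ev = np.linalg.eigh(T)
w_fttr = fttr_weights(ew, alpha, np.append(0.0, beta))
print(np.allclose(w_fttr, np.ravel(ev[0, :]) ** 2))  # expect True in float64 at this size
```

In double precision and at modest degree the two agree closely; repeating the same recurrence in single precision (or at larger degrees) can lose accuracy quickly, which is consistent with the commit's conclusion that FTTR is numerically unstable rather than incorrect.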