Add RNG to random projection structs
RNG defaults to Xoshiro256Plus if not provided by the user.
Also added tests for minimum dimension using values from scikit-learn.
GBathie committed Mar 1, 2024
1 parent 1999d0f commit 3569c10
Showing 10 changed files with 112 additions and 39 deletions.
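
In short, after this commit both projection types store an RNG in their hyperparameters: params() seeds a Xoshiro256Plus with 42 by default, params_with_rng(rng) accepts any Rng + Clone generator, and with_rng(rng) swaps the generator on an existing builder. A minimal usage sketch based on the diffs below; the all-zeros stand-in dataset and the prelude import are illustrative assumptions, not part of this commit:

use linfa::prelude::*;
use linfa_reduction::GaussianRandomProjection;
use ndarray::Array2;
use rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Placeholder records: 50 samples with 1000 features (values don't matter,
    // since fitting a random projection only looks at the shape).
    let dataset = linfa::DatasetBase::from(Array2::<f32>::zeros((50, 1000)));

    // Default constructor: a Xoshiro256Plus seeded with 42, so fits are reproducible.
    let proj = GaussianRandomProjection::<f32>::params()
        .target_dim(10)
        .fit(&dataset)?;
    let _reduced = proj.transform(&dataset);

    // Or inject a generator explicitly (anything Rng + Clone):
    let rng = Xoshiro256Plus::seed_from_u64(7);
    let _proj = GaussianRandomProjection::<f32>::params_with_rng(rng)
        .target_dim(10)
        .fit(&dataset)?;
    Ok(())
}

SparseRandomProjection gains the same params_with_rng and with_rng surface, as the sparse diffs below show.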
1 change: 1 addition & 0 deletions algorithms/linfa-reduction/Cargo.toml
@@ -45,6 +45,7 @@ rand = { version = "0.8", features = ["small_rng"] }
 linfa = { version = "0.7.0", path = "../.." }
 linfa-kernel = { version = "0.7.0", path = "../linfa-kernel" }
 sprs = "0.11.1"
+rand_xoshiro = "0.6.0"
 
 [dev-dependencies]
 ndarray-npy = { version = "0.8", default-features = false }
8 changes: 4 additions & 4 deletions algorithms/linfa-reduction/examples/gaussian_projection.rs
@@ -6,14 +6,16 @@ use linfa_trees::{DecisionTree, SplitQuality};
 
 use mnist::{MnistBuilder, NormalizedMnist};
 use ndarray::{Array1, Array2};
-use rand::thread_rng;
+use rand::SeedableRng;
+use rand_xoshiro::Xoshiro256Plus;
 
 /// Train a Decision tree on the MNIST data set, with and without dimensionality reduction.
 fn main() -> Result<(), Box<dyn Error>> {
     // Parameters
     let train_sz = 10_000usize;
     let test_sz = 1_000usize;
     let reduced_dim = 100;
+    let rng = Xoshiro256Plus::seed_from_u64(42);
 
     let NormalizedMnist {
         trn_img,
@@ -54,10 +56,8 @@ fn main() -> Result<(), Box<dyn Error>> {
     println!("Training reduced model...");
     let start = Instant::now();
     // Compute the random projection and train the model on the reduced dataset.
-    let rng = thread_rng();
-    let proj = GaussianRandomProjection::<f32>::params()
+    let proj = GaussianRandomProjection::<f32>::params_with_rng(rng)
         .target_dim(reduced_dim)
-        .with_rng(rng)
         .fit(&train_dataset)?;
     let reduced_train_ds = proj.transform(&train_dataset);
     let reduced_test_data = proj.transform(&test_data);
5 changes: 4 additions & 1 deletion algorithms/linfa-reduction/examples/sparse_projection.rs
@@ -6,13 +6,16 @@ use linfa_trees::{DecisionTree, SplitQuality};
 
 use mnist::{MnistBuilder, NormalizedMnist};
 use ndarray::{Array1, Array2};
+use rand::SeedableRng;
+use rand_xoshiro::Xoshiro256Plus;
 
 /// Train a Decision tree on the MNIST data set, with and without dimensionality reduction.
 fn main() -> Result<(), Box<dyn Error>> {
     // Parameters
     let train_sz = 10_000usize;
     let test_sz = 1_000usize;
     let reduced_dim = 100;
+    let rng = Xoshiro256Plus::seed_from_u64(42);
 
     let NormalizedMnist {
         trn_img,
@@ -53,7 +56,7 @@ fn main() -> Result<(), Box<dyn Error>> {
     println!("Training reduced model...");
     let start = Instant::now();
     // Compute the random projection and train the model on the reduced dataset.
-    let proj = SparseRandomProjection::<f32>::params()
+    let proj = SparseRandomProjection::<f32>::params_with_rng(rng)
         .target_dim(reduced_dim)
         .fit(&train_dataset)?;
     let reduced_train_ds = proj.transform(&train_dataset);
33 changes: 30 additions & 3 deletions algorithms/linfa-reduction/src/random_projection/common.rs
@@ -5,7 +5,34 @@
 /// - [D. Achlioptas, JCSS](https://www.sciencedirect.com/science/article/pii/S0022000003000254)
 /// - [Li et al., SIGKDD'06](https://hastie.su.domains/Papers/Ping/KDD06_rp.pdf)
 pub(crate) fn johnson_lindenstrauss_min_dim(n_samples: usize, eps: f64) -> usize {
-    let log_samples = (n_samples as f64).log2();
-    let value = 4. * log_samples * (eps.powi(2) / 2. - eps.powi(3) / 3.);
-    value.ceil() as usize
+    let log_samples = (n_samples as f64).ln();
+    let value = 4. * log_samples / (eps.powi(2) / 2. - eps.powi(3) / 3.);
+    value as usize
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    /// Test against values computed by the scikit-learn implementation
+    /// of `johnson_lindenstrauss_min_dim`.
+    fn test_johnson_lindenstrauss() {
+        assert_eq!(johnson_lindenstrauss_min_dim(100, 0.05), 15244);
+        assert_eq!(johnson_lindenstrauss_min_dim(100, 0.1), 3947);
+        assert_eq!(johnson_lindenstrauss_min_dim(100, 0.2), 1062);
+        assert_eq!(johnson_lindenstrauss_min_dim(100, 0.5), 221);
+        assert_eq!(johnson_lindenstrauss_min_dim(1000, 0.05), 22867);
+        assert_eq!(johnson_lindenstrauss_min_dim(1000, 0.1), 5920);
+        assert_eq!(johnson_lindenstrauss_min_dim(1000, 0.2), 1594);
+        assert_eq!(johnson_lindenstrauss_min_dim(1000, 0.5), 331);
+        assert_eq!(johnson_lindenstrauss_min_dim(5000, 0.05), 28194);
+        assert_eq!(johnson_lindenstrauss_min_dim(5000, 0.1), 7300);
+        assert_eq!(johnson_lindenstrauss_min_dim(5000, 0.2), 1965);
+        assert_eq!(johnson_lindenstrauss_min_dim(5000, 0.5), 408);
+        assert_eq!(johnson_lindenstrauss_min_dim(10000, 0.05), 30489);
+        assert_eq!(johnson_lindenstrauss_min_dim(10000, 0.1), 7894);
+        assert_eq!(johnson_lindenstrauss_min_dim(10000, 0.2), 2125);
+        assert_eq!(johnson_lindenstrauss_min_dim(10000, 0.5), 442);
+    }
+}
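
For reference, the corrected function implements the standard Johnson-Lindenstrauss lower bound, n_components >= 4 ln(n_samples) / (eps^2/2 - eps^3/3). The old version took log base 2 and multiplied by the epsilon polynomial instead of dividing, which gave drastically too-small dimensions: for n_samples = 100, eps = 0.1 it returned ceil(4 * 6.64 * 0.00467) = 1 instead of 3947. A standalone check of the first test value, repeating the arithmetic in plain Rust:

fn main() {
    let (n_samples, eps) = (100usize, 0.1f64);
    let value = 4. * (n_samples as f64).ln() / (eps.powi(2) / 2. - eps.powi(3) / 3.);
    // 4 * ln(100) / (0.005 - 0.001/3) = 18.4207 / 0.0046667 = 3947.28...,
    // truncated to 3947, matching scikit-learn's johnson_lindenstrauss_min_dim(100, eps=0.1).
    assert_eq!(value as usize, 3947);
}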
@@ -4,7 +4,8 @@ use ndarray_rand::{
     rand_distr::{Normal, StandardNormal},
     RandomExt,
 };
-use rand::{prelude::Distribution, rngs::SmallRng, Rng};
+use rand::{prelude::Distribution, Rng, SeedableRng};
+use rand_xoshiro::Xoshiro256Plus;
 
 use super::super::common::johnson_lindenstrauss_min_dim;
 use super::hyperparams::GaussianRandomProjectionParamsInner;
@@ -28,6 +29,7 @@ where
     fn fit(&self, dataset: &linfa::DatasetBase<Rec, T>) -> Result<Self::Object, ReductionError> {
         let n_samples = dataset.nsamples();
         let n_features = dataset.nfeatures();
+        let mut rng = self.rng.clone();
 
         let n_dims = match &self.params {
             GaussianRandomProjectionParamsInner::Dimension { target_dim } => *target_dim,
@@ -39,22 +41,31 @@ where
         let std_dev = F::cast(n_features).sqrt().recip();
         let gaussian = Normal::new(F::zero(), std_dev)?;
 
-        let proj = match self.rng.clone() {
-            Some(mut rng) => Array::random_using((n_features, n_dims), gaussian, &mut rng),
-            None => Array::random((n_features, n_dims), gaussian),
-        };
+        let proj = Array::random_using((n_features, n_dims), gaussian, &mut rng);
 
         Ok(GaussianRandomProjection { projection: proj })
     }
 }
 
 impl<F: Float> GaussianRandomProjection<F> {
+    /// Create new parameters for a [`GaussianRandomProjection`] with default value
+    /// `precision = 0.1` and a [`Xoshiro256Plus`] RNG.
+    pub fn params() -> GaussianRandomProjectionParams<Xoshiro256Plus> {
+        GaussianRandomProjectionParams(GaussianRandomProjectionValidParams {
+            params: GaussianRandomProjectionParamsInner::Precision { precision: 0.1 },
+            rng: Xoshiro256Plus::seed_from_u64(42),
+        })
+    }
+
     /// Create new parameters for a [`GaussianRandomProjection`] with default values
-    /// `precision = 0.1` and no custom [`Rng`] provided.
-    pub fn params() -> GaussianRandomProjectionParams<SmallRng> {
+    /// `precision = 0.1` and the provided [`Rng`].
+    pub fn params_with_rng<R>(rng: R) -> GaussianRandomProjectionParams<R>
+    where
+        R: Rng + Clone,
+    {
         GaussianRandomProjectionParams(GaussianRandomProjectionValidParams {
             params: GaussianRandomProjectionParamsInner::Precision { precision: 0.1 },
-            rng: None,
+            rng,
        })
    }
 }
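
As the fit above shows, the Gaussian variant draws an n_features-by-n_dims matrix of i.i.d. normal entries with std_dev = 1/sqrt(n_features), now always through the stored RNG. A self-contained sketch of just that sampling step; the sizes are hypothetical, the crates and calls match the imports above:

use ndarray::Array2;
use ndarray_rand::{rand_distr::Normal, RandomExt};
use rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

fn main() {
    let (n_features, n_dims) = (1000, 100); // hypothetical sizes
    let std_dev = (n_features as f64).sqrt().recip();
    let gaussian = Normal::new(0.0, std_dev).unwrap();
    // Cloning the stored generator, as fit() does, keeps the params struct
    // reusable and makes repeated fits reproducible.
    let mut rng = Xoshiro256Plus::seed_from_u64(42);
    let proj: Array2<f64> = Array2::random_using((n_features, n_dims), gaussian, &mut rng);
    assert_eq!(proj.dim(), (1000, 100));
}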
@@ -43,12 +43,10 @@ impl<R: Rng + Clone> GaussianRandomProjectionParams<R> {
     }
 
     /// Specify the random number generator to use to generate the projection matrix.
-    ///
-    /// Optional: if no RNG is specified, uses the default RNG in [ndarray_rand::RandomExt].
     pub fn with_rng<R2: Rng + Clone>(self, rng: R2) -> GaussianRandomProjectionParams<R2> {
         GaussianRandomProjectionParams(GaussianRandomProjectionValidParams {
             params: self.0.params,
-            rng: Some(rng),
+            rng,
         })
     }
 }
@@ -68,7 +66,7 @@ impl<R: Rng + Clone> GaussianRandomProjectionParams<R> {
 #[derive(Debug, Clone, PartialEq)]
 pub struct GaussianRandomProjectionValidParams<R: Rng + Clone> {
     pub(super) params: GaussianRandomProjectionParamsInner,
-    pub(super) rng: Option<R>,
+    pub(super) rng: R,
 }
 
 /// Internal data structure that either holds the dimension or the embedding,
@@ -107,8 +105,8 @@ impl<R: Rng + Clone> GaussianRandomProjectionValidParams<R> {
         self.params.eps()
     }
 
-    pub fn rng(&self) -> Option<&R> {
-        self.rng.as_ref()
+    pub fn rng(&self) -> &R {
+        &self.rng
     }
 }
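
One detail worth noting: with_rng returns a params struct with a new type parameter (R to R2), so swapping generators changes the builder's type rather than mutating it in place. A small sketch, assuming rand's SmallRng (the small_rng feature is already enabled in Cargo.toml above):

use linfa_reduction::GaussianRandomProjection;
use rand::{rngs::SmallRng, SeedableRng};

fn main() {
    // GaussianRandomProjectionParams<Xoshiro256Plus>...
    let params = GaussianRandomProjection::<f32>::params();
    // ...becomes GaussianRandomProjectionParams<SmallRng> after with_rng.
    let _params = params.with_rng(SmallRng::seed_from_u64(0));
}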
11 changes: 6 additions & 5 deletions algorithms/linfa-reduction/src/random_projection/mod.rs
@@ -39,23 +39,24 @@ pub use sparse::{
 #[cfg(test)]
 mod tests {
     use super::*;
-    use rand::rngs::SmallRng;
+
+    use rand_xoshiro::Xoshiro256Plus;
 
     #[test]
     fn autotraits_gaussian() {
         fn has_autotraits<T: Send + Sync + Sized + Unpin>() {}
         has_autotraits::<GaussianRandomProjection<f64>>();
         has_autotraits::<GaussianRandomProjection<f32>>();
-        has_autotraits::<GaussianRandomProjectionValidParams<SmallRng>>();
-        has_autotraits::<GaussianRandomProjectionParams<SmallRng>>();
+        has_autotraits::<GaussianRandomProjectionValidParams<Xoshiro256Plus>>();
+        has_autotraits::<GaussianRandomProjectionParams<Xoshiro256Plus>>();
     }
 
     #[test]
     fn autotraits_sparse() {
         fn has_autotraits<T: Send + Sync + Sized + Unpin>() {}
         has_autotraits::<SparseRandomProjection<f64>>();
         has_autotraits::<SparseRandomProjection<f32>>();
-        has_autotraits::<SparseRandomProjectionValidParams>();
-        has_autotraits::<SparseRandomProjectionParams>();
+        has_autotraits::<SparseRandomProjectionValidParams<Xoshiro256Plus>>();
+        has_autotraits::<SparseRandomProjectionParams<Xoshiro256Plus>>();
     }
 }
@@ -1,5 +1,5 @@
 /// Macro that implements [`linfa::traits::Transformer`]
-/// for [`GaussianRandomProjection`] and [`SparseRandomProjection`],
+/// for [`super::GaussianRandomProjection`] and [`super::SparseRandomProjection`],
 /// to avoid some code duplication.
 #[macro_export]
 macro_rules! impl_proj {
@@ -1,7 +1,9 @@
 use linfa::{prelude::Records, traits::Fit, Float};
 use ndarray::Ix2;
 use ndarray_rand::rand_distr::StandardNormal;
-use rand::{distributions::Bernoulli, prelude::Distribution, thread_rng, Rng};
+use rand::SeedableRng;
+use rand::{distributions::Bernoulli, prelude::Distribution, Rng};
+use rand_xoshiro::Xoshiro256Plus;
 use sprs::{CsMat, TriMat};
 
 use super::super::common::johnson_lindenstrauss_min_dim;
@@ -14,17 +16,19 @@ pub struct SparseRandomProjection<F: Float> {
     projection: CsMat<F>,
 }
 
-impl<F, Rec, T> Fit<Rec, T, ReductionError> for SparseRandomProjectionValidParams
+impl<F, Rec, T, R> Fit<Rec, T, ReductionError> for SparseRandomProjectionValidParams<R>
 where
     F: Float,
     Rec: Records<Elem = F>,
     StandardNormal: Distribution<F>,
+    R: Rng + Clone,
 {
     type Object = SparseRandomProjection<F>;
 
     fn fit(&self, dataset: &linfa::DatasetBase<Rec, T>) -> Result<Self::Object, ReductionError> {
         let n_samples = dataset.nsamples();
         let n_features = dataset.nfeatures();
+        let mut rng = self.rng.clone();
 
         let n_dims = match &self.params {
             SparseRandomProjectionParamsInner::Dimension { target_dim } => *target_dim,
@@ -36,7 +40,6 @@ where
         let scale = (n_features as f64).sqrt();
         let p = 1f64 / scale;
         let dist = SparseDistribution::new(F::cast(scale), p);
-        let mut rng = thread_rng();
 
         let (mut row_inds, mut col_inds, mut values) = (Vec::new(), Vec::new(), Vec::new());
         for row in 0..n_features {
@@ -90,10 +93,23 @@ impl<F: Float> Distribution<Option<F>> for SparseDistribution<F> {
 
 impl<F: Float> SparseRandomProjection<F> {
     /// Create new parameters for a [`SparseRandomProjection`] with default value
-    /// `precision = 0.1`.
-    pub fn params() -> SparseRandomProjectionParams {
+    /// `precision = 0.1` and a [`Xoshiro256Plus`] RNG.
+    pub fn params() -> SparseRandomProjectionParams<Xoshiro256Plus> {
         SparseRandomProjectionParams(SparseRandomProjectionValidParams {
             params: SparseRandomProjectionParamsInner::Precision { precision: 0.1 },
+            rng: Xoshiro256Plus::seed_from_u64(42),
         })
     }
+
+    /// Create new parameters for a [`SparseRandomProjection`] with default values
+    /// `precision = 0.1` and the provided [`Rng`].
+    pub fn params_with_rng<R>(rng: R) -> SparseRandomProjectionParams<R>
+    where
+        R: Rng + Clone,
+    {
+        SparseRandomProjectionParams(SparseRandomProjectionValidParams {
+            params: SparseRandomProjectionParamsInner::Precision { precision: 0.1 },
+            rng,
+        })
+    }
 }
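
For intuition, this follows the sparse projections of Achlioptas and Li et al. cited in common.rs: with scale = sqrt(n_features), an entry is nonzero with probability p = 1/scale, hence the Option<F> return type, where None marks a structural zero that never enters the TriMat. Below is a self-contained sketch of such a sampler. The fair sign coin is an assumption (it matches the symmetric distributions of the cited papers), and the value/p parameters mirror what fit() passes to SparseDistribution; this illustrates the scheme rather than reproducing the crate's exact implementation:

use rand::{distributions::Bernoulli, prelude::Distribution, Rng, SeedableRng};
use rand_xoshiro::Xoshiro256Plus;

/// Illustration of a plus/minus-value-or-zero entry distribution, mirroring
/// the parameters fit() uses: value = sqrt(n_features), p = 1/sqrt(n_features).
/// None stands for a structural zero.
struct PlusMinusOrZero {
    value: f64,
    nonzero: Bernoulli,
}

impl PlusMinusOrZero {
    fn new(n_features: usize) -> Self {
        let scale = (n_features as f64).sqrt();
        PlusMinusOrZero {
            value: scale,
            nonzero: Bernoulli::new(1.0 / scale).unwrap(),
        }
    }
}

impl Distribution<Option<f64>> for PlusMinusOrZero {
    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Option<f64> {
        if self.nonzero.sample(rng) {
            // Assumed fair sign coin, as in the symmetric schemes cited above.
            Some(if rng.gen::<bool>() { self.value } else { -self.value })
        } else {
            None
        }
    }
}

fn main() {
    let mut rng = Xoshiro256Plus::seed_from_u64(42);
    let dist = PlusMinusOrZero::new(784);
    let nonzeros = (0..10_000)
        .map(|_| dist.sample(&mut rng))
        .filter(Option::is_some)
        .count();
    // Expected density is 1/sqrt(784) = 1/28, i.e. roughly 357 of 10_000 entries.
    println!("{nonzeros} nonzero entries out of 10000");
}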
@@ -1,6 +1,7 @@
 use std::fmt::Debug;
 
 use linfa::ParamGuard;
+use rand::Rng;
 
 use crate::ReductionError;
 
@@ -13,9 +14,11 @@ use crate::ReductionError;
 /// However, this lemma makes a very conservative estimate of the required dimension,
 /// and does not leverage the structure of the data, therefore it is also possible
 /// to manually specify the dimension of the embedding.
-pub struct SparseRandomProjectionParams(pub(crate) SparseRandomProjectionValidParams);
+pub struct SparseRandomProjectionParams<R: Rng + Clone>(
+    pub(crate) SparseRandomProjectionValidParams<R>,
+);
 
-impl SparseRandomProjectionParams {
+impl<R: Rng + Clone> SparseRandomProjectionParams<R> {
     /// Set the dimension of output of the embedding.
     ///
     /// Setting the target dimension with this function
@@ -35,6 +38,14 @@ impl SparseRandomProjectionParams {
 
         self
     }
+
+    /// Specify the random number generator to use to generate the projection matrix.
+    pub fn with_rng<R2: Rng + Clone>(self, rng: R2) -> SparseRandomProjectionParams<R2> {
+        SparseRandomProjectionParams(SparseRandomProjectionValidParams {
+            params: self.0.params,
+            rng,
+        })
+    }
 }
 
 /// Sparse random projection hyperparameters
@@ -47,8 +58,9 @@ impl SparseRandomProjectionParams {
 /// and does not leverage the structure of the data, therefore it is also possible
 /// to manually specify the dimension of the embedding.
 #[derive(Debug, Clone, PartialEq)]
-pub struct SparseRandomProjectionValidParams {
+pub struct SparseRandomProjectionValidParams<R> {
     pub(super) params: SparseRandomProjectionParamsInner,
+    pub(super) rng: R,
 }
 
 /// Internal data structure that either holds the dimension or the embedding,
@@ -78,18 +90,22 @@ impl SparseRandomProjectionParamsInner {
     }
 }
 
-impl SparseRandomProjectionValidParams {
+impl<R: Rng + Clone> SparseRandomProjectionValidParams<R> {
     pub fn target_dim(&self) -> Option<usize> {
         self.params.target_dim()
     }
 
     pub fn precision(&self) -> Option<f64> {
         self.params.eps()
     }
+
+    pub fn rng(&self) -> &R {
+        &self.rng
+    }
 }
 
-impl ParamGuard for SparseRandomProjectionParams {
-    type Checked = SparseRandomProjectionValidParams;
+impl<R: Rng + Clone> ParamGuard for SparseRandomProjectionParams<R> {
+    type Checked = SparseRandomProjectionValidParams<R>;
     type Error = ReductionError;
 
     fn check_ref(&self) -> Result<&Self::Checked, Self::Error> {
