Feat: Implement Random Projections (#332)
* Add random projection algorithms for dimensionality reduction.

Contains two algorithms based on variants of the Johnson-Lindenstrauss lemma (the dimension bound is sketched after this list):
- Random projections with Gaussian coefficients
- Sparse random projections with +/- 1 coefficients (multiplied by a scaling factor).
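
As a reference point, the minimum embedding dimension implied by the lemma follows the standard bound (the same one scikit-learn uses, and the one the new tests below compare against). A minimal sketch, assuming plain f64 arithmetic; the PR's actual `johnson_lindenstrauss_min_dim` lives in `random_projection/common.rs`, which is not expanded on this page:

```rust
/// Minimal sketch of the Johnson-Lindenstrauss bound (hypothetical helper;
/// the PR's real `johnson_lindenstrauss_min_dim` is in
/// `random_projection/common.rs`, not expanded on this page).
/// Projecting n points into
///     k >= 4 ln(n) / (eps^2 / 2 - eps^3 / 3)
/// dimensions preserves all pairwise distances within a (1 +/- eps) factor
/// with high probability.
fn jl_min_dim(n_samples: usize, eps: f64) -> usize {
    let denom = eps.powi(2) / 2.0 - eps.powi(3) / 3.0;
    (4.0 * (n_samples as f64).ln() / denom).ceil() as usize
}
```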

* Update readme

* Add RNG to random projection structs

RNG defaults to Xoshiro256Plus if not provided by user.
Also added tests for minimum dimension using values from scikit-learn.

* Check that random projections actually reduce the dimension of the data.

* Use fixed dimension in error tests

* Refactor random projections code
GBathie authored Mar 30, 2024
1 parent a29abe8 commit 2eaa686
Showing 11 changed files with 726 additions and 1 deletion.
9 changes: 8 additions & 1 deletion algorithms/linfa-reduction/Cargo.toml
@@ -1,7 +1,10 @@
[package]
name = "linfa-reduction"
version = "0.7.0"
authors = ["Lorenz Schmidt <[email protected]>"]
authors = [
"Lorenz Schmidt <[email protected]>",
"Gabriel Bathie <[email protected]>",
]
description = "A collection of dimensionality reduction techniques"
edition = "2018"
license = "MIT OR Apache-2.0"
@@ -41,6 +44,8 @@ rand = { version = "0.8", features = ["small_rng"] }

linfa = { version = "0.7.0", path = "../.." }
linfa-kernel = { version = "0.7.0", path = "../linfa-kernel" }
sprs = "0.11.1"
rand_xoshiro = "0.6.0"

[dev-dependencies]
ndarray-npy = { version = "0.8", default-features = false }
@@ -49,3 +54,5 @@ linfa-datasets = { version = "0.7.0", path = "../../datasets", features = [
"generate",
] }
approx = { version = "0.4" }
mnist = { version = "0.6.0", features = ["download"] }
linfa-trees = { version = "0.7.0", path = "../linfa-trees"}
4 changes: 4 additions & 0 deletions algorithms/linfa-reduction/README.md
@@ -11,6 +11,8 @@
`linfa-reduction` currently provides an implementation of the following dimensionality reduction methods:
- Diffusion Mapping
- Principal Component Analysis (PCA)
- Gaussian random projections
- Sparse random projections

## Examples

@@ -19,6 +21,8 @@ There are usage examples in the `examples/` directory. To run them, use:
```bash
$ cargo run --release --example diffusion_map
$ cargo run --release --example pca
$ cargo run --release --example gaussian_projection
$ cargo run --release --example sparse_projection
```

## BLAS/LAPACK backend
76 changes: 76 additions & 0 deletions algorithms/linfa-reduction/examples/gaussian_projection.rs
@@ -0,0 +1,76 @@
use std::{error::Error, time::Instant};

use linfa::prelude::*;
use linfa_reduction::random_projection::GaussianRandomProjection;
use linfa_trees::{DecisionTree, SplitQuality};

use mnist::{MnistBuilder, NormalizedMnist};
use ndarray::{Array1, Array2};
use rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

/// Train a Decision tree on the MNIST data set, with and without dimensionality reduction.
fn main() -> Result<(), Box<dyn Error>> {
// Parameters
let train_sz = 10_000usize;
let test_sz = 1_000usize;
let reduced_dim = 100;
let rng = Xoshiro256Plus::seed_from_u64(42);

let NormalizedMnist {
trn_img,
trn_lbl,
tst_img,
tst_lbl,
..
} = MnistBuilder::new()
.label_format_digit()
.training_set_length(train_sz as u32)
.test_set_length(test_sz as u32)
.download_and_extract()
.finalize()
.normalize();

let train_data = Array2::from_shape_vec((train_sz, 28 * 28), trn_img)?;
let train_labels: Array1<usize> =
Array1::from_shape_vec(train_sz, trn_lbl)?.map(|x| *x as usize);
let train_dataset = Dataset::new(train_data, train_labels);

let test_data = Array2::from_shape_vec((test_sz, 28 * 28), tst_img)?;
let test_labels: Array1<usize> = Array1::from_shape_vec(test_sz, tst_lbl)?.map(|x| *x as usize);

let params = DecisionTree::params()
.split_quality(SplitQuality::Gini)
.max_depth(Some(10));

println!("Training non-reduced model...");
let start = Instant::now();
let model: DecisionTree<f32, usize> = params.fit(&train_dataset)?;

let end = start.elapsed();
let pred_y = model.predict(&test_data);
let cm = pred_y.confusion_matrix(&test_labels)?;
println!("Non-reduced model precision: {}%", 100.0 * cm.accuracy());
println!("Training time: {:.2}s\n", end.as_secs_f32());

println!("Training reduced model...");
let start = Instant::now();
// Compute the random projection and train the model on the reduced dataset.
let proj = GaussianRandomProjection::<f32>::params_with_rng(rng)
.target_dim(reduced_dim)
.fit(&train_dataset)?;
let reduced_train_ds = proj.transform(&train_dataset);
let reduced_test_data = proj.transform(&test_data);
let model_reduced: DecisionTree<f32, usize> = params.fit(&reduced_train_ds)?;

let end = start.elapsed();
let pred_reduced = model_reduced.predict(&reduced_test_data);
let cm_reduced = pred_reduced.confusion_matrix(&test_labels)?;
println!(
"Reduced model accuracy: {}%",
100.0 * cm_reduced.accuracy()
);
println!("Reduction + training time: {:.2}s", end.as_secs_f32());

Ok(())
}
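
For the Gaussian variant, the projection matrix behind `GaussianRandomProjection` is the classic JL construction: i.i.d. entries drawn from N(0, 1/k) for target dimension k, so squared norms are preserved in expectation. A hedged sketch using the `ndarray-rand` API (the PR's real generation code is in `random_projection/methods.rs`, not expanded on this page; `gaussian_matrix` is a hypothetical helper):

```rust
use ndarray::Array2;
use ndarray_rand::{rand_distr::Normal, RandomExt};
use rand::Rng;

/// Hypothetical sketch of Gaussian projection-matrix generation; the PR's
/// real code lives in `random_projection/methods.rs`. Entries are i.i.d.
/// N(0, 1/n_dims), so `x.dot(&m)` preserves squared norms in expectation.
fn gaussian_matrix<R: Rng>(n_features: usize, n_dims: usize, rng: &mut R) -> Array2<f64> {
    let std_dev = (1.0 / n_dims as f64).sqrt();
    let distr = Normal::new(0.0, std_dev).expect("positive standard deviation");
    Array2::random_using((n_features, n_dims), distr, rng)
}
```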
76 changes: 76 additions & 0 deletions algorithms/linfa-reduction/examples/sparse_projection.rs
@@ -0,0 +1,76 @@
use std::{error::Error, time::Instant};

use linfa::prelude::*;
use linfa_reduction::random_projection::SparseRandomProjection;
use linfa_trees::{DecisionTree, SplitQuality};

use mnist::{MnistBuilder, NormalizedMnist};
use ndarray::{Array1, Array2};
use rand::SeedableRng;
use rand_xoshiro::Xoshiro256Plus;

/// Train a Decision tree on the MNIST data set, with and without dimensionality reduction.
fn main() -> Result<(), Box<dyn Error>> {
// Parameters
let train_sz = 10_000usize;
let test_sz = 1_000usize;
let reduced_dim = 100;
let rng = Xoshiro256Plus::seed_from_u64(42);

let NormalizedMnist {
trn_img,
trn_lbl,
tst_img,
tst_lbl,
..
} = MnistBuilder::new()
.label_format_digit()
.training_set_length(train_sz as u32)
.test_set_length(test_sz as u32)
.download_and_extract()
.finalize()
.normalize();

let train_data = Array2::from_shape_vec((train_sz, 28 * 28), trn_img)?;
let train_labels: Array1<usize> =
Array1::from_shape_vec(train_sz, trn_lbl)?.map(|x| *x as usize);
let train_dataset = Dataset::new(train_data, train_labels);

let test_data = Array2::from_shape_vec((test_sz, 28 * 28), tst_img)?;
let test_labels: Array1<usize> = Array1::from_shape_vec(test_sz, tst_lbl)?.map(|x| *x as usize);

let params = DecisionTree::params()
.split_quality(SplitQuality::Gini)
.max_depth(Some(10));

println!("Training non-reduced model...");
let start = Instant::now();
let model: DecisionTree<f32, usize> = params.fit(&train_dataset)?;

let end = start.elapsed();
let pred_y = model.predict(&test_data);
let cm = pred_y.confusion_matrix(&test_labels)?;
println!("Non-reduced model precision: {}%", 100.0 * cm.accuracy());
println!("Training time: {:.2}s\n", end.as_secs_f32());

println!("Training reduced model...");
let start = Instant::now();
// Compute the random projection and train the model on the reduced dataset.
let proj = SparseRandomProjection::<f32>::params_with_rng(rng)
.target_dim(reduced_dim)
.fit(&train_dataset)?;
let reduced_train_ds = proj.transform(&train_dataset);
let reduced_test_data = proj.transform(&test_data);
let model_reduced: DecisionTree<f32, usize> = params.fit(&reduced_train_ds)?;

let end = start.elapsed();
let pred_reduced = model_reduced.predict(&reduced_test_data);
let cm_reduced = pred_reduced.confusion_matrix(&test_labels)?;
println!(
"Reduced model accuracy: {}%",
100.0 * cm_reduced.accuracy()
);
println!("Reduction + training time: {:.2}s", end.as_secs_f32());

Ok(())
}
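
The sparse variant replaces the dense Gaussian matrix with ±1 coefficients in the Achlioptas/Li style, which is cheaper to sample and to multiply. A hedged sketch of per-entry sampling, assuming a scikit-learn-style density d = 1/sqrt(n_features) (the PR's real code is in `random_projection/methods.rs`, not expanded on this page; `sparse_entry` is a hypothetical helper):

```rust
use rand::Rng;

/// Hypothetical sketch of sparse coefficient sampling. Each entry of the
/// `n_features x n_dims` matrix is
///     +sqrt(1 / (density * n_dims)) with probability density / 2,
///     -sqrt(1 / (density * n_dims)) with probability density / 2,
///      0                            otherwise,
/// with density commonly set to 1 / sqrt(n_features).
fn sparse_entry<R: Rng>(rng: &mut R, density: f64, n_dims: usize) -> f64 {
    let scale = (1.0 / (density * n_dims as f64)).sqrt();
    let u: f64 = rng.gen::<f64>();
    if u < density / 2.0 {
        scale
    } else if u < density {
        -scale
    } else {
        0.0
    }
}
```

In practice the nonzero entries would be collected directly into an `sprs` sparse matrix (hence the new `sprs = "0.11.1"` dependency above) rather than materialized densely.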
8 changes: 8 additions & 0 deletions algorithms/linfa-reduction/src/error.rs
@@ -18,4 +18,12 @@ pub enum ReductionError {
LinalgError(#[from] linfa_linalg::LinalgError),
#[error(transparent)]
LinfaError(#[from] linfa::error::Error),
#[error(transparent)]
NdarrayRandError(#[from] ndarray_rand::rand_distr::NormalError),
#[error("Precision parameter must be in the interval (0; 1)")]
InvalidPrecision,
#[error("Target dimension of the projection must be positive")]
NonPositiveEmbeddingSize,
#[error("Target dimension {0} is larger than the number of features {1}.")]
DimensionIncrease(usize, usize),
}
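
The three new variants back the parameter checks added in this PR: `eps` must lie in (0, 1), the target dimension must be positive, and, as the `fit` implementation below enforces, a projection may only shrink the feature dimension. A hedged sketch of how the last variant surfaces (shapes chosen to mimic the 784-feature MNIST examples above; the eps/dimension validation itself lives in `hyperparams.rs`, not expanded on this page, and the `ReductionError` import path is assumed from the crate's re-exports):

```rust
use linfa::prelude::*;
use linfa_reduction::random_projection::GaussianRandomProjection;
use linfa_reduction::ReductionError;
use ndarray::{Array1, Array2};

// Hedged sketch: requesting more output dimensions than input features
// should fail with `DimensionIncrease(target_dim, n_features)`.
fn dimension_check_demo() {
    let dataset = Dataset::new(Array2::<f32>::zeros((10, 784)), Array1::<usize>::zeros(10));
    let res = GaussianRandomProjection::<f32>::params()
        .target_dim(1_000)
        .fit(&dataset);
    assert!(matches!(
        res,
        Err(ReductionError::DimensionIncrease(1_000, 784))
    ));
}
```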
1 change: 1 addition & 0 deletions algorithms/linfa-reduction/src/lib.rs
@@ -6,6 +6,7 @@ extern crate ndarray;
mod diffusion_map;
mod error;
mod pca;
pub mod random_projection;
pub mod utils;

pub use diffusion_map::{DiffusionMap, DiffusionMapParams, DiffusionMapValidParams};
169 changes: 169 additions & 0 deletions algorithms/linfa-reduction/src/random_projection/algorithms.rs
@@ -0,0 +1,169 @@
use std::marker::PhantomData;

use linfa::{
dataset::{AsTargets, FromTargetArray},
prelude::Records,
traits::{Fit, Transformer},
DatasetBase, Float,
};
use ndarray::{linalg::Dot, Array2, ArrayBase, Data, Ix2};

use rand::{prelude::Distribution, Rng, SeedableRng};
use rand_xoshiro::Xoshiro256Plus;

use super::hyperparams::RandomProjectionParamsInner;
use super::{common::johnson_lindenstrauss_min_dim, methods::ProjectionMethod};
use super::{RandomProjectionParams, RandomProjectionValidParams};
use crate::ReductionError;

/// Embedding via random projection
pub struct RandomProjection<Proj: ProjectionMethod, F: Float>
where
Proj::RandomDistribution: Distribution<F>,
{
projection: Proj::ProjectionMatrix<F>,
}

impl<F, Proj, Rec, T, R> Fit<Rec, T, ReductionError> for RandomProjectionValidParams<Proj, R>
where
F: Float,
Proj: ProjectionMethod,
Rec: Records<Elem = F>,
R: Rng + Clone,
Proj::RandomDistribution: Distribution<F>,
{
type Object = RandomProjection<Proj, F>;

fn fit(&self, dataset: &linfa::DatasetBase<Rec, T>) -> Result<Self::Object, ReductionError> {
let n_samples = dataset.nsamples();
let n_features = dataset.nfeatures();
let mut rng = self.rng.clone();

let n_dims = match &self.params {
RandomProjectionParamsInner::Dimension { target_dim } => *target_dim,
RandomProjectionParamsInner::Epsilon { eps } => {
johnson_lindenstrauss_min_dim(n_samples, *eps)
}
};

if n_dims > n_features {
return Err(ReductionError::DimensionIncrease(n_dims, n_features));
}

let projection = Proj::generate_matrix(n_features, n_dims, &mut rng)?;

Ok(RandomProjection { projection })
}
}

impl<Proj: ProjectionMethod, F: Float> RandomProjection<Proj, F>
where
Proj::RandomDistribution: Distribution<F>,
{
/// Create new parameters for a [`RandomProjection`] with the default value
/// `eps = 0.1` and a [`Xoshiro256Plus`] RNG.
pub fn params() -> RandomProjectionParams<Proj, Xoshiro256Plus> {
RandomProjectionParams(RandomProjectionValidParams {
params: RandomProjectionParamsInner::Epsilon { eps: 0.1 },
rng: Xoshiro256Plus::seed_from_u64(42),
marker: PhantomData,
})
}

/// Create new parameters for a [`RandomProjection`] with the default value
/// `eps = 0.1` and the provided [`Rng`].
pub fn params_with_rng<R>(rng: R) -> RandomProjectionParams<Proj, R>
where
R: Rng + Clone,
{
RandomProjectionParams(RandomProjectionValidParams {
params: RandomProjectionParamsInner::Epsilon { eps: 0.1 },
rng,
marker: PhantomData,
})
}
}

impl<Proj, F, D> Transformer<&ArrayBase<D, Ix2>, Array2<F>> for RandomProjection<Proj, F>
where
Proj: ProjectionMethod,
F: Float,
D: Data<Elem = F>,
ArrayBase<D, Ix2>: Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
Proj::RandomDistribution: Distribution<F>,
{
/// Compute the embedding of a two-dimensional array
fn transform(&self, x: &ArrayBase<D, Ix2>) -> Array2<F> {
x.dot(&self.projection)
}
}

impl<Proj, F, D> Transformer<ArrayBase<D, Ix2>, Array2<F>> for RandomProjection<Proj, F>
where
Proj: ProjectionMethod,
F: Float,
D: Data<Elem = F>,
ArrayBase<D, Ix2>: Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
Proj::RandomDistribution: Distribution<F>,
{
/// Compute the embedding of a two-dimensional array
fn transform(&self, x: ArrayBase<D, Ix2>) -> Array2<F> {
self.transform(&x)
}
}

impl<Proj, F, T> Transformer<DatasetBase<Array2<F>, T>, DatasetBase<Array2<F>, T>>
for RandomProjection<Proj, F>
where
Proj: ProjectionMethod,
F: Float,
T: AsTargets,
for<'a> ArrayBase<ndarray::ViewRepr<&'a F>, Ix2>:
Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
Proj::RandomDistribution: Distribution<F>,
{
/// Compute the embedding of a dataset
///
/// # Parameter
///
/// * `data`: a dataset
///
/// # Returns
///
/// New dataset, with data equal to the embedding of the input data
fn transform(&self, data: DatasetBase<Array2<F>, T>) -> DatasetBase<Array2<F>, T> {
let new_records = self.transform(data.records().view());

DatasetBase::new(new_records, data.targets)
}
}

impl<'a, Proj, F, L, T> Transformer<&'a DatasetBase<Array2<F>, T>, DatasetBase<Array2<F>, T::View>>
for RandomProjection<Proj, F>
where
Proj: ProjectionMethod,
F: Float,
L: 'a,
T: AsTargets<Elem = L> + FromTargetArray<'a>,
for<'b> ArrayBase<ndarray::ViewRepr<&'b F>, Ix2>:
Dot<Proj::ProjectionMatrix<F>, Output = Array2<F>>,
Proj::RandomDistribution: Distribution<F>,
{
/// Compute the embedding of a dataset
///
/// # Parameter
///
/// * `data`: a dataset
///
/// # Returns
///
/// New dataset, with data equal to the embedding of the input data
fn transform(&self, data: &'a DatasetBase<Array2<F>, T>) -> DatasetBase<Array2<F>, T::View> {
let new_records = self.transform(data.records().view());

DatasetBase::new(
new_records,
T::new_targets_view(AsTargets::as_targets(data)),
)
}
}
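
The `GaussianRandomProjection` and `SparseRandomProjection` types used in the examples are presumably thin aliases over this generic `RandomProjection`, each fixing a concrete `ProjectionMethod`. A hedged sketch (the alias definitions and the method types live in commit files not expanded on this page, so `Gaussian` and `Sparse` are assumed names):

```rust
// Hypothetical aliases; the real definitions are in parts of this commit
// not expanded above, and `Gaussian`/`Sparse` are assumed names for the
// `ProjectionMethod` implementors in `random_projection/methods.rs`.
pub type GaussianRandomProjection<F> = RandomProjection<Gaussian, F>;
pub type SparseRandomProjection<F> = RandomProjection<Sparse, F>;
```

With those in place, the flow in both MNIST examples is exactly the API above: `params_with_rng(rng).target_dim(k).fit(&dataset)?` to draw the projection matrix, then the `Transformer` impls to embed datasets, owned arrays, or views.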