Bootstrap Aggregation #229

Open · wants to merge 4 commits into master
Changes from 2 commits
36 changes: 36 additions & 0 deletions algorithms/linfa-ensemble/Cargo.toml
@@ -0,0 +1,36 @@
[package]
name = "linfa-ensemble"
version = "0.6.0"
edition = "2018"
authors = ["James Knight <[email protected]>", "James Kay <[email protected]>"]
description = "A general method for creating ensemble classifiers"
license = "MIT/Apache-2.0"

repository = "https://github.com/rust-ml/linfa"
readme = "README.md"

keywords = ["machine-learning", "linfa", "ensemble"]
categories = ["algorithms", "mathematics", "science"]

[features]
default = []
serde = ["serde_crate", "ndarray/serde"]

[dependencies.serde_crate]
package = "serde"
optional = true
version = "1.0"
default-features = false
features = ["std", "derive"]

[dependencies]
ndarray = { version = "0.15", features = ["rayon", "approx"] }
ndarray-rand = "0.14"
rand = "0.8.5"

linfa = { version = "0.6.0", path = "../.." }
linfa-trees = { version = "0.6.0", path = "../linfa-trees" }

[dev-dependencies]
linfa-datasets = { version = "0.6.0", path = "../../datasets/", features = ["iris"] }

21 changes: 21 additions & 0 deletions algorithms/linfa-ensemble/README.md
@@ -0,0 +1,21 @@
# Ensemble Learning

`linfa-ensemble` provides pure Rust implementations of Ensemble Learning algorithms for the Linfa toolkit.

## The Big Picture

`linfa-ensemble` is a crate in the [`linfa`](https://crates.io/crates/linfa) ecosystem, an effort to create a toolkit for classical Machine Learning implemented in pure Rust, akin to Python's `scikit-learn`.

## Current state

`linfa-ensemble` currently provides an implementation of bootstrap aggregation (bagging) for other classifiers provided in `linfa`.
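
A minimal usage sketch, mirroring the `randomforest_iris` example below (it assumes the `iris` feature of `linfa-datasets` is enabled):

```rust
use linfa::prelude::{Fit, Predict};
use linfa_ensemble::EnsembleLearnerParams;
use linfa_trees::DecisionTree;

fn main() {
    // Bag 100 decision trees, each fitted on a bootstrap sample containing
    // 70% as many points as the full training set (drawn with replacement).
    let mut params = EnsembleLearnerParams::new(DecisionTree::params());
    params.ensemble_size(100).bootstrap_proportion(0.7);

    let (train, test) = linfa_datasets::iris().split_with_ratio(0.8);
    let model = params.fit(&train).unwrap();
    println!("{:?}", model.predict(&test));
}
```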

## Examples

You can find examples in the `examples/` directory. To run bootstrap aggregation over an ensemble of decision trees (a random forest), use:

```bash
$ cargo run --example randomforest_iris --release
```


38 changes: 38 additions & 0 deletions algorithms/linfa-ensemble/examples/randomforest_iris.rs
@@ -0,0 +1,38 @@
use linfa::prelude::{Fit, Predict, ToConfusionMatrix};
use linfa_ensemble::EnsembleLearnerParams;
use linfa_trees::DecisionTree;
use ndarray_rand::rand::SeedableRng;
use rand::rngs::SmallRng;

fn main() {
    // Number of models in the ensemble
    let ensemble_size = 100;
    // Proportion of the training data given to each model
    let bootstrap_proportion = 0.7;

    // Create the ensemble learner
    let mut learner = EnsembleLearnerParams::new(DecisionTree::params());
    learner
        .ensemble_size(ensemble_size)
        .bootstrap_proportion(bootstrap_proportion);

    // Load the dataset
    let mut rng = SmallRng::seed_from_u64(42);
    let (train, test) = linfa_datasets::iris()
        .shuffle(&mut rng)
        .split_with_ratio(0.8);

    // Train the ensemble
    let model = learner.fit(&train).unwrap();

    // Return the highest-ranking predictions
    let final_predictions_ensemble = model.predict(&test);
    println!("Final Predictions: \n{:?}", final_predictions_ensemble);

    let cm = final_predictions_ensemble.confusion_matrix(&test).unwrap();

    println!("{:?}", cm);
    println!(
        "Test accuracy: {} \n with default Decision Tree params, \n Ensemble Size: {},\n Bootstrap Proportion: {}",
        100.0 * cm.accuracy(),
        ensemble_size,
        bootstrap_proportion
    );
}
157 changes: 157 additions & 0 deletions algorithms/linfa-ensemble/src/ensemble.rs
@@ -0,0 +1,157 @@
use linfa::{
    dataset::{AsTargets, AsTargetsMut, FromTargetArrayOwned, Records},
    error::Error,
    traits::*,
    DatasetBase,
};
use ndarray::{Array, Array2, Axis, Dimension};
use rand::rngs::ThreadRng;
use rand::Rng;
use std::{cmp::Eq, collections::HashMap, hash::Hash};

pub struct EnsembleLearner<M> {
    pub models: Vec<M>,
}

impl<M> EnsembleLearner<M> {
    // Returns an iterator yielding each model's prediction for the input
    pub fn generate_predictions<'b, R: Records, T>(&'b self, x: &'b R) -> impl Iterator<Item = T> + 'b
    where
        M: Predict<&'b R, T>,
    {
        self.models.iter().map(move |m| m.predict(x))
    }

    // Consumes the prediction iterator. For each sample, counts the votes cast
    // for each predicted value and returns the (value, count) pairs sorted by
    // descending vote count.
    pub fn aggregate_predictions<Ys: Iterator>(&self, ys: Ys)
        -> impl Iterator<Item = Vec<(Array<<Ys::Item as AsTargets>::Elem, <<Ys::Item as AsTargets>::Ix as Dimension>::Smaller>, usize)>>
    where
        Ys::Item: AsTargets,
        <Ys::Item as AsTargets>::Elem: Copy + Eq + Hash,
    {
        let mut prediction_maps = Vec::new();

        for y in ys {
            let targets = y.as_targets();
            let no_targets = targets.shape()[0];

            for i in 0..no_targets {
                if prediction_maps.len() == i {
                    prediction_maps.push(HashMap::new());
                }
                *prediction_maps[i]
                    .entry(targets.index_axis(Axis(0), i).to_owned())
                    .or_insert(0) += 1;
            }
        }

        prediction_maps.into_iter().map(|xs| {
            let mut xs: Vec<_> = xs.into_iter().collect();
            // Sort by descending vote count
            xs.sort_by(|(_, x), (_, y)| y.cmp(x));
            xs
        })
    }
}

impl<F: Clone, T, M> PredictInplace<Array2<F>, T> for EnsembleLearner<M>
where
    M: PredictInplace<Array2<F>, T>,
    <T as AsTargets>::Elem: Copy + Eq + Hash,
    T: AsTargets + AsTargetsMut<Elem = <T as AsTargets>::Elem>,
{
    fn predict_inplace(&self, x: &Array2<F>, y: &mut T) {
        let mut y_array = y.as_targets_mut();
        assert_eq!(
            x.nrows(),
            y_array.len_of(Axis(0)),
            "The number of data points must match the number of outputs."
        );

        let mut predictions = self.generate_predictions(x);
        let aggregated_predictions = self.aggregate_predictions(&mut predictions);

        // For each sample, write the top-voted prediction into the output row
        for (target, output) in y_array
            .axis_iter_mut(Axis(0))
            .zip(aggregated_predictions.into_iter())
        {
            for (t, o) in target.into_iter().zip(output[0].0.iter()) {
                *t = *o;
            }
        }
Collaborator: Replace with this:

```rust
// (assumes `use ndarray::Zip;` at the top of the file)
// prediction_maps has the same shape as y_array, but its elements are
// maps from predicted value to vote count
let mut prediction_maps = y_array.map(|_| HashMap::new());

for prediction in predictions {
    let p_arr = prediction.as_targets();
    assert_eq!(p_arr.shape(), y_array.shape());
    // Tally each prediction value into the corresponding map
    Zip::from(&mut prediction_maps)
        .and(&p_arr)
        .for_each(|map, &val| *map.entry(val).or_insert(0) += 1);
}

// For each output, keep the value with the highest number of votes
Zip::from(&mut y_array)
    .and(&prediction_maps)
    .for_each(|y, map| *y = *map.iter().max_by_key(|(_, count)| **count).unwrap().0);
```

It picks out the predictions with the highest number of votes without the complexity of aggregate_predictions.

Collaborator: This comment still applies, I believe.
    }

    fn default_target(&self, x: &Array2<F>) -> T {
        self.models[0].default_target(x)
    }
}

pub struct EnsembleLearnerParams<P, R: Rng + Clone> {
    pub ensemble_size: usize,
Collaborator: Why do we need a separate field for ensemble_size? Isn't this value implied by bootstrap_proportion?

Author: ensemble_size gives the number of models in the ensemble, while bootstrap_proportion gives the proportion of the total number of training samples given to each model for training. These should be distinct parameters.

Collaborator: Shouldn't bootstrap_proportion be the same as 1/ensemble_size?

Author: Not necessarily. Each model in the ensemble just needs its own random sample of the complete training data set. There are no constraints on the size of this sample other than it being non-empty, so we let the user tune it as a hyperparameter.

Collaborator: OK, so bootstrap_samples just grabs random sets of samples from the input and yields them infinitely; I thought it divided the input into random subsamples. This makes sense now. (A sketch of this behaviour follows the Fit implementation below.)

Collaborator: Can you add this behaviour to the docs, along with a general description of EnsembleLearner? We should also have top-level docs in src/lib.rs, as in the other crates. (A sketch follows the lib.rs file below.)

    pub bootstrap_proportion: f64,
    pub model_params: P,
    pub rng: R,
}

impl<P> EnsembleLearnerParams<P, ThreadRng> {
    pub fn new(model_params: P) -> EnsembleLearnerParams<P, ThreadRng> {
        Self::new_fixed_rng(model_params, rand::thread_rng())
    }
}

impl<P, R: Rng + Clone> EnsembleLearnerParams<P, R> {
    pub fn new_fixed_rng(model_params: P, rng: R) -> EnsembleLearnerParams<P, R> {
        EnsembleLearnerParams {
            ensemble_size: 1,
            bootstrap_proportion: 1.0,
            model_params,
            rng,
        }
    }

    pub fn ensemble_size(&mut self, size: usize) -> &mut EnsembleLearnerParams<P, R> {
        assert!(
            size > 0,
            "ensemble_size cannot be less than 1. Ensembles must consist of at least one model."
        );
        self.ensemble_size = size;
        self
    }

    pub fn bootstrap_proportion(&mut self, proportion: f64) -> &mut EnsembleLearnerParams<P, R> {
        assert!(
            proportion > 0.0,
            "bootstrap_proportion must be greater than 0. Must provide some data to each model."
        );
        self.bootstrap_proportion = proportion;
        self
    }
}

impl<D, T, P: Fit<Array2<D>, T::Owned, Error>, R: Rng + Clone> Fit<Array2<D>, T, Error>
    for EnsembleLearnerParams<P, R>
where
    D: Clone,
    T: FromTargetArrayOwned,
    T::Elem: Copy + Eq + Hash,
    T::Owned: AsTargets,
{
    type Object = EnsembleLearner<P::Object>;

    fn fit(&self, dataset: &DatasetBase<Array2<D>, T>) -> Result<Self::Object, Error> {
        let mut models = Vec::new();
        let mut rng = self.rng.clone();

        // Size of each bootstrap sample: a proportion of the full training set
        let dataset_size =
            ((dataset.records.shape()[0] as f64) * self.bootstrap_proportion).ceil() as usize;

        // bootstrap_samples yields sampled datasets indefinitely; we stop once
        // ensemble_size models have been trained
        let iter = dataset.bootstrap_samples(dataset_size, &mut rng);

        for train in iter {
            let model = self.model_params.fit(&train)?;
            models.push(model);

            if models.len() == self.ensemble_size {
                break;
            }
        }

        Ok(EnsembleLearner { models })
    }
}
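
To make the bootstrap_samples behaviour discussed above concrete, here is a minimal, self-contained sketch (the toy dataset and the proportion/size values are illustrative, not part of the PR):

```rust
use linfa::prelude::*;
use linfa::Dataset;
use ndarray::array;

fn main() {
    // Toy dataset: 4 samples, 2 features, binary targets.
    let dataset = Dataset::new(
        array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]],
        array![0usize, 0, 1, 1],
    );
    let mut rng = rand::thread_rng();

    // With bootstrap_proportion = 0.5, each bootstrap sample holds
    // ceil(4 * 0.5) = 2 points, drawn with replacement.
    let sample_size = (4.0_f64 * 0.5).ceil() as usize;

    // bootstrap_samples yields sampled datasets indefinitely; taking
    // ensemble_size of them (here 3) is what terminates the draw.
    let samples: Vec<_> = dataset
        .bootstrap_samples(sample_size, &mut rng)
        .take(3)
        .collect();

    assert_eq!(samples.len(), 3);
    assert!(samples.iter().all(|s| s.nsamples() == 2));
}
```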
3 changes: 3 additions & 0 deletions algorithms/linfa-ensemble/src/lib.rs
@@ -0,0 +1,3 @@
mod ensemble;

pub use ensemble::*;
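
On the docs request above: a sketch of what the top-level docs in src/lib.rs could look like (the wording is a suggestion, not part of the PR):

```rust
//! `linfa-ensemble` provides ensemble learning methods for the Linfa toolkit.
//!
//! An `EnsembleLearner` trains `ensemble_size` copies of an inner model, each
//! on its own bootstrap sample of the training data. Each sample contains
//! `bootstrap_proportion` times the number of points in the full training set,
//! drawn with replacement, so the proportion is independent of the ensemble
//! size. At prediction time, each input is assigned the value predicted by
//! the largest number of models (a majority vote).

mod ensemble;

pub use ensemble::*;
```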
10 changes: 5 additions & 5 deletions src/dataset/impl_dataset.rs
@@ -2,7 +2,7 @@ use super::{
     super::traits::{Predict, PredictInplace},
     iter::{ChunksIter, DatasetIter, Iter},
     AsSingleTargets, AsTargets, AsTargetsMut, CountedTargets, Dataset, DatasetBase, DatasetView,
-    Float, FromTargetArray, Label, Labels, Records, Result, TargetDim,
+    Float, FromTargetArray, FromTargetArrayOwned, Label, Labels, Records, Result, TargetDim,
 };
 use crate::traits::Fit;
 use ndarray::{concatenate, prelude::*, Data, DataMut, Dimension};
@@ -418,7 +418,7 @@ where
 impl<'b, F: Clone, E: Copy + 'b, D, T> DatasetBase<ArrayBase<D, Ix2>, T>
 where
     D: Data<Elem = F>,
-    T: FromTargetArray<'b, Elem = E>,
+    T: FromTargetArrayOwned<Elem = E>,
     T::Owned: AsTargets,
 {
     /// Apply bootstrapping for samples and features
@@ -441,7 +441,7 @@
         &'b self,
         sample_feature_size: (usize, usize),
         rng: &'b mut R,
-    ) -> impl Iterator<Item = DatasetBase<Array2<F>, <T as FromTargetArray<'b>>::Owned>> + 'b {
+    ) -> impl Iterator<Item = DatasetBase<Array2<F>, T::Owned>> + 'b {
         std::iter::repeat(()).map(move |_| {
             // sample with replacement
             let indices = (0..sample_feature_size.0)
@@ -481,7 +481,7 @@ where
         &'b self,
         num_samples: usize,
         rng: &'b mut R,
-    ) -> impl Iterator<Item = DatasetBase<Array2<F>, <T as FromTargetArray<'b>>::Owned>> + 'b {
+    ) -> impl Iterator<Item = DatasetBase<Array2<F>, T::Owned>> + 'b {
         std::iter::repeat(()).map(move |_| {
             // sample with replacement
             let indices = (0..num_samples)
@@ -515,7 +515,7 @@ where
         &'b self,
         num_features: usize,
         rng: &'b mut R,
-    ) -> impl Iterator<Item = DatasetBase<Array2<F>, <T as FromTargetArray<'b>>::Owned>> + 'b {
+    ) -> impl Iterator<Item = DatasetBase<Array2<F>, T::Owned>> + 'b {
         std::iter::repeat(()).map(move |_| {
             let targets = T::new_targets(self.as_targets().to_owned());
 
26 changes: 18 additions & 8 deletions src/dataset/impl_targets.rs
@@ -2,7 +2,7 @@ use std::collections::HashMap;
 
 use super::{
     AsMultiTargets, AsMultiTargetsMut, AsProbabilities, AsSingleTargets, AsSingleTargetsMut,
-    AsTargets, AsTargetsMut, CountedTargets, DatasetBase, FromTargetArray, Label, Labels, Pr,
+    AsTargets, AsTargetsMut, CountedTargets, DatasetBase, FromTargetArray, FromTargetArrayOwned, Label, Labels, Pr,
     TargetDim,
 };
 use ndarray::{
@@ -25,21 +25,25 @@ impl<'a, L, S: Data<Elem = L>, I: TargetDim> AsTargets for ArrayBase<S, I> {
 impl<T: AsTargets<Ix = Ix1>> AsSingleTargets for T {}
 impl<T: AsTargets<Ix = Ix2>> AsMultiTargets for T {}
 
-impl<'a, L: Clone + 'a, S: Data<Elem = L>, I: TargetDim> FromTargetArray<'a> for ArrayBase<S, I> {
+impl<'a, L: Clone + 'a, S: Data<Elem = L>, I: TargetDim> FromTargetArrayOwned for ArrayBase<S, I> {
     type Owned = ArrayBase<OwnedRepr<L>, I>;
-    type View = ArrayBase<ViewRepr<&'a L>, I>;
 
     /// Returns an owned representation of the target array
     fn new_targets(targets: Array<L, I>) -> Self::Owned {
         targets
     }
+}
+
+impl<'a, L: Clone + 'a, S: Data<Elem = L>, I: TargetDim> FromTargetArray<'a> for ArrayBase<S, I> {
+    type View = ArrayBase<ViewRepr<&'a L>, I>;
 
     /// Returns a reference to the target array
     fn new_targets_view(targets: ArrayView<'a, L, I>) -> Self::View {
         targets
     }
 }
 
+
 impl<L, S: DataMut<Elem = L>, I: TargetDim> AsTargetsMut for ArrayBase<S, I> {
     type Elem = L;
     type Ix = I;
@@ -79,23 +83,29 @@ impl<L: Label, T: AsTargetsMut<Elem = L>> AsTargetsMut for CountedTargets<L, T>
     }
 }
 
-impl<'a, L: Label + 'a, T> FromTargetArray<'a> for CountedTargets<L, T>
+impl<L: Label, T> FromTargetArrayOwned for CountedTargets<L, T>
 where
-    T: FromTargetArray<'a, Elem = L>,
+    T: FromTargetArrayOwned<Elem = L>,
     T::Owned: Labels<Elem = L>,
-    T::View: Labels<Elem = L>,
 {
     type Owned = CountedTargets<L, T::Owned>;
-    type View = CountedTargets<L, T::View>;
 
     fn new_targets(targets: Array<L, T::Ix>) -> Self::Owned {
         let targets = T::new_targets(targets);
 
         CountedTargets {
             labels: targets.label_count(),
             targets,
         }
     }
+}
+
+impl<'a, L: Label + 'a, T> FromTargetArray<'a> for CountedTargets<L, T>
+where
+    T: FromTargetArray<'a, Elem = L>,
+    T::View: Labels<Elem = L>,
+{
+    type View = CountedTargets<L, T::View>;
 
     fn new_targets_view(targets: ArrayView<'a, L, T::Ix>) -> Self::View {
         let targets = T::new_targets_view(targets);