Bootstrap Aggregation #229

Open · wants to merge 4 commits into master
Changes from 2 commits
36 changes: 36 additions & 0 deletions algorithms/linfa-ensemble/Cargo.toml
@@ -0,0 +1,36 @@
[package]
name = "linfa-ensemble"
version = "0.6.0"
edition = "2018"
authors = ["James Knight <[email protected]>", "James Kay <[email protected]>"]
description = "A general method for creating ensemble classifiers"
license = "MIT/Apache-2.0"

repository = "https://github.com/rust-ml/linfa"
readme = "README.md"

keywords = ["machine-learning", "linfa", "ensemble"]
categories = ["algorithms", "mathematics", "science"]

[features]
default = []
serde = ["serde_crate", "ndarray/serde"]

[dependencies.serde_crate]
package = "serde"
optional = true
version = "1.0"
default-features = false
features = ["std", "derive"]

[dependencies]
ndarray = { version = "0.15", features = ["rayon", "approx"] }
ndarray-rand = "0.14"
rand = "0.8.5"

linfa = { version = "0.6.0", path = "../.." }
linfa-trees = { version = "0.6.0", path = "../linfa-trees" }

[dev-dependencies]
linfa-datasets = { version = "0.6.0", path = "../../datasets/", features = ["iris"] }

21 changes: 21 additions & 0 deletions algorithms/linfa-ensemble/README.md
@@ -0,0 +1,21 @@
# Ensemble Learning

`linfa-ensemble` provides pure Rust implementations of Ensemble Learning algorithms for the Linfa toolkit.

## The Big Picture

`linfa-ensemble` is a crate in the [`linfa`](https://crates.io/crates/linfa) ecosystem, an effort to create a toolkit for classical Machine Learning implemented in pure Rust, akin to Python's `scikit-learn`.

## Current state

`linfa-ensemble` currently provides an implementation of bootstrap aggregation (bagging) for other classifiers provided in `linfa`.
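
A minimal usage sketch, mirroring the `randomforest_iris` example below (it assumes the `iris` feature of `linfa-datasets` is enabled):

```rust
use linfa::prelude::{Fit, Predict};
use linfa_ensemble::EnsembleLearnerParams;
use linfa_trees::DecisionTree;

fn main() {
    // Bag 100 decision trees, each fitted on a bootstrap sample containing
    // 70% as many points as the full training set (drawn with replacement).
    let mut params = EnsembleLearnerParams::new(DecisionTree::params());
    params.ensemble_size(100).bootstrap_proportion(0.7);

    let (train, test) = linfa_datasets::iris().split_with_ratio(0.8);
    let model = params.fit(&train).unwrap();
    println!("{:?}", model.predict(&test));
}
```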

## Examples

You can find examples in the `examples/` directory. To run bootstrap aggregation over an ensemble of decision trees (a random forest), use:

```bash
$ cargo run --example randomforest_iris --release
```


38 changes: 38 additions & 0 deletions algorithms/linfa-ensemble/examples/randomforest_iris.rs
@@ -0,0 +1,38 @@
use linfa::prelude::{Fit, Predict, ToConfusionMatrix};
use linfa_ensemble::EnsembleLearnerParams;
use linfa_trees::DecisionTree;
use ndarray_rand::rand::SeedableRng;
use rand::rngs::SmallRng;

fn main() {
    // Number of models in the ensemble
    let ensemble_size = 100;
    // Proportion of the training data given to each model
    let bootstrap_proportion = 0.7;

    // Create the ensemble learner
    let mut learner = EnsembleLearnerParams::new(DecisionTree::params());
    learner
        .ensemble_size(ensemble_size)
        .bootstrap_proportion(bootstrap_proportion);

    // Load the dataset
    let mut rng = SmallRng::seed_from_u64(42);
    let (train, test) = linfa_datasets::iris()
        .shuffle(&mut rng)
        .split_with_ratio(0.8);

    // Train the ensemble
    let model = learner.fit(&train).unwrap();

    // Return the highest-ranking predictions
    let final_predictions_ensemble = model.predict(&test);
    println!("Final Predictions: \n{:?}", final_predictions_ensemble);

    let cm = final_predictions_ensemble.confusion_matrix(&test).unwrap();

    println!("{:?}", cm);
    println!(
        "Test accuracy: {} \n with default Decision Tree params, \n Ensemble Size: {},\n Bootstrap Proportion: {}",
        100.0 * cm.accuracy(),
        ensemble_size,
        bootstrap_proportion
    );
}
157 changes: 157 additions & 0 deletions algorithms/linfa-ensemble/src/ensemble.rs
@@ -0,0 +1,157 @@
use linfa::{
    dataset::{AsTargets, AsTargetsMut, FromTargetArrayOwned, Records},
    error::Error,
    traits::*,
    DatasetBase,
};
use ndarray::{Array, Array2, Axis, Dimension};
use rand::rngs::ThreadRng;
use rand::Rng;
use std::{cmp::Eq, collections::HashMap, hash::Hash};

pub struct EnsembleLearner<M> {
    pub models: Vec<M>,
}

impl<M> EnsembleLearner<M> {
    // Returns an iterator yielding each model's prediction for the input
    pub fn generate_predictions<'b, R: Records, T>(&'b self, x: &'b R) -> impl Iterator<Item = T> + 'b
    where
        M: Predict<&'b R, T>,
    {
        self.models.iter().map(move |m| m.predict(x))
    }

    // Consumes the prediction iterator. For each sample, counts the votes cast
    // for each predicted value and returns the (value, count) pairs sorted by
    // descending vote count.
    pub fn aggregate_predictions<Ys: Iterator>(&self, ys: Ys)
        -> impl Iterator<Item = Vec<(Array<<Ys::Item as AsTargets>::Elem, <<Ys::Item as AsTargets>::Ix as Dimension>::Smaller>, usize)>>
    where
        Ys::Item: AsTargets,
        <Ys::Item as AsTargets>::Elem: Copy + Eq + Hash,
    {
        let mut prediction_maps = Vec::new();

        for y in ys {
            let targets = y.as_targets();
            let no_targets = targets.shape()[0];

            for i in 0..no_targets {
                if prediction_maps.len() == i {
                    prediction_maps.push(HashMap::new());
                }
                *prediction_maps[i]
                    .entry(targets.index_axis(Axis(0), i).to_owned())
                    .or_insert(0) += 1;
            }
        }

        prediction_maps.into_iter().map(|xs| {
            let mut xs: Vec<_> = xs.into_iter().collect();
            // Sort by descending vote count
            xs.sort_by(|(_, x), (_, y)| y.cmp(x));
            xs
        })
    }
}

impl<F: Clone, T, M> PredictInplace<Array2<F>, T> for EnsembleLearner<M>
where
    M: PredictInplace<Array2<F>, T>,
    <T as AsTargets>::Elem: Copy + Eq + Hash,
    T: AsTargets + AsTargetsMut<Elem = <T as AsTargets>::Elem>,
{
    fn predict_inplace(&self, x: &Array2<F>, y: &mut T) {
        let mut y_array = y.as_targets_mut();
        assert_eq!(
            x.nrows(),
            y_array.len_of(Axis(0)),
            "The number of data points must match the number of outputs."
        );

        let mut predictions = self.generate_predictions(x);
        let aggregated_predictions = self.aggregate_predictions(&mut predictions);

        // For each sample, write the top-voted prediction into the output row
        for (target, output) in y_array
            .axis_iter_mut(Axis(0))
            .zip(aggregated_predictions.into_iter())
        {
            for (t, o) in target.into_iter().zip(output[0].0.iter()) {
                *t = *o;
            }
        }
Collaborator: Replace with this:

```rust
// (assumes `use ndarray::Zip;` at the top of the file)
// prediction_maps has the same shape as y_array, but its elements are
// maps from predicted value to vote count
let mut prediction_maps = y_array.map(|_| HashMap::new());

for prediction in predictions {
    let p_arr = prediction.as_targets();
    assert_eq!(p_arr.shape(), y_array.shape());
    // Tally each prediction value into the corresponding map
    Zip::from(&mut prediction_maps)
        .and(&p_arr)
        .for_each(|map, &val| *map.entry(val).or_insert(0) += 1);
}

// For each output, keep the value with the highest number of votes
Zip::from(&mut y_array)
    .and(&prediction_maps)
    .for_each(|y, map| *y = *map.iter().max_by_key(|(_, count)| **count).unwrap().0);
```

It picks out the predictions with the highest number of votes without the complexity of aggregate_predictions.

Collaborator: This comment still applies, I believe.
    }

    fn default_target(&self, x: &Array2<F>) -> T {
        self.models[0].default_target(x)
    }
}

pub struct EnsembleLearnerParams<P, R: Rng + Clone> {
    pub ensemble_size: usize,
Collaborator: Why do we need a separate field for ensemble_size? Isn't this value implied by bootstrap_proportion?

Author: ensemble_size gives the number of models in the ensemble, while bootstrap_proportion gives the proportion of the total number of training samples given to each model for training. These should be distinct parameters.

Collaborator: Shouldn't bootstrap_proportion be the same as 1/ensemble_size?

Author: Not necessarily. Each model in the ensemble just needs its own random sample of the complete training data set. There are no constraints on the size of this sample other than it being non-empty, so we let the user tune it as a hyperparameter.

Collaborator: OK, so bootstrap_samples just grabs random sets of samples from the input and yields them infinitely; I thought it divided the input into random subsamples. This makes sense now. (A sketch of this behaviour follows the Fit implementation below.)

Collaborator: Can you add this behaviour to the docs, along with a general description of EnsembleLearner? We should also have top-level docs in src/lib.rs, as in the other crates. (A sketch follows the lib.rs file below.)

    pub bootstrap_proportion: f64,
    pub model_params: P,
    pub rng: R,
}

impl<P> EnsembleLearnerParams<P, ThreadRng> {
    pub fn new(model_params: P) -> EnsembleLearnerParams<P, ThreadRng> {
        Self::new_fixed_rng(model_params, rand::thread_rng())
    }
}

impl<P, R: Rng + Clone> EnsembleLearnerParams<P, R> {
    pub fn new_fixed_rng(model_params: P, rng: R) -> EnsembleLearnerParams<P, R> {
        EnsembleLearnerParams {
            ensemble_size: 1,
            bootstrap_proportion: 1.0,
            model_params,
            rng,
        }
    }

    pub fn ensemble_size(&mut self, size: usize) -> &mut EnsembleLearnerParams<P, R> {
        assert!(
            size > 0,
            "ensemble_size cannot be less than 1. Ensembles must consist of at least one model."
        );
        self.ensemble_size = size;
        self
    }

    pub fn bootstrap_proportion(&mut self, proportion: f64) -> &mut EnsembleLearnerParams<P, R> {
        assert!(
            proportion > 0.0,
            "bootstrap_proportion must be greater than 0. Must provide some data to each model."
        );
        self.bootstrap_proportion = proportion;
        self
    }
}

impl<D, T, P: Fit<Array2<D>, T::Owned, Error>, R: Rng + Clone> Fit<Array2<D>, T, Error>
    for EnsembleLearnerParams<P, R>
where
    D: Clone,
    T: FromTargetArrayOwned,
    T::Elem: Copy + Eq + Hash,
    T::Owned: AsTargets,
{
    type Object = EnsembleLearner<P::Object>;

    fn fit(&self, dataset: &DatasetBase<Array2<D>, T>) -> Result<Self::Object, Error> {
        let mut models = Vec::new();
        let mut rng = self.rng.clone();

        // Size of each bootstrap sample: a proportion of the full training set
        let dataset_size =
            ((dataset.records.shape()[0] as f64) * self.bootstrap_proportion).ceil() as usize;

        // bootstrap_samples yields sampled datasets indefinitely; we stop once
        // ensemble_size models have been trained
        let iter = dataset.bootstrap_samples(dataset_size, &mut rng);

        for train in iter {
            let model = self.model_params.fit(&train)?;
            models.push(model);

            if models.len() == self.ensemble_size {
                break;
            }
        }

        Ok(EnsembleLearner { models })
    }
}
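
To make the bootstrap_samples behaviour discussed above concrete, here is a minimal, self-contained sketch (the toy dataset and the proportion/size values are illustrative, not part of the PR):

```rust
use linfa::prelude::*;
use linfa::Dataset;
use ndarray::array;

fn main() {
    // Toy dataset: 4 samples, 2 features, binary targets.
    let dataset = Dataset::new(
        array![[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]],
        array![0usize, 0, 1, 1],
    );
    let mut rng = rand::thread_rng();

    // With bootstrap_proportion = 0.5, each bootstrap sample holds
    // ceil(4 * 0.5) = 2 points, drawn with replacement.
    let sample_size = (4.0_f64 * 0.5).ceil() as usize;

    // bootstrap_samples yields sampled datasets indefinitely; taking
    // ensemble_size of them (here 3) is what terminates the draw.
    let samples: Vec<_> = dataset
        .bootstrap_samples(sample_size, &mut rng)
        .take(3)
        .collect();

    assert_eq!(samples.len(), 3);
    assert!(samples.iter().all(|s| s.nsamples() == 2));
}
```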
3 changes: 3 additions & 0 deletions algorithms/linfa-ensemble/src/lib.rs
@@ -0,0 +1,3 @@
mod ensemble;

pub use ensemble::*;
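
On the docs request above: a sketch of what the top-level docs in src/lib.rs could look like (the wording is a suggestion, not part of the PR):

```rust
//! `linfa-ensemble` provides ensemble learning methods for the Linfa toolkit.
//!
//! An `EnsembleLearner` trains `ensemble_size` copies of an inner model, each
//! on its own bootstrap sample of the training data. Each sample contains
//! `bootstrap_proportion` times the number of points in the full training set,
//! drawn with replacement, so the proportion is independent of the ensemble
//! size. At prediction time, each input is assigned the value predicted by
//! the largest number of models (a majority vote).

mod ensemble;

pub use ensemble::*;
```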
10 changes: 5 additions & 5 deletions src/dataset/impl_dataset.rs
@@ -2,7 +2,7 @@ use super::{
     super::traits::{Predict, PredictInplace},
     iter::{ChunksIter, DatasetIter, Iter},
     AsSingleTargets, AsTargets, AsTargetsMut, CountedTargets, Dataset, DatasetBase, DatasetView,
-    Float, FromTargetArray, Label, Labels, Records, Result, TargetDim,
+    Float, FromTargetArray, FromTargetArrayOwned, Label, Labels, Records, Result, TargetDim,
 };
 use crate::traits::Fit;
 use ndarray::{concatenate, prelude::*, Data, DataMut, Dimension};
@@ -418,7 +418,7 @@ where
 impl<'b, F: Clone, E: Copy + 'b, D, T> DatasetBase<ArrayBase<D, Ix2>, T>
 where
     D: Data<Elem = F>,
-    T: FromTargetArray<'b, Elem = E>,
+    T: FromTargetArrayOwned<Elem = E>,
     T::Owned: AsTargets,
 {
     /// Apply bootstrapping for samples and features
@@ -441,7 +441,7 @@
         &'b self,
         sample_feature_size: (usize, usize),
         rng: &'b mut R,
-    ) -> impl Iterator<Item = DatasetBase<Array2<F>, <T as FromTargetArray<'b>>::Owned>> + 'b {
+    ) -> impl Iterator<Item = DatasetBase<Array2<F>, T::Owned>> + 'b {
         std::iter::repeat(()).map(move |_| {
             // sample with replacement
             let indices = (0..sample_feature_size.0)
@@ -481,7 +481,7 @@ where
         &'b self,
         num_samples: usize,
         rng: &'b mut R,
-    ) -> impl Iterator<Item = DatasetBase<Array2<F>, <T as FromTargetArray<'b>>::Owned>> + 'b {
+    ) -> impl Iterator<Item = DatasetBase<Array2<F>, T::Owned>> + 'b {
         std::iter::repeat(()).map(move |_| {
             // sample with replacement
             let indices = (0..num_samples)
@@ -515,7 +515,7 @@ where
         &'b self,
         num_features: usize,
         rng: &'b mut R,
-    ) -> impl Iterator<Item = DatasetBase<Array2<F>, <T as FromTargetArray<'b>>::Owned>> + 'b {
+    ) -> impl Iterator<Item = DatasetBase<Array2<F>, T::Owned>> + 'b {
         std::iter::repeat(()).map(move |_| {
             let targets = T::new_targets(self.as_targets().to_owned());
 
26 changes: 18 additions & 8 deletions src/dataset/impl_targets.rs
@@ -2,7 +2,7 @@ use std::collections::HashMap;
 
 use super::{
     AsMultiTargets, AsMultiTargetsMut, AsProbabilities, AsSingleTargets, AsSingleTargetsMut,
-    AsTargets, AsTargetsMut, CountedTargets, DatasetBase, FromTargetArray, Label, Labels, Pr,
+    AsTargets, AsTargetsMut, CountedTargets, DatasetBase, FromTargetArray, FromTargetArrayOwned, Label, Labels, Pr,
     TargetDim,
 };
 use ndarray::{
@@ -25,21 +25,25 @@ impl<'a, L, S: Data<Elem = L>, I: TargetDim> AsTargets for ArrayBase<S, I> {
 impl<T: AsTargets<Ix = Ix1>> AsSingleTargets for T {}
 impl<T: AsTargets<Ix = Ix2>> AsMultiTargets for T {}
 
-impl<'a, L: Clone + 'a, S: Data<Elem = L>, I: TargetDim> FromTargetArray<'a> for ArrayBase<S, I> {
+impl<'a, L: Clone + 'a, S: Data<Elem = L>, I: TargetDim> FromTargetArrayOwned for ArrayBase<S, I> {
     type Owned = ArrayBase<OwnedRepr<L>, I>;
-    type View = ArrayBase<ViewRepr<&'a L>, I>;
 
     /// Returns an owned representation of the target array
     fn new_targets(targets: Array<L, I>) -> Self::Owned {
         targets
     }
+}
+
+impl<'a, L: Clone + 'a, S: Data<Elem = L>, I: TargetDim> FromTargetArray<'a> for ArrayBase<S, I> {
+    type View = ArrayBase<ViewRepr<&'a L>, I>;
 
     /// Returns a reference to the target array
     fn new_targets_view(targets: ArrayView<'a, L, I>) -> Self::View {
         targets
     }
 }
 
+
 impl<L, S: DataMut<Elem = L>, I: TargetDim> AsTargetsMut for ArrayBase<S, I> {
     type Elem = L;
     type Ix = I;
@@ -79,23 +83,29 @@ impl<L: Label, T: AsTargetsMut<Elem = L>> AsTargetsMut for CountedTargets<L, T>
     }
 }
 
-impl<'a, L: Label + 'a, T> FromTargetArray<'a> for CountedTargets<L, T>
+impl<L: Label, T> FromTargetArrayOwned for CountedTargets<L, T>
 where
-    T: FromTargetArray<'a, Elem = L>,
+    T: FromTargetArrayOwned<Elem = L>,
     T::Owned: Labels<Elem = L>,
-    T::View: Labels<Elem = L>,
 {
     type Owned = CountedTargets<L, T::Owned>;
-    type View = CountedTargets<L, T::View>;
 
     fn new_targets(targets: Array<L, T::Ix>) -> Self::Owned {
         let targets = T::new_targets(targets);
 
         CountedTargets {
             labels: targets.label_count(),
             targets,
         }
     }
+}
+
+impl<'a, L: Label + 'a, T> FromTargetArray<'a> for CountedTargets<L, T>
+where
+    T: FromTargetArray<'a, Elem = L>,
+    T::View: Labels<Elem = L>,
+{
+    type View = CountedTargets<L, T::View>;
 
     fn new_targets_view(targets: ArrayView<'a, L, T::Ix>) -> Self::View {
         let targets = T::new_targets_view(targets);