diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..05d134e --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,11 @@ +# Changelog +## 0.1.1 +### Added +The ecDNA distribution is a vector of `u16` +The ecDNA distribution is a vector where each entries is a cell and the content is the number of ecDNA copies the cell has. +The ecDNA distribution is saved as a JSON, i.e. as a mapping where the keys are the ecDNA copies and the entries are the number of cells for each key (histogram). +Implement the [state pattern](https://doc.rust-lang.org/book/ch17-03-oo-design-patterns.html) to pass the ecDNA distribution by consuming the `Run` instead of deep copy. +Introduce subsampling: run the simulations for a number of cells, then run ABC with 100 subsamples for each run. In the end, the number of tested simulations will be 100 * the number of runs. + +### Fixed +Fix bug in the computation of the ks distance. diff --git a/Cargo.lock b/Cargo.lock index 9e26ee1..f6de85d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9,19 +9,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] -name = "anyhow" -version = "1.0.47" +name = "aho-corasick" +version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d9ff5d688f1c13395289f67db01d4826b46dd694e7580accdc3e8430f2d98e" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +dependencies = [ + "memchr", +] [[package]] -name = "approx" -version = "0.4.0" +name = "anyhow" +version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f2a05fd1bd10b2527e20a2cd32d8873d115b8b39fe219ee25f42a8aca6ba278" -dependencies = [ - "num-traits", -] +checksum = "38d9ff5d688f1c13395289f67db01d4826b46dd694e7580accdc3e8430f2d98e" [[package]] name = "atty" @@ -46,6 +46,18 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + [[package]] name = "cfg-if" version = "1.0.0" @@ -147,6 +159,28 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa 0.4.8", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + [[package]] name = "darling" version = "0.12.4" @@ -226,17 +260,22 @@ name = "ecdna-evo" version = "0.1.0" dependencies = [ "anyhow", - "approx", "chrono", + "csv", "derive_builder", "enum_dispatch", + "fake", "flate2", "indicatif", - "kolmogorov_smirnov", - "rand 0.8.4", + "quickcheck", + "quickcheck_macros", + "rand", "rand_distr", "rayon", + "serde", + "serde_json", "tar", + "test-case", ] [[package]] @@ -263,6 +302,25 @@ dependencies = [ "syn", ] +[[package]] +name = "env_logger" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a19187fea3ac7e84da7dacf48de0c45d63c6a76f9490dae389aead16c243fce3" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "fake" +version = "2.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21a8531dd3a64fd1cfbe92fad4160bc2060489c6195fe847e045e5788f710bae" +dependencies = [ + "rand", +] + [[package]] name = "filetime" version = "0.2.15" @@ -293,12 +351,6 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" -[[package]] -name = "fuchsia-cprng" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" - [[package]] name = "getrandom" version = "0.2.3" @@ -355,13 +407,16 @@ dependencies = [ ] [[package]] -name = "kolmogorov_smirnov" -version = "1.1.0" +name = "itoa" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96ceadb630ffd3ceb5c4d739232d8a9d51d23c383b14f3b64ffc67368cbc59b5" -dependencies = [ - "rand 0.3.23", -] +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" + +[[package]] +name = "itoa" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" [[package]] name = "lazy_static" @@ -381,6 +436,15 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7d73b3f436185384286bd8098d17ec07c9a7d2388a6599f824d8502b529702a" +[[package]] +name = "log" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +dependencies = [ + "cfg-if", +] + [[package]] name = "memchr" version = "2.4.1" @@ -473,35 +537,34 @@ dependencies = [ ] [[package]] -name = "quote" -version = "1.0.10" +name = "quickcheck" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38bc8cc6a5f2e3655e0899c1b848643b2562f853f114bfec7be120678e3ace05" +checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6" dependencies = [ - "proc-macro2", + "env_logger", + "log", + "rand", ] [[package]] -name = "rand" -version = "0.3.23" +name = "quickcheck_macros" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64ac302d8f83c0c1974bf758f6b041c6c8ada916fbb44a609158ca8b064cc76c" +checksum = "b22a693222d716a9587786f37ac3f6b4faedb5b80c23914e7303ff5a1d8016e9" dependencies = [ - "libc", - "rand 0.4.6", + "proc-macro2", + "quote", + "syn", ] [[package]] -name = "rand" -version = "0.4.6" +name = "quote" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293" +checksum = "38bc8cc6a5f2e3655e0899c1b848643b2562f853f114bfec7be120678e3ace05" dependencies = [ - "fuchsia-cprng", - "libc", - "rand_core 0.3.1", - "rdrand", - "winapi", + "proc-macro2", ] [[package]] @@ -512,7 +575,7 @@ checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" dependencies = [ "libc", "rand_chacha", - "rand_core 0.6.3", + "rand_core", "rand_hc", ] @@ -523,24 +586,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core 0.6.3", -] - -[[package]] -name = "rand_core" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" -dependencies = [ - "rand_core 0.4.2", + "rand_core", ] -[[package]] -name = "rand_core" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" - [[package]] name = "rand_core" version = "0.6.3" @@ -557,7 +605,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "964d548f8e7d12e102ef183a0de7e98180c9f8729f555897a857b96e48122d2f" dependencies = [ "num-traits", - "rand 0.8.4", + "rand", ] [[package]] @@ -566,7 +614,7 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" dependencies = [ - "rand_core 0.6.3", + "rand_core", ] [[package]] @@ -594,15 +642,6 @@ dependencies = [ "num_cpus", ] -[[package]] -name = "rdrand" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" -dependencies = [ - "rand_core 0.3.1", -] - [[package]] name = "redox_syscall" version = "0.2.10" @@ -618,21 +657,66 @@ version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" dependencies = [ + "aho-corasick", + "memchr", "regex-syntax", ] +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" + [[package]] name = "regex-syntax" version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" +[[package]] +name = "ryu" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f" + [[package]] name = "scopeguard" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "serde" +version = "1.0.136" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce31e24b01e1e524df96f1c2fdd054405f8d7376249a5110886fb4b658484789" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.136" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08597e7152fcd306f41838ed3e37be9eaeed2b61c42e2117266a554fab4662f9" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d23c1ba4cf0efd44be32017709280b32d1cea5c3f1275c3b6d9e8bc54f758085" +dependencies = [ + "itoa 1.0.1", + "ryu", + "serde", +] + [[package]] name = "strsim" version = "0.10.0" @@ -680,6 +764,19 @@ dependencies = [ "winapi", ] +[[package]] +name = "test-case" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e5f048404b43e8ae66dce036163515b6057024cf58c6377be501f250bd3c6a" +dependencies = [ + "cfg-if", + "proc-macro2", + "quote", + "syn", + "version_check", +] + [[package]] name = "textwrap" version = "0.14.2" @@ -702,6 +799,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + [[package]] name = "wasi" version = "0.10.2+wasi-snapshot-preview1" diff --git a/ecdna-evo/Cargo.toml b/ecdna-evo/Cargo.toml index 452c884..2d63410 100644 --- a/ecdna-evo/Cargo.toml +++ b/ecdna-evo/Cargo.toml @@ -10,13 +10,21 @@ keywords = ["bayesian-inference", "gillespie", "stochastic-simulation", "ecdna"] [dependencies] rand = { version = "0.8.0", features = ["small_rng"]} rand_distr = "0.4.0" -approx = "0.4.0" indicatif = {version = "*", features = ["rayon"]} chrono = "0.4" rayon = "1.5" anyhow = "1.0" flate2 = "1.0" tar = "0.4" -kolmogorov_smirnov = "1.1.0" enum_dispatch = "0.3.7" derive_builder = "0.10.2" +serde_json = "1.0" +serde = { version = "1.0.136", features = ["derive"]} +csv = "1.1" + +[dev-dependencies] +fake = "2.4" +rand = "0.8" +quickcheck = "1" +quickcheck_macros = "1" +test-case = "1.2.3" diff --git a/ecdna-evo/src/abc.rs b/ecdna-evo/src/abc.rs index a63dc25..ae3be11 100644 --- a/ecdna-evo/src/abc.rs +++ b/ecdna-evo/src/abc.rs @@ -1,24 +1,33 @@ //! Perform the approximate Bayesian computation to infer the most probable //! fitness and death coefficients from the data. -use crate::data::{Differs, EcDNA, Entropy, Frequency, FromFile, Mean}; -use crate::run::Run; -use crate::simulation::write2file; -use std::collections::HashMap; +use crate::data::{Distance, EcDNADistribution, Entropy, Frequency, Mean}; +use crate::run::{Ended, Run}; +use crate::NbIndividuals; +use anyhow::Context; +use csv; +use rand::rngs::SmallRng; +use rand::SeedableRng; +use serde::Serialize; +use std::convert::TryFrom; +use std::fs; use std::path::{Path, PathBuf}; /// A collection of `Measurement`s representing the data available for a cancer /// patient. +#[derive(Debug)] pub struct Patient { - ecdna: Option, + ecdna: Option, mean: Option, frequency: Option, entropy: Option, + pub name: String, } impl Patient { - pub fn load(paths: PatientPaths, verbosity: u8) -> Self { + pub fn load(paths: PatientPaths, name: &str, verbosity: u8) -> Self { //! Load patient's data from paths let mut found_one = false; + let ecdna = { match paths.distribution { Some(path) => { @@ -26,7 +35,15 @@ impl Patient { if verbosity > 0u8 { println!("Loading ecdna distribution"); } - Some(EcDNA::from_file(&path)) + Some(EcDNADistribution::try_from(&path) + .with_context(|| { + format!( + "Cannot load the ecDNA distribution from {:#?}", + path + ) + }) + .unwrap(), + ) } None => None, @@ -40,7 +57,16 @@ impl Patient { if verbosity > 0u8 { println!("Loading mean"); } - Some(Mean::from_file(&path)) + Some( + Mean::try_from(&path) + .with_context(|| { + format!( + "Cannot load the mean from {:#?}", + path + ) + }) + .unwrap(), + ) } None => None, } @@ -53,7 +79,16 @@ impl Patient { if verbosity > 0u8 { println!("Loading frequency"); } - Some(Frequency::from_file(&path)) + Some( + Frequency::try_from(&path) + .with_context(|| { + format!( + "Cannot load the frequency from {:#?}", + path + ) + }) + .unwrap(), + ) } None => None, } @@ -66,7 +101,16 @@ impl Patient { if verbosity > 0u8 { println!("Loading entropy"); } - Some(Entropy::from_file(&path)) + Some( + Entropy::try_from(&path) + .with_context(|| { + format!( + "Cannot load the entropy from {:#?}", + path + ) + }) + .unwrap(), + ) } None => None, } @@ -74,12 +118,7 @@ impl Patient { assert!(found_one, "Must provide at least one path to load data"); - Patient { - ecdna, - mean, - frequency, - entropy, - } + Patient { ecdna, mean, frequency, entropy, name: name.to_string() } } } @@ -112,105 +151,118 @@ pub struct PatientPaths { pub struct ABCRejection; impl ABCRejection { - pub fn run(run: Run, patient: &Patient) -> ABCResults { + pub fn run(run: &Run, patient: &Patient) -> ABCResults { //! Run the ABC rejection method by comparing the run against the //! patient's data - let mut results = HashMap::with_capacity(4); + let nb_samples = 100usize; + let mut results = ABCResults(Vec::with_capacity(nb_samples)); + if let Some(cells) = run.parameters.subsample { + // create multiple subsamples of the same run and save the results + // in the same file `path`. It's ok as long as cells is not too + // big because deep copies of the ecDNA distribution for each + // subsample + let mut rng = SmallRng::from_entropy(); + for i in 0usize..nb_samples { + // returns new ecDNA distribution with cells NPlus cells (clone) + let sampled = run.undersample_ecdna(&cells, &mut rng, i); + results.0.push(ABCRejection::run_it(&sampled, patient)); + } + } else { + results.0.push(ABCRejection::run_it(run, patient)); + } + results + } + + fn run_it(run: &Run, patient: &Patient) -> ABCResult { + let mut builder = ABCResultBuilder::default(); if let Some(ecdna) = &patient.ecdna { - let (different, distance) = ecdna.is_different(&run); - results.insert("ecdna".to_string(), (!(different) as u8, distance)); + builder.ecdna(ecdna.distance(run)); } if let Some(mean) = &patient.mean { - let (different, distance) = mean.is_different(&run); - results.insert("mean".to_string(), (!(different) as u8, distance)); + builder.mean(mean.distance(run)); } if let Some(frequency) = &patient.frequency { - let (different, distance) = frequency.is_different(&run); - results.insert("frequency".to_string(), (!(different) as u8, distance)); + builder.frequency(frequency.distance(run)); } if let Some(entropy) = &patient.entropy { - let (different, distance) = entropy.is_different(&run); - results.insert("entropy".to_string(), (!(different) as u8, distance)); + builder.entropy(entropy.distance(run)); + } + + let idx = run.idx.to_string(); + if let Some(run) = run.get_parental_run() { + builder.parental_idx(*run); } - assert!(!results.is_empty(), "Found no data for the patient"); - ABCResults(results) + let mut result = + builder.build().expect("Cannot run ABC without any data"); + let rates = run.get_rates(); + result.f1 = rates[0]; + result.f2 = rates[1]; + result.d1 = rates[2]; + result.d2 = rates[3]; + + result.idx = idx; + result } } /// Results of the ABC rejection algorithm, i.e. the posterior distributions of /// the rates. There is one `ABCResults` for each run. -pub struct ABCResults(HashMap); +pub struct ABCResults(Vec); + +#[derive(Builder, Debug, Serialize)] +struct ABCResult { + #[builder(setter(strip_option), default)] + parental_idx: Option, + #[builder(setter(skip))] + idx: String, + #[builder(setter(strip_option), default)] + ecdna: Option, + #[builder(setter(strip_option), default)] + mean: Option, + #[builder(setter(strip_option), default)] + frequency: Option, + #[builder(setter(strip_option), default)] + entropy: Option, + #[builder(setter(skip))] + f1: f32, + #[builder(setter(skip))] + f2: f32, + #[builder(setter(skip))] + d1: f32, + #[builder(setter(skip))] + d2: f32, +} impl ABCResults { - pub fn save(&self, filename: &Path, path2folder: &Path, rates: &[f32]) -> bool { - //! Save the results of the abc inference for a run generating the - //! following files: - //! - //! 1. metadata: stores whether the statistics that are similar (1) - //! or different (0) for ABC (ground truth vs simulation) for each run - //! - //! 2. values: the values of the summary statistics for each run - //! - //! 3. all_rates: the rates of all runs, also those that have been - //! rejected and thus do not contribute to the posterior distribution - //! - //! 4. rates: the posterior distribution of the rates, here the rates - //! are only those that contribute to the posterior distribution as - //! opposed to file all_rates - let header: String = self - .0 - .keys() - .fold(String::new(), |accum, stat| accum + stat + ","); - - // write metadata - write2file( - &self - .0 - .values() - .map(|(is_different, _)| is_different) - .collect::>(), - &path2folder.join("metadata").join(filename), - Some(&header), - ) - .unwrap(); - - write2file( - &self - .0 - .values() - .map(|(_, distance)| distance) - .collect::>(), - &path2folder.join("values").join(filename), - Some(&header), - ) - .unwrap(); - - // save all runs to plot the histograms for each individual statistic - // Save the proliferation and death rates of the cells w/ ecDNA and - // w/o ecDNA respectively - write2file( - rates, - &path2folder.join("all_rates").join(filename), - Some("f1,f2,d1,d2"), - ) - .unwrap(); - - // write rates only if all stats pass the test - if self.positive() { - write2file(rates, &path2folder.join("rates").join(filename), None).unwrap(); - true + pub fn save( + &self, + path2folder: &Path, + subsamples: &Option, + ) -> anyhow::Result<()> { + //! Save the results of the abc inference for a run in a folder abc, where there is one file for each (parental) run and the name of the file corresponds to the parental run. + let path2save = if subsamples.is_some() { + path2folder.to_owned().join("samples") } else { - false - } - } + path2folder.to_owned() + }; + let filename = match self.0[0].parental_idx { + Some(run) => run.to_string(), + None => self.0[0].idx.clone(), + }; + let mut path2file = path2save.join("abc").join(filename); + path2file.set_extension("csv"); + fs::create_dir_all(path2file.parent().unwrap()) + .expect("Cannot create dir"); - pub fn positive(&self) -> bool { - //! A positive result means that the run is similar to the patient's - //! data - self.0.values().all(|&result| result.0 > 0u8) + let mut wtr = csv::Writer::from_path(path2file)?; + for record in self.0.iter() { + wtr.serialize(record)?; + wtr.flush()?; + } + Ok(()) } } diff --git a/ecdna-evo/src/data.rs b/ecdna-evo/src/data.rs index 32217af..fe7c7f4 100644 --- a/ecdna-evo/src/data.rs +++ b/ecdna-evo/src/data.rs @@ -1,142 +1,249 @@ //! The data of interest, such as the ecDNA distribution, its mean, its entropy, //! the frequency of cells w/ ecDNA etc... -use crate::run::Run; -use crate::simulation::{write2file, ToFile}; -use crate::{DNACopy, EcDNADistribution, NbIndividuals, Parameters}; -use anyhow::Context; -use kolmogorov_smirnov as ks; +use crate::run::{Ended, Run}; +use crate::{DNACopy, NbIndividuals, Parameters}; +use anyhow::{bail, ensure, Context}; +use enum_dispatch::enum_dispatch; +use rand::distributions::WeightedIndex; +use rand::rngs::SmallRng; +use rand::SeedableRng; +use rand_distr::Distribution; +use serde::{Deserialize, Serialize}; +use std::collections::{HashMap, HashSet}; +use std::convert::TryFrom; +use std::fs; use std::fs::read_to_string; +use std::io::{BufWriter, Write}; use std::ops::Deref; -use std::path::Path; +use std::path::{Path, PathBuf}; /// The main trait for the `Measurement` which defines how to compare the `Run` /// against the `Measurement`. Types that are `Measurement` must implement -/// `Differs` +/// `Distance` /// -/// # How can I implement `Differs`? +/// # How can I implement `Distance`? /// An example of `Measurement` comparing the ecDNA copy number mean of the /// patient against the one simulated by the run: /// /// ```no_run -/// use ecdna_evo::data::{euclidean_distance, relative_change, Differs}; -/// use ecdna_evo::Run; +/// use ecdna_evo::data::{euclidean_distance, relative_change, Distance}; +/// use ecdna_evo::run::{Ended, Run}; /// /// pub struct Mean(pub f32); /// -/// impl Differs for Mean { -/// fn is_different(&self, run: &Run) -> (bool, f32) { +/// impl Distance for Mean { +/// fn distance(&self, run: &Run) -> f32 { /// //! The run and the patient's data differ when absolute difference /// //! between the means considering `NMinus` cells is greater than a /// //! threshold. -/// let change = relative_change(&self.0, &run.data.mean); -/// return (change > 0.25, change); +/// relative_change(&self.0, &run.get_mean()) /// } /// } /// ``` -pub trait Differs { +pub trait Distance { /// The data differs from the run when the distance between a `Measurement` /// according to a certain metric is higher than a threshold - fn is_different(&self, run: &Run) -> (bool, f32); + fn distance(&self, run: &Run) -> f32; } -/// Initialize numerical data reading from file. Panics if the conversion from -/// string fails or there is an IO error -pub trait FromFile { - fn from_file(path2file: &Path) -> Self; +/// Trait to write the data to file +#[enum_dispatch] +pub trait ToFile { + fn save(&self, path2file: &Path) -> anyhow::Result<()>; } -impl FromFile for EcDNA { - fn from_file(path2file: &Path) -> Self { - EcDNA(EcDNADistribution::from( - read_csv::(path2file) - .with_context(|| format!("Cannot load the distribution from {:#?}", path2file)) - .unwrap(), - )) +pub fn write2file( + data: &[T], + path: &Path, + header: Option<&str>, + endline: bool, +) -> anyhow::Result<()> { + //! Write vector of float into new file with a precision of 4 decimals. + //! Write NAN if the slice to write to file is empty. + fs::create_dir_all(path.parent().unwrap()).expect("Cannot create dir"); + let f = fs::OpenOptions::new() + .read(true) + .append(true) + .create(true) + .open(path)?; + let mut buffer = BufWriter::new(f); + if !data.is_empty() { + if let Some(h) = header { + writeln!(buffer, "{}", h)?; + } + write!(buffer, "{:.4}", data.first().unwrap())?; + for ele in data.iter().skip(1) { + write!(buffer, ",{:.4}", ele)?; + } + if endline { + writeln!(buffer)?; + } + } else { + write!(buffer, "{}", f32::NAN)?; } + Ok(()) } -impl Differs for EcDNA { - fn is_different(&self, run: &Run) -> (bool, f32) { +/// Load from file the the ecDNA distribution. The file can be either a json (histogram) or a csv (single-cell ecDNA distribution) +impl TryFrom<&PathBuf> for EcDNADistribution { + type Error = anyhow::Error; + + fn try_from(path2file: &PathBuf) -> anyhow::Result { + let extension = path2file + .extension() + .with_context(|| { + format!("Do not recognize extension of file {:#?}", path2file) + }) + .unwrap(); + + match extension.to_str() { + Some("csv") => { + read_csv::(path2file).map(EcDNADistribution::from) + } + Some("json") => { + serde_json::from_str(&fs::read_to_string(path2file).unwrap()) + .map_err(|e| anyhow::anyhow!(e)) + } + _ => panic!("Extension not recognized!"), + } + } +} + +impl Distance for EcDNADistribution { + fn distance(&self, run: &Run) -> f32 { //! The run and the patient's data ecDNA distributions (considering //! cells w/o ecDNA) are different if the Kolmogorov-Smirnov statistic //! is greater than a certain threshold or if there are less than 10 //! cells // do not compute the KS statistics with less than 10 datapoints - let distance = { - if self.len() <= 10usize || run.data.ecdna.len() <= 10usize { - f32::INFINITY - } else { - ks::test(self, &run.data.ecdna, 0.95).statistic as f32 - } - }; - (distance >= 0.02f32, distance) + let too_few_cells = + self.nb_cells() <= 10u64 || run.get_nb_cells() <= 10u64; + if too_few_cells { + f32::INFINITY + } else { + self.ks_distance(run.get_ecdna()).0 + } } } -impl FromFile for Mean { - fn from_file(path2file: &Path) -> Self { - Mean( - *read_csv::(path2file) - .with_context(|| format!("Cannot load the mean from {:#?}", path2file)) - .unwrap() - .first() - .unwrap(), - ) +impl TryFrom<&EcDNADistribution> for Mean { + type Error = &'static str; + + fn try_from(ecdna: &EcDNADistribution) -> Result { + if ecdna.is_empty() { + Err("Mean only accepts non empty ecDNA distributions") + } else { + ecdna.mean().map_err(|_| stringify!(e)) + } } } -impl Differs for Mean { - fn is_different(&self, run: &Run) -> (bool, f32) { +/// load from file the mean of the ecdna distribution. the mean will be the first entry of the file. +impl TryFrom<&PathBuf> for Mean { + type Error = anyhow::Error; + + fn try_from(path2file: &PathBuf) -> anyhow::Result { + let extension = path2file + .extension() + .with_context(|| { + format!("do not recognize extension of file {:#?}", path2file) + }) + .unwrap(); + + match extension.to_str() { + Some("csv") => read_csv::(path2file) + .map(|mean| Mean(*mean.first().unwrap())), + _ => panic!("extension not recognized: the file must be a csv!"), + } + } +} + +impl Distance for Mean { + fn distance(&self, run: &Run) -> f32 { //! The run and the patient's data differ when the absolute difference //! between the means considering `NMinus` cells is greater than a //! threshold. - let change = relative_change(self, &run.data.mean); - (change > 0.25, change) + relative_change(self, run.get_mean()) } } -impl FromFile for Frequency { - fn from_file(path2file: &Path) -> Self { - Frequency( - *read_csv::(path2file) - .with_context(|| format!("Cannot load the mean from {:#?}", path2file)) - .unwrap() - .first() - .unwrap(), - ) +/// load from file the mean of the ecdna distribution. the mean will be the first entry of the file. +impl TryFrom<&PathBuf> for Frequency { + type Error = anyhow::Error; + + fn try_from(path2file: &PathBuf) -> anyhow::Result { + let extension = path2file + .extension() + .with_context(|| { + format!("do not recognize extension of file {:#?}", path2file) + }) + .unwrap(); + + match extension.to_str() { + Some("csv") => read_csv::(path2file) + .map(|frequency| Frequency(*frequency.first().unwrap())), + _ => panic!("extension not recognized: the file must be a csv!"), + } } } -impl Differs for Frequency { - fn is_different(&self, run: &Run) -> (bool, f32) { - //! The run and the patient's data differ when the absolute difference - //! between the frequencies considering `NMinus` cells is greater than a - //! threshold. - let change = relative_change(self, &run.data.frequency); - (change > 0.25, change) +impl TryFrom<&EcDNADistribution> for Frequency { + type Error = &'static str; + + fn try_from(ecdna: &EcDNADistribution) -> Result { + if ecdna.is_empty() { + Err("Frequency only accepts non empty ecDNA distributions") + } else { + ecdna.frequency().map_err(|_| stringify!(e)) + } } } -impl FromFile for Entropy { - fn from_file(path2file: &Path) -> Self { - Entropy( - *read_csv::(path2file) - .with_context(|| format!("Cannot load the entropy from {:#?}", path2file)) - .unwrap() - .first() - .unwrap(), - ) +impl Distance for Frequency { + fn distance(&self, run: &Run) -> f32 { + relative_change(self, run.get_frequency()) + } +} + +impl TryFrom<&EcDNADistribution> for Entropy { + type Error = &'static str; + + fn try_from(ecdna: &EcDNADistribution) -> Result { + if ecdna.is_empty() { + Err("Entropy only accepts non empty ecDNA distributions") + } else { + ecdna.entropy().map_err(|_| stringify!(e)) + } } } -impl Differs for Entropy { - fn is_different(&self, run: &Run) -> (bool, f32) { +/// Load from file the entropy of the ecDNA distribution, i.e. the first entry of the file. +impl TryFrom<&PathBuf> for Entropy { + type Error = anyhow::Error; + + fn try_from(path2file: &PathBuf) -> anyhow::Result { + let extension = path2file + .extension() + .with_context(|| { + format!("Do not recognize extension of file {:#?}", path2file) + }) + .unwrap(); + + match extension.to_str() { + Some("csv") => read_csv::(path2file) + .map(|entropy| Entropy(*entropy.first().unwrap())), + _ => panic!("Extension not recognized: the file must be a csv!"), + } + } +} + +impl Distance for Entropy { + fn distance(&self, run: &Run) -> f32 { //! The run and the patient's data differ when the absolute difference //! between the entropies considering `NMinus` cells is greater than a //! threshold. - let change = relative_change(self, &run.data.entropy); - (change > 0.25, change) + relative_change(self, run.get_entropy()) } } @@ -163,17 +270,24 @@ where parse_csv_string(&str_data, data) } -fn parse_csv_string(str_data: &str, mut data: Vec) -> anyhow::Result> +fn parse_csv_string( + str_data: &str, + mut data: Vec, +) -> anyhow::Result> where T: std::str::FromStr, ::Err: std::fmt::Debug, { for result in str_data.split(',') { - let val = result.parse::().expect("Error while parsing csv"); - if result == f32::NAN.to_string() { - panic!("Found {} in file", result); + match result.parse::() { + Ok(val) => { + if result == f32::NAN.to_string() { + panic!("Found {} in file", result); + } + data.push(val); + } + _ => bail!("Cannot parse csv: {}", result), } - data.push(val); } Ok(data) } @@ -189,84 +303,113 @@ pub fn euclidean_distance(val1: f32, val2: f32) -> f32 { } pub struct Data { - pub ecdna: EcDNA, + pub ecdna: EcDNADistribution, pub mean: Mean, pub frequency: Frequency, pub entropy: Entropy, } +impl TryFrom<&EcDNADistribution> for Data { + type Error = anyhow::Error; + + fn try_from(ecdna: &EcDNADistribution) -> anyhow::Result { + if ecdna.is_empty() { + Err(anyhow::anyhow!( + "Data only accepts non empty ecDNA distributions" + )) + } else { + let mean = Mean::try_from(ecdna).unwrap(); + let frequency = Frequency::try_from(ecdna).unwrap(); + let entropy = Entropy::try_from(ecdna).unwrap(); + Ok(Data { ecdna: ecdna.clone(), mean, frequency, entropy }) + } + } +} + impl Data { - pub fn new(ecdna: EcDNA, mean: Mean, frequency: Frequency, entropy: Entropy) -> Self { - Data { - ecdna, - mean, - frequency, - entropy, + pub fn save( + &self, + abspath: &Path, + run_idx: &Path, + subsample: &Option, + ) { + if let Some(cells) = subsample { + let mut rng = SmallRng::from_entropy(); + let abspath_sampling = abspath.join("samples"); + let data = self.undersample_ecdna(cells, &mut rng); + data.save_data(&abspath_sampling, run_idx, true); } + self.save_data(abspath, run_idx, true); } - pub fn save(&self, file2path: &Path, filename: &Path, save_ecdna: bool) { + fn save_data(&self, file2path: &Path, filename: &Path, save_ecdna: bool) { if save_ecdna { - let ecdna = file2path.join("ecdna").join(filename); - write2file(&self.ecdna, &ecdna, None).expect("Cannot save the ecDNA distribution"); + let mut ecdna = file2path.join("ecdna").join(filename); + ecdna.set_extension("json"); + + self.ecdna + .save(&ecdna) + .expect("Cannot save the ecDNA distribution"); } - let mean = file2path.join("mean").join(filename); - write2file(&[*self.mean], &mean, None).expect("Cannot save the ecDNA distribution mean"); + let mut mean = file2path.join("mean").join(filename); + mean.set_extension("csv"); + self.mean + .save(&mean) + .expect("Cannot save the ecDNA distribution mean"); - let frequency = file2path.join("frequency").join(filename); - write2file(&[*self.frequency], &frequency, None).expect("Cannot save the frequency"); + let mut frequency = file2path.join("frequency").join(filename); + frequency.set_extension("csv"); + self.frequency.save(&frequency).expect("Cannot save the frequency"); - let entropy = file2path.join("entropy").join(filename); - write2file(&[*self.entropy], &entropy, None) + let mut entropy = file2path.join("entropy").join(filename); + entropy.set_extension("csv"); + self.entropy + .save(&entropy) .expect("Cannot save the ecDNA distribution entropy"); } -} - -#[derive(Clone, Default, Debug)] -pub struct EcDNA(pub EcDNADistribution); - -impl EcDNA { - pub fn get_ecdna_distr(&self) -> &EcDNADistribution { - &self.0 - } - pub fn ntot(&self) -> NbIndividuals { - self.0.len().try_into().unwrap() + fn undersample_ecdna( + &self, + nb_cells: &NbIndividuals, + rng: &mut SmallRng, + ) -> Self { + //! Returns a copy of the run with subsampled ecDNA distribution + let data = self + .ecdna + .undersample_data(nb_cells, rng) + .with_context(|| { + "Cannot undersample_data from empty ecDNA distribution" + }) + .unwrap(); + + assert_eq!( + &data.ecdna.nb_cells(), + nb_cells, + "Wrong undersampling of the ecDNA distirbution: {} cells expected after sampling, found {}, {:#?}", nb_cells, data.ecdna.nb_cells(), data.ecdna + ); + + data } } -impl ToFile for EcDNA { +impl ToFile for EcDNADistribution { fn save(&self, path2file: &Path) -> anyhow::Result<()> { - write2file(&self.0, path2file, None)?; + let data = serde_json::to_string(self) + .expect("Cannot serialize the ecDNA distribution"); + fs::create_dir_all(path2file.parent().unwrap()) + .expect("Cannot create dir"); + fs::write(path2file, data)?; Ok(()) } } -impl Deref for EcDNA { - type Target = EcDNADistribution; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq)] pub struct Mean(pub f32); -impl Mean { - pub fn new(parameters: &Parameters) -> Self { - Mean(parameters.init_copies as f32) - } - - pub fn get_mean(&self) -> &f32 { - &self.0 - } -} - impl ToFile for Mean { fn save(&self, path2file: &Path) -> anyhow::Result<()> { - write2file(&[self.0], path2file, None)?; + write2file(&[self.0], path2file, None, false)?; Ok(()) } } @@ -288,43 +431,527 @@ impl_deref!(Frequency); impl_deref!(Entropy); /// The frequency of cells with ecDNA at last iteration -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq)] pub struct Frequency(pub f32); impl Frequency { pub fn new(parameters: &Parameters) -> Self { let init_nplus = parameters.init_nplus as u64; - Frequency(init_nplus as f32 / ((init_nplus + parameters.init_nminus) as f32)) - } - - pub fn get_frequency(&self) -> &f32 { - &self.0 + Frequency( + init_nplus as f32 / ((init_nplus + parameters.init_nminus) as f32), + ) } } impl ToFile for Frequency { fn save(&self, path2file: &Path) -> anyhow::Result<()> { - write2file(&[self.0], path2file, None)?; + write2file(&[self.0], path2file, None, false)?; Ok(()) } } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq)] pub struct Entropy(pub f32); impl Entropy { pub fn new(parameters: &Parameters) -> Self { Entropy(parameters.init_copies as f32) } - - pub fn get_entropy(&self) -> &f32 { - &self.0 - } } impl ToFile for Entropy { fn save(&self, path2file: &Path) -> anyhow::Result<()> { - write2file(&[self.0], path2file, None)?; + write2file(&[self.0], path2file, None, false)?; Ok(()) } } + +/// The distribution of ecDNA copies considering the cells w/o any ecDNA copy +/// represented as an histogram. +#[derive(Clone, PartialEq, Debug, Default, Deserialize, Serialize)] +pub struct EcDNADistribution { + distribution: HashMap, + /// Number of total (`NPlus` and `NMinus`) cells + ntot: NbIndividuals, +} + +impl From> for EcDNADistribution { + fn from(distribution: HashMap) -> Self { + let ntot = distribution.values().sum(); + EcDNADistribution { distribution, ntot } + } +} + +impl From> for EcDNADistribution { + fn from(distr: Vec) -> Self { + let mut mapping: HashMap = HashMap::new(); + let ntot = distr.len() as NbIndividuals; + distr + .into_iter() + .for_each(|ecdna| *mapping.entry(ecdna).or_default() += 1); + EcDNADistribution { distribution: mapping, ntot } + } +} + +impl EcDNADistribution { + pub fn ks_distance(&self, ecdna: &EcDNADistribution) -> (f32, bool) { + //! The ks distance represents the maximal absolute distance between the + //! empirical cumulative distributions of two `EcDNADistribution`s. + //! + //! Compute ks distance with `NPlus` and `NMinus` cells, and returns the + //! distance as well whether the loop stopped before reaching the maximal + //! allowed copy number `u16::MAX`, which is the case when the distance is + //! 1 ie maximal, or one of the cumulative distributions have reached 1 + //! and thus the distance can only decrease monotonically. + //! + //! Does **not** panic if empty distributions. + // Start from one which is the max value of the distance + let mut distance = 0f32; + // Compare the two empirical cumulative distributions (self) and ecdna + let mut ecdf = 0f32; + let mut ecdf_other = 0f32; + + // do not test small samples because ks is not reliable (ecdf) + if self.ntot < 10u64 || ecdna.ntot < 10u64 { + return (1f32, false); + } + + // iter over all ecDNA copies present in both data (self) and ecdna + for copy in 0u16..u16::MAX { + if let Some(ecdna_copy) = self.distribution.get(©) { + ecdf += (*ecdna_copy as f32) / (self.ntot as f32); + } + + if let Some(ecdna_copy) = ecdna.distribution.get(©) { + ecdf_other += (*ecdna_copy as f32) / (ecdna.ntot as f32); + } + + // store the maximal distance between the two ecdfs + let diff = (ecdf - ecdf_other).abs(); + if diff - distance > f32::EPSILON { + distance = diff; + } + + // check if any of the ecdf have reached 1. If it's the case + // the difference will decrease monotonically and we can stop + let max_dist = (ecdf - 1.0).abs() <= f32::EPSILON + || (ecdf_other - 1.0).abs() <= f32::EPSILON + || (distance - 1.0).abs() <= f32::EPSILON; + if max_dist { + return (distance, true); + } + } + (distance, false) + } + + fn mean(&self) -> anyhow::Result { + ensure!( + !self.distribution.is_empty(), + "Cannot compute mean from empty distribution" + ); + let sum = + self.distribution.iter().fold(0u64, |accum, (copy, count)| { + accum + (*copy as u64) * *count + }) as f32; + Ok(Mean(sum / (self.ntot as f32))) + } + + fn frequency(&self) -> anyhow::Result { + ensure!( + !self.distribution.is_empty(), + "Cannot compute frequency from empty distribution" + ); + let nminus = + self.distribution.get(&0u16).cloned().unwrap_or_default() as f32; + Ok(Frequency(1f32 - nminus / (self.ntot as f32))) + } + + fn entropy(&self) -> anyhow::Result { + ensure!( + !self.distribution.is_empty(), + "Cannot compute entropy from empty distribution" + ); + Ok(Entropy( + -self + .distribution + .values() + .map(|&count| { + let prob = (count as f32) / (self.ntot as f32); + prob * prob.log2() + }) + .sum::(), + )) + } + + pub fn is_empty(&self) -> bool { + self.distribution.is_empty() + } + + pub fn undersample_data( + &self, + nb_cells: &NbIndividuals, + rng: &mut SmallRng, + ) -> anyhow::Result { + let ecdna = self.undersample(nb_cells, rng); + Data::try_from(&ecdna) + } + + fn undersample( + &self, + nb_cells: &NbIndividuals, + rng: &mut SmallRng, + ) -> EcDNADistribution { + //! Undersample the ecDNA distribution taking roughly sample the proportion of ecDNA copies in cells found in the tumour + assert!( + !self.is_empty(), + "Cannot undersample from empty ecDNA distribution" + ); + + assert!(nb_cells <= &self.nb_cells(), "Cannot undersample with `nb_cells` greater than the cells found in the ecDNA distribution"); + + let ecdna = self.distribution.clone(); + // ecdna.remove(&0u16); // remove NMinus cells + if ecdna.is_empty() { + return EcDNADistribution::from(ecdna); + } + let ecdna = + ecdna.into_iter().collect::>(); + + let mut distribution: HashMap = HashMap::new(); + let sampler = + WeightedIndex::new(ecdna.iter().map(|item| item.1)).unwrap(); + + for _ in 0..*nb_cells { + *distribution.entry(ecdna[sampler.sample(rng)].0).or_default() += + 1; + } + + EcDNADistribution::from(distribution) + } + + pub fn copies(&self) -> HashSet { + //! Get all the ecDNA copies available in the population (the x values + //! in the histogram). + self.distribution.keys().copied().collect() + } + + pub fn nb_cells(&self) -> NbIndividuals { + self.distribution.values().sum::() + } +} + +#[cfg(test)] +extern crate quickcheck; + +#[cfg(test)] +mod tests { + use super::*; + use fake::Fake; + use quickcheck::{quickcheck, Gen}; + use rand::rngs::SmallRng; + use rand::{Rng, SeedableRng}; + use test_case::test_case; + + #[test] + fn from_map() { + let original_data = + HashMap::from([(0u16, 12u64), (2u16, 1u64), (10u16, 3u64)]); + let ecdna = EcDNADistribution::from(original_data.clone()); + assert_eq!(ecdna.distribution, original_data); + assert_eq!(ecdna.ntot, 16u64); + } + + #[test] + fn from_vector() { + let original_data = vec![0u16, 2u16, 10u16]; + let ntot = original_data.len(); + let ecdna = EcDNADistribution::from(original_data); + let hist = HashMap::from([(0u16, 1u64), (2u16, 1u64), (10u16, 1u64)]); + assert_eq!(ecdna.distribution, hist); + assert_eq!(ecdna.ntot, ntot as NbIndividuals); + } + + #[test] + fn from_vec_for_ecdna_distribution_empty() { + let my_vec = vec![]; + let dna = EcDNADistribution::from(my_vec); + assert!(dna.is_empty()); + } + + #[test] + fn from_vector_multiple_values() { + let original_data = vec![0u16, 2u16, 2u16, 10u16]; + let ecdna = EcDNADistribution::from(original_data); + let hist = HashMap::from([(0u16, 1u64), (2u16, 2u64), (10u16, 1u64)]); + assert_eq!(ecdna.distribution, hist); + } + + #[test_case(vec![0u16, 1u16, 2u16] => Mean(1f32) ; "Balanced input")] + #[test_case(vec![1u16, 1u16, 1u16] => Mean(1f32) ; "Constant input")] + #[test_case(vec![0u16, 2u16] => Mean(1f32) ; "Unbalanced input")] + fn ecdna_mean_1(ecdna: Vec) -> Mean { + EcDNADistribution::from(ecdna).mean().unwrap() + } + + #[test] + #[should_panic] + fn ecdna_mean_empty() { + EcDNADistribution::from(vec![]).mean().unwrap(); + } + + #[test_case(vec![1u16, 1u16, 1u16] => Frequency(1f32) ; "NPlus")] + #[test_case(vec![0u16] => Frequency(0f32) ; "Nminus")] + #[test_case(vec![0u16, 2u16] => Frequency(0.5f32) ; "Mixed")] + fn ecdna_frequency(ecdna: Vec) -> Frequency { + EcDNADistribution::from(ecdna).frequency().unwrap() + } + + #[test] + #[should_panic] + fn ecdna_frequency_empty() { + EcDNADistribution::from(vec![]).frequency().unwrap(); + } + + // #[test_case(vec![0u16, 1u16, 2u16] => Mean(1f32) ; "Balanced input")] + // #[test_case(vec![1u16, 1u16, 1u16] => Mean(1f32) ; "Constant input")] + // #[test_case(vec![0u16, 2u16] => Mean(1f32) ; "Unbalanced input")] + // fn ecdna_mean_1(ecdna: Vec) -> Mean { + // EcDNADistribution::from(ecdna).mean() + // } + + #[test] + #[should_panic] + fn ecdna_entropy_empty() { + EcDNADistribution::from(vec![]).entropy().unwrap(); + } + + #[test] + #[should_panic] + fn data_try_from_empty_ecdna() { + Data::try_from(&EcDNADistribution::from(vec![])).unwrap(); + } + + #[test] + #[should_panic] + fn ecdna_distribution_undersampling_empty() { + let mut rng = SmallRng::from_entropy(); + let ecdna = EcDNADistribution::from(vec![]); + ecdna.undersample(&3u64, &mut rng); + } + + #[test] + #[should_panic] + fn ecdna_distribution_undersampling_more_cells_than_ecdna_distribution() { + let mut rng = SmallRng::from_entropy(); + let ecdna = EcDNADistribution::from(vec![10u16]); + ecdna.undersample(&3u64, &mut rng); + } + + impl fake::Dummy for EcDNADistribution { + fn dummy_with_rng( + _: &EcDNADistribution, + rng: &mut R, + ) -> Self { + let range = rand::distributions::Uniform::new(0, 20); + let mut distribution = + (0..19).map(|_| rng.sample(&range)).collect::>(); + distribution.push(0); + distribution.into() + } + } + + // Both `Clone` and `Debug` are required by `quickcheck` + #[derive(Debug, Clone)] + struct ValidEcDNADistributionFixture(pub EcDNADistribution); + + impl quickcheck::Arbitrary for ValidEcDNADistributionFixture { + fn arbitrary(_g: &mut Gen) -> Self { + // TODO should use g instead of rng but Gen does not seem to impl Rng trait?? + let mut rng = SmallRng::from_entropy(); + let ecdna = + EcDNADistribution::from(vec![]).fake_with_rng(&mut rng); + Self(ecdna) + } + } + + #[quickcheck] + fn ecdna_distribution_undersampling_check_cells_tot( + valid_ecdna: ValidEcDNADistributionFixture, + ) -> bool { + let mut rng = SmallRng::from_entropy(); + valid_ecdna.0.undersample(&8u64, &mut rng).nb_cells() == 8u64 + } + + #[quickcheck] + fn ecdna_distribution_undersampling_check_copies( + valid_ecdna: ValidEcDNADistributionFixture, + ) -> bool { + let mut rng = SmallRng::from_entropy(); + valid_ecdna + .0 + .undersample(&8u64, &mut rng) + .copies() + .is_subset(&valid_ecdna.0.copies()) + } + + #[test] + fn ecdna_distribution_undersampling_check_copies_one_copy() { + let mut rng = SmallRng::from_entropy(); + let distribution = HashMap::from([(10u16, 2u64)]); + let ntot = distribution.values().sum(); + let ecdna = EcDNADistribution { distribution, ntot }; + + assert_eq!( + ecdna.undersample(&1u64, &mut rng).copies(), + HashSet::from([10u16]), + ); + + assert_eq!( + ecdna.undersample(&1u64, &mut rng), + EcDNADistribution::from(vec![10u16]) + ); + } + + #[test] + fn ecdna_distribution_undersampling_check_with_nminus_empty() { + let mut rng = SmallRng::from_entropy(); + let distribution = HashMap::from([(0u16, 10u64)]); + let ecdna = EcDNADistribution::from(distribution); + assert_eq!( + ecdna.undersample(&1u64, &mut rng), + EcDNADistribution::from(vec![0u16]) + ); + } + + #[test] + fn ecdna_distribution_undersampling_check_copies_one_copy_two_samples() { + let mut rng = SmallRng::from_entropy(); + let distribution = HashMap::from([(10u16, 2u64)]); + let ntot = distribution.values().sum(); + let ecdna = EcDNADistribution { distribution, ntot }; + assert_eq!( + ecdna.undersample(&2u64, &mut rng).copies(), + HashSet::from([10u16]) + ); + assert_eq!( + ecdna.undersample(&2u64, &mut rng), + EcDNADistribution::from(vec![10u16, 10u16]) + ); + } + + #[test] + fn ecdna_distribution_undersampling_check_likelihood() { + let mut rng = SmallRng::from_entropy(); + let distribution = HashMap::from([(10u16, 10000u64), (1u16, 1u64)]); + let ntot = distribution.values().sum(); + let ecdna = EcDNADistribution { distribution, ntot }; + assert_eq!( + ecdna.undersample(&1u64, &mut rng).copies(), + HashSet::from([10u16]) + ); + + assert_eq!( + ecdna.undersample(&1u64, &mut rng), + EcDNADistribution::from(vec![10u16]) + ); + } + + #[test] + fn ecdna_ks_distance_empty() { + let x = EcDNADistribution::from(vec![]); + let y = EcDNADistribution::from(vec![]); + assert_eq!(x.ks_distance(&y), (1f32, false)); + assert_eq!(y.ks_distance(&x), (1f32, false)); + assert_eq!(x.ks_distance(&x), (1f32, false)); + } + + #[test] + fn ecdna_ks_distance_small_samples() { + let x = EcDNADistribution::from(vec![1, 2]); + let y = EcDNADistribution::from(vec![1, 2]); + assert_eq!(x.ks_distance(&y), (1f32, false)); + assert_eq!(y.ks_distance(&x), (1f32, false)); + assert_eq!(x.ks_distance(&x), (1f32, false)); + } + + #[quickcheck] + fn ecdna_ks_distance_same_data(x: ValidEcDNADistributionFixture) -> bool { + let (distance, _) = x.0.ks_distance(&x.0); + (distance - 0f32).abs() <= f32::EPSILON + } + + #[quickcheck] + fn ecdna_ks_distance_max_copy_number_u16( + mut x: ValidEcDNADistributionFixture, + y: ValidEcDNADistributionFixture, + ) -> bool { + x.0.distribution.insert(u16::MAX, 3u64); + let (_, convergence) = x.0.ks_distance(&y.0); + convergence + } + + #[quickcheck] + fn ecdna_ks_distance_same_data_shifted_by_1( + x: ValidEcDNADistributionFixture, + ) -> bool { + // https://github.com/daithiocrualaoich/kolmogorov_smirnov/blob/master/src/test.rs#L474 + let y_min = *x.0.distribution.keys().max().unwrap() + 1; + let y: EcDNADistribution = + x.0.distribution + .iter() + .map(|(k, val)| (y_min + *k, *val)) + .collect::>() + .into(); + + assert_eq!(y.ntot, 20); + let (distance, convergence) = x.0.ks_distance(&y); + println!("{}", distance); + (distance - 1f32).abs() <= f32::EPSILON && convergence + } + + #[quickcheck] + fn ecdna_ks_distance_same_data_shifted_by_1_added_support( + x: ValidEcDNADistributionFixture, + ) -> bool { + // https://github.com/daithiocrualaoich/kolmogorov_smirnov/blob/master/src/test.rs#L474 + let y_min = *x.0.distribution.keys().max().unwrap() + 1; + // Add all the original items back too. + let mut y: HashMap = + HashMap::with_capacity(2usize * x.0.ntot as usize); + + for (k, v) in x.0.distribution.iter() { + y.insert(*k + y_min, *v); + y.insert(*k, *v); + } + + let y: EcDNADistribution = y.into(); + + assert_eq!(y.ntot, x.0.ntot * 2); + let (distance, convergence) = x.0.ks_distance(&y); + (distance - 0.5f32).abs() <= f32::EPSILON && convergence + } + + #[quickcheck] + fn ecdna_ks_distance_is_one_div_length_for_sample_with_additional_low_value( + x: ValidEcDNADistributionFixture, + ) -> bool { + // Add a extra sample of early weight to ys. + let mut ys = x.clone(); + let nminus = ys.0.distribution.entry(0u16).or_insert(0u64); + *nminus += 1; + ys.0.ntot += 1; + assert_eq!(ys.0.ntot, x.0.ntot + 1); + + let (distance, convergence) = x.0.ks_distance(&ys.0); + + let expected = match x.0.distribution.get(&0u16) { + Some(©) => { + (copy as f32 + 1.0) / ys.0.ntot as f32 + - (copy as f32) / (x.0.ntot as f32) + } + None => 1.0 / ys.0.ntot as f32, + }; + + (distance - expected).abs() <= f32::EPSILON && convergence + } +} diff --git a/ecdna-evo/src/dynamics.rs b/ecdna-evo/src/dynamics.rs index 9f7f0f7..f5e72e0 100644 --- a/ecdna-evo/src/dynamics.rs +++ b/ecdna-evo/src/dynamics.rs @@ -6,9 +6,9 @@ //! To add a new dynamical quantity, register it in the enum `Dynamic` and //! implement the trait `Name` and `Update` and modify `Dynamic::save`. +use crate::data::{write2file, ToFile}; use crate::gillespie; -use crate::run::InitializedRun; -use crate::simulation::{write2file, ToFile}; +use crate::run::{Run, Started}; use crate::{GillespieTime, NbIndividuals, Parameters}; use enum_dispatch::enum_dispatch; use std::ops::{Deref, DerefMut}; @@ -26,8 +26,9 @@ use std::path::Path; /// with any copy of ecDNA per iteration: /// /// ```no_run -/// use ecdna_evo::run::InitializedRun; -/// use ecdna_evo::{NbIndividuals, Update}; +/// use ecdna_evo::dynamics::Update; +/// use ecdna_evo::run::{Run, Started}; +/// use ecdna_evo::NbIndividuals; /// /// pub struct NPlus { /// /// Record the number of cells w/ ecDNA for each iteration. @@ -35,7 +36,7 @@ use std::path::Path; /// } /// /// impl Update for NPlus { -/// fn update(&mut self, run: &InitializedRun) { +/// fn update(&mut self, run: &Run) { /// self.nplus_dynamics.push(run.get_nplus()); /// } /// } @@ -46,7 +47,7 @@ pub trait Update { /// Update the measurement based on the `run` for each iteration, i.e. /// defines how to interact with `Run` to update the quantity of interest /// for each iteration. - fn update(&mut self, run: &InitializedRun); + fn update(&mut self, run: &Run); } #[derive(Clone, Default, Debug)] @@ -141,13 +142,13 @@ pub struct NPlus { impl ToFile for NPlus { fn save(&self, path2file: &Path) -> anyhow::Result<()> { - write2file(&self.nplus_dynamics, path2file, None)?; + write2file(&self.nplus_dynamics, path2file, None, false)?; Ok(()) } } impl Update for NPlus { - fn update(&mut self, run: &InitializedRun) { + fn update(&mut self, run: &Run) { self.nplus_dynamics.push(run.get_nplus()); } } @@ -160,12 +161,10 @@ impl Name for NPlus { impl NPlus { pub fn new(parameters: &Parameters) -> Self { - let mut nplus_dynamics = Vec::with_capacity(parameters.max_cells as usize); + let mut nplus_dynamics = + Vec::with_capacity(parameters.max_cells as usize); nplus_dynamics.push(parameters.init_nplus as u64); - NPlus { - nplus_dynamics, - name: "nplus".to_string(), - } + NPlus { nplus_dynamics, name: "nplus".to_string() } } } @@ -179,13 +178,13 @@ pub struct NMinus { impl ToFile for NMinus { fn save(&self, path2file: &Path) -> anyhow::Result<()> { - write2file(&self.nminus_dynamics, path2file, None)?; + write2file(&self.nminus_dynamics, path2file, None, false)?; Ok(()) } } impl Update for NMinus { - fn update(&mut self, run: &InitializedRun) { + fn update(&mut self, run: &Run) { self.nminus_dynamics.push(*run.get_nminus()); } } @@ -198,12 +197,10 @@ impl Name for NMinus { impl NMinus { pub fn new(parameters: &Parameters) -> Self { - let mut nminus_dynamics = Vec::with_capacity(parameters.max_cells as usize); + let mut nminus_dynamics = + Vec::with_capacity(parameters.max_cells as usize); nminus_dynamics.push(parameters.init_nminus); - NMinus { - nminus_dynamics, - name: "nminus".to_string(), - } + NMinus { nminus_dynamics, name: "nminus".to_string() } } } @@ -219,12 +216,9 @@ impl MeanDyn { let mut mean = Vec::with_capacity(parameters.max_iter); mean.push((parameters.init_nplus as u16) as f32); - MeanDyn { - mean, - name: "mean_dynamics".to_string(), - } + MeanDyn { mean, name: "mean_dynamics".to_string() } } - pub fn ecdna_distr_mean(&self, run: &InitializedRun) -> f32 { + pub fn ecdna_distr_mean(&self, run: &Run) -> f32 { //! The mean of the ecDNA distribution for the current iteration. let ntot = run.get_nminus() + run.get_nplus(); match gillespie::fast_mean_computation( @@ -241,13 +235,13 @@ impl MeanDyn { impl ToFile for MeanDyn { fn save(&self, path2file: &Path) -> anyhow::Result<()> { - write2file(&self.mean, path2file, None)?; + write2file(&self.mean, path2file, None, false)?; Ok(()) } } impl Update for MeanDyn { - fn update(&mut self, run: &InitializedRun) { + fn update(&mut self, run: &Run) { self.mean.push(self.ecdna_distr_mean(run)); } } @@ -268,7 +262,7 @@ pub struct Variance { } impl Update for Variance { - fn update(&mut self, run: &InitializedRun) { + fn update(&mut self, run: &Run) { let mean = self.mean.ecdna_distr_mean(run); self.mean.update(run); self.variance.push(run.variance_ecdna(&mean)); @@ -287,17 +281,13 @@ impl Variance { variance.push(0f32); let mean = MeanDyn::new(parameters); - Variance { - variance, - mean, - name: "var_dynamics".to_string(), - } + Variance { variance, mean, name: "var_dynamics".to_string() } } } impl ToFile for Variance { fn save(&self, path2file: &Path) -> anyhow::Result<()> { - write2file(&self.variance, path2file, None)?; + write2file(&self.variance, path2file, None, false)?; Ok(()) } } @@ -311,7 +301,7 @@ pub struct GillespieT { } impl Update for GillespieT { - fn update(&mut self, run: &InitializedRun) { + fn update(&mut self, run: &Run) { self.time .push(self.time.last().unwrap() + run.get_gillespie_event().time); } @@ -327,16 +317,13 @@ impl GillespieT { pub fn new(parameters: &Parameters) -> Self { let mut time = Vec::with_capacity(parameters.max_cells as usize); time.push(0f32); - GillespieT { - time, - name: "time".to_string(), - } + GillespieT { time, name: "time".to_string() } } } impl ToFile for GillespieT { fn save(&self, path2file: &Path) -> anyhow::Result<()> { - write2file(&self.time, path2file, None)?; + write2file(&self.time, path2file, None, false)?; Ok(()) } } diff --git a/ecdna-evo/src/gillespie.rs b/ecdna-evo/src/gillespie.rs index c262070..3dc2e37 100644 --- a/ecdna-evo/src/gillespie.rs +++ b/ecdna-evo/src/gillespie.rs @@ -57,7 +57,10 @@ impl Rate { [rate] => Rate::Scalar(rate), [min, max] => Rate::Range(Range::new(min, max)), _ => { - panic!("Cannot create Rate with more than two rates {:#?}", rates) + panic!( + "Cannot create Rate with more than two rates {:#?}", + rates + ) } } } @@ -106,10 +109,7 @@ struct Range { impl Default for Range { fn default() -> Self { - Range { - min: 0f32, - max: 1f32, - } + Range { min: 0f32, max: 1f32 } } } @@ -166,9 +166,7 @@ impl DeathRate { impl std::str::FromStr for ProliferationRate { type Err = std::num::ParseFloatError; fn from_str(s: &str) -> Result { - f32::from_str(s).map(|val| ProliferationRate { - 0: Rate::Scalar(val), - }) + f32::from_str(s).map(|val| ProliferationRate { 0: Rate::Scalar(val) }) } } @@ -210,9 +208,7 @@ impl Mul for ProliferationRate { impl std::str::FromStr for DeathRate { type Err = std::num::ParseFloatError; fn from_str(s: &str) -> Result { - f32::from_str(s).map(|val| DeathRate { - 0: Rate::Scalar(val), - }) + f32::from_str(s).map(|val| DeathRate { 0: Rate::Scalar(val) }) } } @@ -519,7 +515,8 @@ impl BirthDeathProcess { "Found negative death_rate_nminus should be positive!" ); let death_rate1_found = d1 > 0f32; - let exactly_one_death_rate_found = (death_rate1_found).bitxor(d2 > 0_f32); + let exactly_one_death_rate_found = + (death_rate1_found).bitxor(d2 > 0_f32); if exactly_one_death_rate_found { if death_rate1_found { @@ -530,12 +527,7 @@ impl BirthDeathProcess { process.into() } } else if death_rate1_found { - let process = BirthDeath { - r1: f1, - r2: f2, - d1, - d2, - }; + let process = BirthDeath { r1: f1, r2: f2, d1, d2 }; process.into() } else { let process = PureBirth { r1: f1, r2: f2 }; @@ -543,14 +535,22 @@ impl BirthDeathProcess { } } - pub fn gillespie(&self, pop1: NbIndividuals, pop2: NbIndividuals) -> Event { + pub fn gillespie( + &self, + pop1: NbIndividuals, + pop2: NbIndividuals, + ) -> Event { //! Determine the next `Event` using the Gillespie algorithm. assert!((pop1 + pop2) > 0u64); let (kind, time) = self.next_event(pop1, pop2); Event { kind, time } } - fn next_event(&self, pop1: NbIndividuals, pop2: NbIndividuals) -> (AdvanceRun, GillespieTime) { + fn next_event( + &self, + pop1: NbIndividuals, + pop2: NbIndividuals, + ) -> (AdvanceRun, GillespieTime) { let (times, events) = self.compute_times_events(pop1, pop2); // Find the event that will occur next, corresponding to the smaller @@ -570,7 +570,7 @@ impl BirthDeathProcess { fn exprand(lambda: GillespieRate) -> f32 { //! Generates a random waiting time using the exponential waiting time with //! parameter `lambda` of Poisson StochasticProcess. - if abs_diff_eq!(lambda, 0_f32) { + if (lambda - 0_f32).abs() < f32::EPSILON { f32::INFINITY } else { // random number between (0, 1) @@ -607,7 +607,9 @@ pub fn fast_mean_computation( match event { AdvanceRun::Init => panic!("Cannot compute mean with Init event"), // The mean will be n_old * mean_old / (n_old + 1) - AdvanceRun::Proliferate2 => Some(previous_mean * ntot as f32 / (ntot as f32 + 1_f32)), + AdvanceRun::Proliferate2 => { + Some(previous_mean * ntot as f32 / (ntot as f32 + 1_f32)) + } // If the next event is the death of a NMinus cell there are two possible outcomes based // on the number of n cells in the previous iteration: 1. in the previous there was only @@ -649,7 +651,7 @@ mod tests { let lambda: GillespieRate = 0.3; let first = exprand(lambda); let second = exprand(lambda); - assert_abs_diff_ne!(first, second); + assert!((first - second).abs() > f32::EPSILON); let lambda: GillespieRate = 0_f32; let first = exprand(lambda); @@ -666,12 +668,20 @@ mod tests { let nb_individuals = 1f32; let next_event = AdvanceRun::Proliferate2; assert_eq!( - fast_mean_computation(previous_mean, &next_event, nb_individuals as u64), + fast_mean_computation( + previous_mean, + &next_event, + nb_individuals as u64 + ), Some(0.5f32) ); let next_event = AdvanceRun::Proliferate1; assert_eq!( - fast_mean_computation(previous_mean, &next_event, nb_individuals as u64), + fast_mean_computation( + previous_mean, + &next_event, + nb_individuals as u64 + ), None ); } @@ -681,37 +691,47 @@ mod tests { #[test] #[should_panic] fn test_gillespie_panics_with_no_individuals_lefth() { - let my_process = BirthDeathProcess::new(&Rates::new(&[1.], &[1.], &[0.], &[0.])); + let my_process = + BirthDeathProcess::new(&Rates::new(&[1.], &[1.], &[0.], &[0.])); my_process.gillespie(0, 0); } #[test] fn test_gillespie_returns_proliferate1() { - let my_process = BirthDeathProcess::new(&Rates::new(&[1.], &[0.], &[0.], &[0.])); + let my_process = + BirthDeathProcess::new(&Rates::new(&[1.], &[0.], &[0.], &[0.])); assert_eq!(AdvanceRun::Proliferate1, my_process.gillespie(1, 2).kind); } #[test] fn test_gillespie_returns_highly_probably_proliferate1() { - let my_process = BirthDeathProcess::new(&Rates::new(&[10000.], &[0.], &[0.01], &[0.])); + let my_process = BirthDeathProcess::new(&Rates::new( + &[10000.], + &[0.], + &[0.01], + &[0.], + )); assert_eq!(AdvanceRun::Proliferate1, my_process.gillespie(1, 2).kind); } #[test] fn test_gillespie_returns_highly_probably_proliferate2() { - let my_process = BirthDeathProcess::new(&Rates::new(&[0.], &[1.], &[0.0], &[0.])); + let my_process = + BirthDeathProcess::new(&Rates::new(&[0.], &[1.], &[0.0], &[0.])); assert_eq!(AdvanceRun::Proliferate2, my_process.gillespie(1, 2).kind); } #[test] fn test_gillespie_returns_die1() { - let my_process = BirthDeathProcess::new(&Rates::new(&[0.], &[0.], &[1.], &[0.])); + let my_process = + BirthDeathProcess::new(&Rates::new(&[0.], &[0.], &[1.], &[0.])); assert_eq!(my_process.gillespie(1, 2).kind, AdvanceRun::Die1); } #[test] fn test_gillespie_returns_die2() { - let my_process = BirthDeathProcess::new(&Rates::new(&[0.], &[0.], &[0.], &[1.])); + let my_process = + BirthDeathProcess::new(&Rates::new(&[0.], &[0.], &[0.], &[1.])); assert_eq!(AdvanceRun::Die2, my_process.gillespie(1, 2).kind); } diff --git a/ecdna-evo/src/lib.rs b/ecdna-evo/src/lib.rs index b659886..eafcb6c 100644 --- a/ecdna-evo/src/lib.rs +++ b/ecdna-evo/src/lib.rs @@ -12,7 +12,8 @@ //! # Simulation example //! The simulation of tumour growth using Gillespie algorithm. //! ```no_run -//! use ecdna_evo::{Dynamic, Dynamics, Parameters, Rates, Simulation}; +//! use ecdna_evo::dynamics::{Dynamic, Dynamics}; +//! use ecdna_evo::{Parameters, Rates, Simulation}; //! //! // Configure the simulation with parameters and rates //! // 1. parameters such as the number of iterations, number of cells @@ -20,7 +21,7 @@ //! // 2. rates of the process: the neutral dynamics (no selection) by default //! let rates = Rates::default(); //! -//! // Define the quantities to be simulated +//! // Define the quantities to be simulated (this is optional) //! let dynamics = Dynamics::from(vec![Dynamic::new(¶ms, "nplus")]); //! //! // Run the simulation @@ -42,13 +43,13 @@ //! //! // Load the patient's data used to run ABC, in this case only the ecDNA //! // distribution -//! let verbosity = 0u8; //! let patient = Patient::load( //! PatientPathsBuilder::default() //! .distribution("path2ecdna_distribution") //! .build() //! .unwrap(), -//! verbosity, +//! "PatientName", +//! params.verbosity, //! ); //! //! // Run the ABC inference to determine the parameters for the `patient` @@ -64,15 +65,14 @@ pub mod simulation; #[doc(inline)] pub use crate::abc::Patient; #[doc(inline)] -pub use crate::dynamics::{Dynamic, Dynamics, Update}; -#[doc(inline)] pub use crate::gillespie::{GillespieTime, NbIndividuals, Rates}; #[doc(inline)] -pub use crate::run::{DNACopy, EcDNADistribution, Parameters, Run}; +pub use crate::run::{DNACopy, Parameters, Run}; #[doc(inline)] -pub use crate::simulation::{Simulation, ToFile}; +pub use crate::simulation::Simulation; -#[macro_use] -extern crate approx; #[macro_use] extern crate derive_builder; +#[cfg(test)] +#[macro_use(quickcheck)] +extern crate quickcheck_macros; diff --git a/ecdna-evo/src/run.rs b/ecdna-evo/src/run.rs index 462c61b..21a78c8 100644 --- a/ecdna-evo/src/run.rs +++ b/ecdna-evo/src/run.rs @@ -1,13 +1,14 @@ use crate::abc::ABCRejection; -use crate::data::{Data, EcDNA, Entropy, Frequency, Mean}; +use crate::data::{Data, EcDNADistribution, Entropy, Frequency, Mean, ToFile}; use crate::dynamics::{Dynamics, Name, Update}; -use crate::gillespie::{AdvanceRun, BirthDeathProcess, Event, GetRates, GillespieTime}; -use crate::{NbIndividuals, Patient, Rates, ToFile}; -use rand::thread_rng; -use rand::Rng; +use crate::gillespie::{ + AdvanceRun, BirthDeathProcess, Event, GetRates, GillespieTime, +}; +use crate::{NbIndividuals, Patient, Rates}; +use rand::rngs::SmallRng; +use rand::{thread_rng, Rng}; use rand_distr::{Binomial, Distribution}; use std::collections::HashMap; -use std::ops::Deref; use std::path::{Path, PathBuf}; /// Number of ecDNA copies within a cell. We assume that a cell cannot have more @@ -16,137 +17,103 @@ pub type DNACopy = u16; /// Simulation of an exponentially growing tumour, that is one realization of /// the stochastic birth-death process. -pub struct Run { +/// +/// The `Run` uses the [typestate pattern]. The possible states are [`Started`] +/// and [`Ended`]. +/// +/// [typestate pattern]: https://github.com/cbiffle/m4vga-rs/blob/a1e2ba47eaeb4864f0d8b97637611d9460ce5c4d/notes/20190131-typestate.md +pub struct Run { + state: S, + /// Index of the run pub idx: usize, - /// Data of interest - pub data: Data, - /// Rates: the proliferation and death rates of the cells `NPlus` and - /// `NMninus` cells respectively. - rates: [f32; 4], /// Parameters to configure the run - parameters: Parameters, + pub parameters: Parameters, +} + +/// The simulation of the run has started, the stochastic birth-death process +/// has started looping over the iterations. +pub struct Started { + /// Stochastic process simulating tumour growth + process: BirthDeathProcess, + /// State of the system at one particular iteration + system: System, +} + +/// The simulation of the run has ended, which is ready to be saved. +pub struct Ended { + #[allow(dead_code)] // TODO restart run with two timepoints /// Gillespie time at the end of the run gillespie_time: GillespieTime, + #[allow(dead_code)] // TODO restart run with two timepoints /// Stopping condition stop: EndRun, + /// Data of interest for the last iteration + data: Data, + /// Rates: the proliferation and death rates of the cells `NPlus` and + /// `NMninus` cells respectively. + rates: [f32; 4], + /// The run idx from which the sample was taken. + sampled_run: Option, } -impl Run { - pub fn new(idx: usize, parameters: Parameters, rates: &Rates) -> InitializedRun { - //! Initialize the run with the `parameters` and the proliferation and - //! death rates. +pub trait RunState {} +impl RunState for Started {} +impl RunState for Ended {} - InitializedRun { +impl Run { + pub fn new(idx: usize, parameters: Parameters, rates: &Rates) -> Self { + //! Use the `parameters` and the `rates` to initialize a realization of + //! a birth-death stochastic process. + + Run { idx, - process: BirthDeathProcess::new(rates), - state: State { - nminus: parameters.init_nminus, - ecdna_distr: EcDNADistributionNPlus::new(¶meters), - event: Event { - kind: AdvanceRun::Init, - time: parameters.init_time, + parameters, + state: Started { + process: BirthDeathProcess::new(rates), + system: System { + nminus: parameters.init_nminus, + ecdna_distr: EcDNADistributionNPlus::new(¶meters), + event: Event { + kind: AdvanceRun::Init, + time: parameters.init_time, + }, }, }, - parameters, - } - } - - pub fn save( - self, - abspath: &Path, - dynamics: &Option, - patient: &Option, - ) -> SavedRun { - //! Save the run the dynamics (updated for each iter) if present and the - //! other quantities, computed at the end of the run such as the mean, - //! frequency. - let (idx, filename) = (self.idx, self.filename()); - let (abspath_d, abspath_abc) = (abspath.join("dynamics"), abspath.join("abc")); - - // 1. dynamics - if let Some(ref dynamics) = dynamics { - assert!(patient.is_none(), "Cannot have patient and dynamics"); - for d in dynamics.iter() { - let file2path = abspath_d.join(d.get_name()).join(filename.clone()); - d.save(&file2path).unwrap(); - } - } - - // save the ecDNA distribution, mean, entropy and the frequency for the - // last iteration. Do not save ecdna with abc program - self.data.save(abspath, &filename, patient.is_none()); - let saved_succeeded = { - // save but handle the case of abc where we save only - // if runs are similar - if let Some(patient) = &patient { - // ABC save the rates i.e. the proliferation and - // death rates - let (rates, filename) = (self.rates, self.filename()); - let results = ABCRejection::run(self, patient); - results.save(&filename, &abspath_abc, &rates) - } else { - true - } - }; - - SavedRun { - saved: saved_succeeded, - idx, } } - pub fn restart(self) -> InitializedRun { - //! Restart the simulation of the run setting the initial state to the - //! last state of `self`. - todo!() - } - - fn filename(&self) -> PathBuf { - //! File path for the current run (used to save data) - PathBuf::from(format!("{}.csv", self.idx)) - } -} - -pub struct InitializedRun { - /// Index of the run - pub idx: usize, - /// Stochastic process simulating tumour growth - process: BirthDeathProcess, - /// State of the system at one particular iteration - state: State, - /// Parameters to configure the run - parameters: Parameters, -} - -impl InitializedRun { - pub fn new(idx: usize, parameters: Parameters, rates: &Rates) -> Self { - let process: BirthDeathProcess = BirthDeathProcess::new(rates); - todo!() - } - pub fn get_nminus(&self) -> &NbIndividuals { - &self.state.nminus + //! Number of cells w/o any ecDNA copies for the current iteration. + &self.state.system.nminus } pub fn get_nplus(&self) -> NbIndividuals { - self.state.get_nplus_cells() + //! Number of cells w at least one ecDNA copy for the current iteration. + self.state.system.get_nplus_cells() } pub fn mean_ecdna(&self) -> f32 { + //! Average of ecDNA copy number distribution within the population for + //! the current iteration. self.state + .system .ecdna_distr .compute_mean(&(self.get_nplus() + self.get_nminus())) } pub fn variance_ecdna(&self, mean: &f32) -> f32 { - if self.state.ecdna_distr.is_empty() { - panic!("Cannot compute the variance of an empty ecDNA distribution") + //! Variance of ecDNA copy number distribution within the population for + //! the current iteration. + if self.state.system.ecdna_distr.is_empty() { + panic!( + "Cannot compute the variance of an empty ecDNA distribution" + ) } else { let nb_nplus = self.get_nplus(); let nb_nminus = self.get_nminus(); - self.state.ecdna_distr[..] + self.state.system.ecdna_distr[..] .iter() .chain(std::iter::repeat(&0u16).take(*nb_nminus as usize)) .map(|value| { @@ -159,15 +126,14 @@ impl InitializedRun { } pub fn get_gillespie_event(&self) -> &Event { - &self.state.event + &self.state.system.event } - pub fn simulate(mut self, mut dynamics: Option) -> Run { - //! Simulate the run (tumour growth) looping until the stop conditions - //! are reached. If the `measurement` overrides the `update` method in - //! `Measurable`, then it gets updated during the loop for each - //! iteration based on the state of the system at the current - //! iteration. + pub fn simulate(mut self, mut dynamics: Option) -> Run { + //! Simulate one realisation of the birth-death stochastic process. + //! + //! If the some `dynamics` are given, those quantities will be + //! calculated using the [`Update`] method. let mut iter = self.parameters.init_iter; let mut nplus = self.parameters.init_nplus as u64; let mut nminus = self.parameters.init_nminus; @@ -176,7 +142,7 @@ impl InitializedRun { loop { // Compute the next event using Gillespie algorithm based on the // stochastic process defined by `process` - let event = self.process.gillespie(nplus, nminus); + let event = self.state.process.gillespie(nplus, nminus); self.update(event, &mut dynamics); nplus = self.get_nplus(); nminus = *self.get_nminus(); @@ -189,54 +155,119 @@ impl InitializedRun { // (self.max_cells), i.e. when the iteration has // generated a tumor of self.max_cells size if nplus + nminus == 0u64 { - break (self.state.event.time, EndRun::NoIndividualsLeft); + break ( + self.state.system.event.time, + EndRun::NoIndividualsLeft, + ); }; if iter >= self.parameters.max_iter - 1usize { - break (self.state.event.time, EndRun::MaxItersReached); + break ( + self.state.system.event.time, + EndRun::MaxItersReached, + ); }; if nplus + nminus >= self.parameters.max_cells { - break (self.state.event.time, EndRun::MaxIndividualsReached); + break ( + self.state.system.event.time, + EndRun::MaxIndividualsReached, + ); } } }; + let ntot = nminus + nplus; - if let BirthDeathProcess::PureBirth(_) = &self.process { + + if let BirthDeathProcess::PureBirth(_) = &self.state.process { assert!(ntot > 0, "No cells found with PureBirth process") } - let mean = Mean(self.state.ecdna_distr.compute_mean(&ntot)); + let (idx, parameters, rates) = + (self.idx, self.parameters, self.state.process.get_rates()); + + let data = self.create_data(&ntot, &condition); - if nplus > 0 { - assert!(mean.0 > 0f32) + Run { + idx, + parameters, + state: Ended { + data, + rates, + gillespie_time: time, + stop: condition, + sampled_run: None, + }, } + } - let entropy = Entropy(self.state.ecdna_distr.compute_entropy(&ntot)); - let (ecdna, frequency) = ( - EcDNA( - self // add nminus cells to ecDNA distribution - .state - .ecdna_with_nminus_cells(), - ), - Frequency((nplus as f32) / (ntot as f32)), - ); + fn create_data( + self, + ntot: &NbIndividuals, + stop_condition: &EndRun, + ) -> Data { + let ecdna_distr = self.state.system.ecdna_distr.0; + let nplus = ecdna_distr.len() as NbIndividuals; + let mut counts: HashMap = HashMap::new(); + + if ecdna_distr.is_empty() { + if ntot == &0u64 { + match stop_condition { + EndRun::NoIndividualsLeft => { + // no cells left, return NAN + return Data { + ecdna: EcDNADistribution::from(counts), + mean: Mean(f32::NAN), + frequency: Frequency(f32::NAN), + entropy: Entropy(f32::NAN), + }; + } + _ => { + panic!( + "Found wrong value of ntot: found ntot = 0 but stop condition {:#?} instead of NoIndividualsLeft", + stop_condition + ); + } + } + } else { + // no nplus cells left, return 0 + return Data { + ecdna: EcDNADistribution::from(counts), + mean: Mean(0f32), + frequency: Frequency(0f32), + entropy: Entropy(0f32), + }; + } + } else if ntot == &0 { + panic!("Found wrong value of ntot: cannot create data when ntot is 0 and the ecDNA distribution is not empty") + } else if ntot < &(ecdna_distr.len() as NbIndividuals) { + panic!("Found wrong value of ntot: should be equal or greater than the number of cells w/ ecDNA, found smaller") + } - Run { - idx: self.idx, - data: Data::new(ecdna, mean, frequency, entropy), - rates: self.process.get_rates(), - parameters: self.parameters, - gillespie_time: time, - stop: condition, + let mut sum = 0u64; + for ecdna in ecdna_distr.into_iter() { + sum += ecdna as u64; + *counts.entry(ecdna).or_default() += 1; } + // add nminus cells + counts.insert(0u16, ntot - nplus); + + let ecdna = EcDNADistribution::from(counts); + assert_eq!(ecdna.nb_cells(), *ntot); + let entropy = Entropy::try_from(&ecdna).unwrap(); + // do not try_from to avoid traversing the distribution vector + let mean = Mean(sum as f32 / ecdna.nb_cells() as f32); + let frequency = Frequency::try_from(&ecdna).unwrap(); + + Data { ecdna, mean, frequency, entropy } } fn update(&mut self, next_event: Event, dynamics: &mut Option) { - //! Update the run according to the `next_event` sampled from Gillespie - //! algorithm. This updates also the measurements. + //! Update the run for the next iteration, according to the `next_event` + //! sampled from Gillespie algorithm. This updates also the `dynamics` + //! if present. if let AdvanceRun::Init = next_event.kind { panic!("Init state can only be used to initialize simulation!") } - self.state.update(next_event); + self.state.system.update(next_event); if let Some(dynamics) = dynamics { for d in dynamics.iter_mut() { // updates all dynamics according to the new state @@ -246,26 +277,135 @@ impl InitializedRun { } } -pub struct SavedRun { - /// Index of the run - pub idx: usize, - saved: bool, -} +impl Run { + pub fn save( + self, + abspath: &Path, + dynamics: &Option, + patient: &Option, + ) -> anyhow::Result<()> { + //! Save the run the dynamics (updated for each iter) if present and the + //! other quantities, computed at the end of the run such as the mean, + //! frequency. + //! + //! Do not save the full ecDNA distribution when ABC. + // idx of the run + let abspath_d = abspath.join("dynamics"); -impl SavedRun { - pub fn is_saved(self) -> bool { - self.saved + // 1. dynamics + if let Some(ref dynamics) = dynamics { + assert!(patient.is_none(), "Cannot have patient and dynamics"); + for d in dynamics.iter() { + let file2path = + abspath_d.join(d.get_name()).join(self.filename()); + d.save(&file2path).unwrap(); + } + } + + let abspath = { + if let Some(patient) = &patient { + abspath.join(patient.name.clone()) + } else { + abspath.to_owned() + } + }; + + if let Some(patient) = &patient { + let results = ABCRejection::run(&self, patient); + results.save(&abspath, &self.parameters.subsample) + } else { + self.state.data.save( + &abspath, + &self.filename(), + &self.parameters.subsample, + ); + Ok(()) + } + } + + pub fn undersample_ecdna( + &self, + nb_cells: &NbIndividuals, + rng: &mut SmallRng, + idx: usize, + ) -> Self { + //! Returns a copy of the run with subsampled ecDNA distribution + let data = + self.state.data.ecdna.undersample_data(nb_cells, rng).unwrap(); + + assert_eq!( + &data.ecdna.nb_cells(), + nb_cells, + "Wrong undersampling of the ecDNA distribution: {} cells expected after sampling, found {}, {:#?}", nb_cells, data.ecdna.nb_cells(), data.ecdna + ); + + Run { + idx, + parameters: self.parameters, + state: Ended { + data, + rates: self.state.rates, + gillespie_time: self.state.gillespie_time, + stop: self.state.stop, + sampled_run: Some(self.idx), + }, + } + } + + pub fn get_rates(&self) -> [f32; 4] { + //! f1, f2, d1, d2 + self.state.rates + } + + pub fn get_nb_cells(&self) -> NbIndividuals { + self.state.data.ecdna.nb_cells() + } + + pub fn get_ecdna(&self) -> &EcDNADistribution { + //! The ecDNA distribution for the last iteration. + &self.state.data.ecdna + } + + pub fn get_mean(&self) -> &f32 { + //! The mean of ecDNA distribution for the last iteration. + &self.state.data.mean + } + + pub fn get_frequency(&self) -> &f32 { + //! The frequency of cells w/ ecDNA for the last iteration. + &self.state.data.frequency + } + + pub fn get_entropy(&self) -> &f32 { + //! The entropy of the ecDNA distribution for the last iteration. + &self.state.data.entropy + } + + pub fn get_parental_run(&self) -> &Option { + //! The idx for the sampled run + &self.state.sampled_run + } + + pub fn restart(self) -> Run { + //! Restart the simulation of the run setting the initial state to the + //! last state of `self`. + todo!() + } + + fn filename(&self) -> PathBuf { + //! File path for the current run (used to save data) + PathBuf::from(self.idx.to_string()) } } #[derive(Debug)] -struct State { +struct System { nminus: NbIndividuals, ecdna_distr: EcDNADistributionNPlus, event: Event, } -impl State { +impl System { fn update(&mut self, event: Event) { //! Update the state of the system based on event sampled from Gillespie match event.kind { @@ -302,33 +442,12 @@ impl State { fn get_nplus_cells(&self) -> NbIndividuals { self.ecdna_distr.get_nplus_cells() } - - pub fn ecdna_with_nminus_cells(self) -> EcDNADistribution { - self.ecdna_distr - .0 - .into_iter() - .chain(std::iter::repeat(0u16).take(self.nminus as usize)) - .collect::>() - .into() - } } /// The distribution of ecDNA copies without considering the cells w/o any ecDNA /// copy. #[derive(Clone, Debug, Default)] -pub struct EcDNADistributionNPlus(Vec); - -impl From for EcDNADistributionNPlus { - fn from(distr: EcDNADistribution) -> Self { - EcDNADistributionNPlus( - distr - .0 - .into_iter() - .filter(|©| copy > 0u16) - .collect::>(), - ) - } -} +struct EcDNADistributionNPlus(Vec); impl std::ops::Index for EcDNADistributionNPlus where @@ -357,42 +476,6 @@ impl EcDNADistributionNPlus { self.0.is_empty() } - fn compute_entropy(&self, ntot: &NbIndividuals) -> f32 { - //! Compute entropy with a `ntot` number of individuals - -compute_counts(&self.0) - .values() - .map(|&count| { - let prob: f32 = (count as f32) / (*ntot as f32); - prob * prob.log2() - }) - .sum::() - } - - fn compute_mean(&self, ntot: &NbIndividuals) -> f32 { - if self.is_empty() { - if ntot == &0u64 { - f32::NAN - } else { - 0f32 - } - } else if ntot == &0 { - panic!("Cannot compute the mean of an when ntot is 0") - } else if ntot < &self.get_nplus_cells() { - panic!("Found wrong value of ntot: should be equal or greater than the number of cells w/ ecDNA, found smaller") - } else { - let sum = self[..] - .iter() - .fold(0u64, |accum, &item| accum + (item as u64)) as f32; - assert!(sum > 0.0); - let mean = sum / (*ntot as f32); - assert!( - !(mean.is_nan()), - "Compute mean NaN from ecdna distribution vector" - ); - mean - } - } - fn get_nplus_cells(&self) -> NbIndividuals { self.0.len() as NbIndividuals } @@ -425,7 +508,8 @@ impl EcDNADistributionNPlus { //! `NMinus` cell is born, and the other daughter cell got all the //! available ec_dna. let idx = self.pick_nplus_cell(); - let (daughter1, daughter2, uneven_segregation) = self.dna_segregation(idx); + let (daughter1, daughter2, uneven_segregation) = + self.dna_segregation(idx); if uneven_segregation { self.0[idx] = daughter1 + daughter2; // n + 0 = n = 0 + n @@ -459,7 +543,8 @@ impl EcDNADistributionNPlus { // draw the ec_dna that will be given to the daughter cells let bin = Binomial::new(available_dna as u64, 0.5).unwrap(); - let d_1: DNACopy = bin.sample(&mut rand::thread_rng()).try_into().unwrap(); + let d_1: DNACopy = + bin.sample(&mut rand::thread_rng()).try_into().unwrap(); let d_2: DNACopy = available_dna - d_1; assert_ne!(d_1 + d_2, 0); assert!( @@ -473,23 +558,31 @@ impl EcDNADistributionNPlus { (d_1, d_2, uneven) } -} -/// The distribution of ecDNA copies considering the cells w/o any ecDNA copy. -#[derive(Clone, PartialEq, Debug, Default)] -pub struct EcDNADistribution(Vec); - -impl From> for EcDNADistribution { - fn from(distr: Vec) -> Self { - EcDNADistribution(distr) - } -} - -impl Deref for EcDNADistribution { - type Target = Vec; - - fn deref(&self) -> &Self::Target { - &self.0 + fn compute_mean(&self, ntot: &NbIndividuals) -> f32 { + if self.is_empty() { + if ntot == &0u64 { + f32::NAN + } else { + 0f32 + } + } else if ntot == &0 { + panic!("Cannot compute the mean of an when ntot is 0") + } else if ntot < &self.get_nplus_cells() { + panic!("Found wrong value of ntot: should be equal or greater than the number of cells w/ ecDNA, found smaller") + } else { + let sum = self[..] + .iter() + .fold(0u64, |accum, &item| accum + (item as u64)) + as f32; + assert!(sum > 0.0); + let mean = sum / (*ntot as f32); + assert!( + !(mean.is_nan()), + "Compute mean NaN from ecdna distribution vector" + ); + mean + } } } @@ -502,15 +595,6 @@ enum Cell { NMinus, } -fn compute_counts(data: &[u16]) -> HashMap<&u16, u64> { - //! Compute how many times the elements of data appear in data. From - //! `` - let mut counts = HashMap::new(); - data.iter() - .for_each(|item| *counts.entry(item).or_default() += 1); - counts -} - #[derive(Debug, Clone, Copy)] /// Parameters used to simulate tumour growth, they are shared across runs. pub struct Parameters { @@ -535,6 +619,8 @@ pub struct Parameters { /// Initial gillespie time from which start the simulation (always 0 except /// some data with two measurements at two different timepoints) pub init_time: f32, + /// How many nplus cells to take while subsampling the ecDNA distribution + pub subsample: Option, pub verbosity: u8, } @@ -552,6 +638,7 @@ impl Default for Parameters { verbosity: 0u8, init_iter: 0usize, init_time: 0f32, + subsample: None, } } } @@ -585,28 +672,22 @@ mod tests { use super::*; #[test] - fn test_ecdna_distribution_is_empty_with_nminus_cells() { - let p = Parameters { - init_copies: 0u16, - ..Default::default() - }; + fn ecdna_distribution_is_empty_with_nminus_cells() { + let p = Parameters { init_copies: 0u16, ..Default::default() }; let ecdna = EcDNADistributionNPlus::new(&p); assert!(ecdna.is_empty()); } #[test] - fn test_ecdna_distribution_getter_when_empty() { - let p = Parameters { - init_copies: 0u16, - ..Default::default() - }; + fn ecdna_distribution_getter_when_empty() { + let p = Parameters { init_copies: 0u16, ..Default::default() }; let ecdna = EcDNADistributionNPlus::new(&p); assert_eq!(ecdna.get_nplus_cells(), 0u64); assert!(ecdna.is_empty()); } #[test] - fn test_ecdna_distribution_from_default() { + fn ecdna_distribution_from_default() { let p = Parameters::default(); let ecdna = EcDNADistributionNPlus::new(&p); assert_eq!(ecdna.get_nplus_cells(), 1u64); @@ -614,11 +695,8 @@ mod tests { } #[test] - fn test_ecdna_distribution_from() { - let p = Parameters { - init_copies: 1u16, - ..Default::default() - }; + fn ecdna_distribution_from() { + let p = Parameters { init_copies: 1u16, ..Default::default() }; let ecdna = EcDNADistributionNPlus::new(&p); assert_eq!(ecdna.get_nplus_cells(), 1); assert!(!ecdna.is_empty()); @@ -626,21 +704,15 @@ mod tests { #[test] #[should_panic] - fn test_ecdna_distribution_kill_nplus_empty() { - let p = Parameters { - init_copies: 0u16, - ..Default::default() - }; + fn ecdna_distribution_kill_nplus_empty() { + let p = Parameters { init_copies: 0u16, ..Default::default() }; let mut ecdna = EcDNADistributionNPlus::new(&p); ecdna.kill_nplus(); } #[test] - fn test_ecdna_distribution_kill_nplus_one_cell() { - let p = Parameters { - init_copies: 1u16, - ..Default::default() - }; + fn ecdna_distribution_kill_nplus_one_cell() { + let p = Parameters { init_copies: 1u16, ..Default::default() }; let mut ecdna = EcDNADistributionNPlus::new(&p); ecdna.kill_nplus(); assert!(ecdna.is_empty()); @@ -648,11 +720,8 @@ mod tests { } #[test] - fn test_ecdna_distribution_kill_nplus() { - let p = Parameters { - init_copies: 1u16, - ..Default::default() - }; + fn ecdna_distribution_kill_nplus() { + let p = Parameters { init_copies: 1u16, ..Default::default() }; let mut ecdna = EcDNADistributionNPlus::new(&p); ecdna.kill_nplus(); assert!(ecdna.is_empty()); @@ -661,21 +730,15 @@ mod tests { #[test] #[should_panic] - fn test_ecdna_distribution_proliferate_nplus_empty() { - let p = Parameters { - init_copies: 0u16, - ..Default::default() - }; + fn ecdna_distribution_proliferate_nplus_empty() { + let p = Parameters { init_copies: 0u16, ..Default::default() }; let mut ecdna = EcDNADistributionNPlus::new(&p); ecdna.proliferate_nplus(); } #[test] - fn test_ecdna_distribution_proliferate() { - let p = Parameters { - init_copies: 1u16, - ..Default::default() - }; + fn ecdna_distribution_proliferate() { + let p = Parameters { init_copies: 1u16, ..Default::default() }; let mut ecdna = EcDNADistributionNPlus::new(&p); ecdna.proliferate_nplus(); assert!(ecdna.get_nplus_cells() > 0); @@ -683,22 +746,16 @@ mod tests { #[test] #[should_panic] - fn test_ecdna_distribution_dna_segregation_overflow() { - let p = Parameters { - init_copies: u16::MAX, - ..Default::default() - }; + fn ecdna_distribution_dna_segregation_overflow() { + let p = Parameters { init_copies: u16::MAX, ..Default::default() }; let ecdna = EcDNADistributionNPlus::new(&p); ecdna.dna_segregation(0usize); } #[test] - fn test_ecdna_distribution_dna_segregation() { + fn ecdna_distribution_dna_segregation() { let init_copies = 2u16; - let p = Parameters { - init_copies, - ..Default::default() - }; + let p = Parameters { init_copies, ..Default::default() }; let ecdna = EcDNADistributionNPlus::new(&p); let (ecdna1, ecdna2, uneven) = ecdna.dna_segregation(0usize); assert_eq!( @@ -716,97 +773,79 @@ mod tests { #[test] #[should_panic] - fn test_compute_mean_wrong_ntot() { - let p = Parameters { - init_copies: 1u16, - ..Default::default() - }; + fn compute_mean_wrong_ntot() { + let p = Parameters { init_copies: 1u16, ..Default::default() }; EcDNADistributionNPlus::new(&p).compute_mean(&0); } #[test] - fn test_compute_mean_empty() { - let p = Parameters { - init_copies: 0u16, - ..Default::default() - }; - assert!((EcDNADistributionNPlus::new(&p).compute_mean(&1) - 0f32).abs() < f32::EPSILON); + fn compute_mean_empty() { + let p = Parameters { init_copies: 0u16, ..Default::default() }; + assert!( + (EcDNADistributionNPlus::new(&p).compute_mean(&1) - 0f32).abs() + < f32::EPSILON + ); } #[test] - fn test_compute_mean_empty_no_cells() { - let p = Parameters { - init_copies: 0u16, - ..Default::default() - }; + fn compute_mean_empty_no_cells() { + let p = Parameters { init_copies: 0u16, ..Default::default() }; assert!(EcDNADistributionNPlus::new(&p).compute_mean(&0).is_nan()); } #[test] #[should_panic] - fn test_compute_mean_no_cells() { - let p = Parameters { - init_copies: 1u16, - ..Default::default() - }; - assert_eq!(EcDNADistributionNPlus::new(&p).compute_mean(&0), f32::NAN); + fn compute_mean_no_cells() { + let p = Parameters { init_copies: 1u16, ..Default::default() }; + assert!(EcDNADistributionNPlus::new(&p).compute_mean(&0).is_nan()); } #[test] - fn test_compute_mean_with_1s() { - let p = Parameters { - init_copies: 1u16, - ..Default::default() - }; - assert!((EcDNADistributionNPlus::new(&p).compute_mean(&1) - 1f32).abs() < f32::EPSILON); + fn compute_mean_with_1s() { + let p = Parameters { init_copies: 1u16, ..Default::default() }; + assert!( + (EcDNADistributionNPlus::new(&p).compute_mean(&1) - 1f32).abs() + < f32::EPSILON + ); } #[test] - fn test_compute_mean_with_default() { - let p = Parameters { - init_copies: 1u16, - ..Default::default() - }; - assert!((EcDNADistributionNPlus::new(&p).compute_mean(&1) - 1f32).abs() < f32::EPSILON); + fn compute_mean_with_default() { + let p = Parameters { init_copies: 1u16, ..Default::default() }; + assert!( + (EcDNADistributionNPlus::new(&p).compute_mean(&1) - 1f32).abs() + < f32::EPSILON + ); } #[test] - fn test_compute_mean_with_1s_and_nminus() { - let p = Parameters { - init_copies: 1u16, - ..Default::default() - }; - assert!((EcDNADistributionNPlus::new(&p).compute_mean(&2) - 0.5f32).abs() < f32::EPSILON); + fn compute_mean_with_1s_and_nminus() { + let p = Parameters { init_copies: 1u16, ..Default::default() }; + assert!( + (EcDNADistributionNPlus::new(&p).compute_mean(&2) - 0.5f32).abs() + < f32::EPSILON + ); } #[test] - fn test_compute_mean_with_no_nminus() { - let p = Parameters { - init_copies: 2u16, - ..Default::default() - }; + fn compute_mean_with_no_nminus() { + let p = Parameters { init_copies: 2u16, ..Default::default() }; let mut distr = EcDNADistributionNPlus::new(&p); distr.0.push(4); assert!((distr.compute_mean(&2) - 3f32).abs() < f32::EPSILON) } #[test] - fn test_compute_mean_with_nminus() { - let p = Parameters { - init_copies: 1u16, - ..Default::default() - }; + fn compute_mean_with_nminus() { + let p = Parameters { init_copies: 1u16, ..Default::default() }; let mut distr = EcDNADistributionNPlus::new(&p); distr.0.push(2); assert!((distr.compute_mean(&3) - 1f32).abs() < f32::EPSILON); } #[test] - fn test_compute_mean_overflow() { - let p = Parameters { - init_copies: 1u16, - ..Default::default() - }; + fn compute_mean_overflow() { + let p = Parameters { init_copies: 1u16, ..Default::default() }; let mut distr = EcDNADistributionNPlus::new(&p); distr.0.push(u16::MAX); assert!(distr.get_nplus_cells() == 2u64); @@ -815,134 +854,12 @@ mod tests { } #[test] - fn test_compute_mean_overflow_nminus() { - let p = Parameters { - init_copies: 1u16, - ..Default::default() - }; + fn compute_mean_overflow_nminus() { + let p = Parameters { init_copies: 1u16, ..Default::default() }; let mut distr = EcDNADistributionNPlus::new(&p); distr.0.push(u16::MAX); assert!(distr.get_nplus_cells() == 2u64); let exp = ((1u64 + (u16::MAX as u64)) as f32) / 3f32; assert!((distr.compute_mean(&3) - exp).abs() < f32::EPSILON); } - - #[test] - fn test_from_vector() { - let original_data = vec![0u16, 2u16, 10u16]; - let ecdna = EcDNADistribution::from(original_data.clone()); - assert_eq!(ecdna.0, original_data); - } - - #[test] - fn test_entropy_higher_than_1() { - let original_data = EcDNADistribution::from(vec![1u16, 2u16, 10u16]); - let distr = EcDNADistributionNPlus::from(original_data); - assert!(distr.compute_entropy(&3) > 0f32); - } - - #[test] - fn test_entropy_0() { - let original_data = EcDNADistribution::from(vec![1u16, 1u16, 1u16, 1u16]); - let distr = EcDNADistributionNPlus::from(original_data); - assert!((distr.compute_entropy(&4) - 0f32).abs() < f32::EPSILON); - } - - #[test] - fn test_entropy_05() { - let original_data: EcDNADistribution = vec![1u16, 1u16, 2u16, 2u16].into(); - let distr = EcDNADistributionNPlus::from(original_data); - assert!((distr.compute_entropy(&4) - 1f32).abs() < f32::EPSILON); - } - - #[test] - fn test_from_vec_for_ecdna_distribution_empty() { - let my_vec = vec![]; - let dna = EcDNADistribution::from(my_vec); - assert!(dna.is_empty()); - } - - #[test] - fn test_from_vec_for_ecdna_distribution() { - let my_vec: EcDNADistribution = vec![1, 1, 2].into(); - let dna = EcDNADistributionNPlus::from(my_vec); - assert_eq!(dna.0, vec![1, 1, 2]); - assert_eq!(dna.get_nplus_cells(), 3); - } - - #[test] - fn test_from_vec_for_ecdna_distribution_zeros() { - let my_vec: EcDNADistribution = vec![1, 1, 2, 0, 0].into(); - let dna = EcDNADistributionNPlus::from(my_vec); - assert_eq!(dna.0, vec![1, 1, 2]); - assert_eq!(dna.get_nplus_cells(), 3); - } - - #[test] - fn test_compute_counts_empty() { - let empty = Vec::new(); - assert!(compute_counts(&empty).is_empty()) - } - - #[test] - fn test_compute_counts_1() { - let ones = vec![1u16, 1u16]; - let result = HashMap::from([(&1u16, 2u64)]); - assert_eq!(compute_counts(&ones), result); - } - - #[test] - fn test_compute_counts_1_2() { - let data = vec![1u16, 2u16]; - let result = HashMap::from([(&1u16, 1u64), (&2u16, 1u64)]); - assert_eq!(compute_counts(&data), result); - } - - #[test] - fn test_compute_counts() { - let data = vec![1u16, 2u16, 10u16]; - let result = HashMap::from([(&1u16, 1u64), (&2u16, 1u64), (&10u16, 1u64)]); - assert_eq!(compute_counts(&data), result); - } - - #[test] - fn test_state_from_ecdnanplus_to_ecdna_no_nminus() { - let expected = EcDNADistribution::from(vec![1, 0, 0]); - let p = Parameters { - init_copies: 1u16, - ..Default::default() - }; - - let ecdna_distr = EcDNADistributionNPlus::new(&p); - let state = State { - nminus: 2u64, - ecdna_distr, - event: Event { - kind: AdvanceRun::Proliferate1, - time: 2f32, - }, - }; - assert_eq!(state.ecdna_with_nminus_cells(), expected); - } - - #[test] - fn test_state_from_ecdnanplus_to_ecdna_nminus() { - let expected = EcDNADistribution::from(vec![2, 0, 0]); - let p = Parameters { - init_copies: 2u16, - init_nminus: 1u64, - ..Default::default() - }; - - let ecdna_distr = EcDNADistributionNPlus::new(&p); - let state = State { - nminus: 2u64, - ecdna_distr, - event: Event { - kind: AdvanceRun::Proliferate1, - time: 2f32, - }, - }; - assert_eq!(state.ecdna_with_nminus_cells(), expected); - } } diff --git a/ecdna-evo/src/simulation.rs b/ecdna-evo/src/simulation.rs index 469be09..e7c0a32 100644 --- a/ecdna-evo/src/simulation.rs +++ b/ecdna-evo/src/simulation.rs @@ -2,19 +2,17 @@ //! the ecDNA population dynamics, assuming an exponential growing population of //! tumour cells in a well-mixed environment. Simulation are carried out using //! the Gillespie algorithm. -use crate::dynamics::{Dynamic, Dynamics, Name}; +use crate::dynamics::{Dynamics, Name}; use crate::run::Run; use crate::{Parameters, Patient, Rates}; use anyhow::{anyhow, Context}; use chrono::Utc; -use enum_dispatch::enum_dispatch; use flate2::write::GzEncoder; use flate2::Compression; use indicatif::ParallelProgressIterator; use rayon::prelude::{IntoParallelIterator, ParallelIterator}; use std::env; use std::fs; -use std::io::{BufWriter, Write}; use std::path::{Path, PathBuf}; /// Perform multiple independent simulations of tumour growth in parallel using @@ -36,8 +34,37 @@ impl<'sim, 'run> Simulation { //! must be `Some` //! //! 2. ABC: `patient` must be `Some` and quantities must be `None`. - // If we are in abc, `quantities` will be None (because created from - // the data) and `data` will be Some + Simulation::info(¶meters); + + // create path: relative path (used to creating the tarball) and + // absolute path, for dynamics and abc + let experiment = Simulation::create_path(¶meters, &rates); + let (abspath, relapath) = ( + env::current_dir() + .unwrap() + .join("results") + .join(experiment.clone()), + PathBuf::from("results").join(experiment), + ); + + (0..parameters.nb_runs) + .into_par_iter() + .progress_count(parameters.nb_runs as u64) + .for_each(|idx| { + Run::new(idx, parameters, &rates) + .simulate(dynamics.clone()) + .save(&abspath, &dynamics, &patient) + .unwrap() + }); + + println!("{} End simulating {} runs", Utc::now(), parameters.nb_runs); + println!("{} Results saved in {:#?}", Utc::now(), abspath); + + Simulation::tarballs(¶meters, patient, relapath, dynamics); + Ok(()) + } + + fn info(parameters: &Parameters) { println!( "{} Simulating with {} cores {} runs, each of them with {} cells and max iterations {}", Utc::now(), @@ -58,124 +85,144 @@ impl<'sim, 'run> Simulation { parameters.nb_runs ); } + } - // create path: relative path (used to creating the tarball) and - // absolute path, for dynamics and abc - let (relapath, abspath) = Simulation::create_path(¶meters, &rates); - let (relapath_d, relapath_abc) = (relapath.join("dynamics"), relapath.join("abc")); - - let saved = (0..parameters.nb_runs) - .into_par_iter() - .progress_count(parameters.nb_runs as u64) - .map(|idx| { - Run::new(idx, parameters, &rates) - .simulate(dynamics.clone()) - .save(&abspath, &dynamics, &patient) - .is_saved() - }) - .collect::>(); - - println!("{} End simulating {} runs", Utc::now(), parameters.nb_runs); - - println!( - "{} Start compressing {} runs", - Utc::now(), - parameters.nb_runs - ); - - // Save tarballs from abc even if there aren't any saved runs for abc - if patient.is_some() { + fn tarballs( + parameters: &Parameters, + patient: Option, + relapath: PathBuf, + dynamics: Option, + ) { + // ABC + if let Some(ref p) = patient { + let relapath_abc = relapath.join(p.name.clone()); if parameters.verbosity > 0 { println!("{} Creating tarball for abc", Utc::now()); } - Simulation::compress_results("all_rates", &relapath_abc, parameters.verbosity) - .expect("Cannot compress all rates"); - Simulation::compress_results("metadata", &relapath_abc, parameters.verbosity) - .expect("Cannot compress metadata"); + // subsamples: save the rates and values + if parameters.subsample.is_some() { + Simulation::tarball_samples( + parameters, + &patient, + &relapath_abc, + ) + } else { + Simulation::tarball(parameters, &patient, &relapath_abc, None) + } + } else { + // dynamics + if dynamics.is_some() { + assert!( + patient.is_none(), + "Cannot have patient and dynamics at the same time" + ); + let relapath_d = relapath.join("dynamics"); + return Simulation::tarball( + parameters, + &None, + &relapath_d, + dynamics, + ); + } + + //subsamples: save the ecdna, mean, frequency and entropy for + //each subsample + if parameters.subsample.is_some() { + Simulation::tarball_samples(parameters, &patient, &relapath); + } - Simulation::compress_results("values", &relapath_abc, parameters.verbosity) - .expect("Cannot compress metadata values"); + Simulation::tarball(parameters, &None, &relapath, None); } + } - // compress runs into tarball only if there is at least one saved run - if saved.iter().any(|&saved_run| saved_run) { - if let Some(dynamics) = &dynamics { - for d in dynamics.iter() { - if parameters.verbosity > 0 { - println!("{} Creating tarball for dynamics", Utc::now()); - } - Simulation::compress_results(d.get_name(), &relapath_d, parameters.verbosity) - .expect("Cannot compress dynamics"); - } - } + fn tarball_samples( + parameters: &Parameters, + patient: &Option, + relapath: &Path, + ) { + // subsamples extras + let path2multiple_samples = relapath.join("samples"); + Simulation::tarball(parameters, patient, &path2multiple_samples, None); + } - // data is some means abc subcomand, that is only the rates must - // be saved - if patient.is_some() { + fn tarball( + parameters: &Parameters, + patient: &Option, + parent_dir: &Path, + dynamics: Option, + ) { + if let Some(dynamics) = &dynamics { + for d in dynamics.iter() { if parameters.verbosity > 0 { - println!("{} Creating tarball for abc", Utc::now()); + println!("{} Creating tarball for dynamics", Utc::now()); } - Simulation::compress_results("rates", &relapath_abc, parameters.verbosity) - .expect("Cannot compress rates"); + Simulation::compress_results( + d.get_name(), + parent_dir, + parameters.verbosity, + ) + .expect("Cannot compress dynamics"); } + } + + if patient.is_some() { + Simulation::compress_results( + "abc", + parent_dir, + parameters.verbosity, + ) + .expect("Cannot compress abc results"); + } else { for name in ["ecdna", "mean", "frequency", "entropy"] { if parameters.verbosity > 0 { println!("{} Creating tarball", Utc::now()); } - // do not save the ecdna distr when there is a patient (abc) - match Simulation::compress_results(name, &relapath, parameters.verbosity) { - Ok(_) => {} - Err(_) => { - if !((name == "ecdna") && patient.is_some()) { - panic!("Cannot compress {}", name); - } - } - } + Simulation::compress_results( + name, + parent_dir, + parameters.verbosity, + ) + .unwrap(); } } - - println!("{} End compressing {} runs", Utc::now(), parameters.nb_runs); - - println!( - "{} {} runs over total of {} were saved", - Utc::now(), - saved.iter().filter(|&saved_run| *saved_run).count(), - parameters.nb_runs - ); - Ok(()) } - fn create_path(parameters: &Parameters, rates: &Rates) -> (PathBuf, PathBuf) { + fn create_path(parameters: &Parameters, rates: &Rates) -> PathBuf { //! Resturns the paths where to store the data. The hashmap has keys as //! the quantities stored, and values as the dest and source of where to //! compress the data. - // create path where to store the results - let relative_path = - PathBuf::from("results").join(Simulation::create_path_helper(parameters, rates)); - let abspath = env::current_dir().unwrap().join(&relative_path); - - (relative_path, abspath) - } - - fn create_path_helper(parameters: &Parameters, rates: &Rates) -> PathBuf { format!( - "{}runs_{}cells_{}_{}1_{}2", - parameters.nb_runs, parameters.max_cells, rates.fitness1, rates.death1, rates.death2 + "{}runs_{}cells_{}_{}1_{}2_{}undersample", + parameters.nb_runs, + parameters.max_cells, + rates.fitness1, + rates.death1, + rates.death2, + parameters.subsample.unwrap_or_default() ) .into() } - fn compress_results(kind: &str, basepath: &Path, verbosity: u8) -> anyhow::Result<()> { + fn compress_results( + kind: &str, + basepath: &Path, + verbosity: u8, + ) -> anyhow::Result<()> { let dest = basepath.join(kind); let src = env::current_dir().unwrap().join(&dest); - Simulation::compress_dir(&dest, &src, verbosity) - .with_context(|| format!("Cannot compress {:#?} into {:#?}", &src, &dest))?; + Simulation::compress_dir(&dest, &src, verbosity).with_context( + || format!("Cannot compress {:#?} into {:#?}", &src, &dest), + )?; Ok(()) } - fn compress_dir(dest_path: &Path, src_path_dir: &Path, verbosity: u8) -> anyhow::Result<()> { + fn compress_dir( + dest_path: &Path, + src_path_dir: &Path, + verbosity: u8, + ) -> anyhow::Result<()> { //! Compress the directory where all runs are saved into tarball at the //! same level of the saved runs. let mut dest_path_archive = dest_path.to_owned(); @@ -187,7 +234,12 @@ impl<'sim, 'run> Simulation { .write(true) .create_new(true) .open(&dest_path_archive) - .with_context(|| format!("Error while opening the stream {:#?}", &dest_path_archive))?; + .with_context(|| { + format!( + "Error while opening the stream {:#?}", + &dest_path_archive + ) + })?; let enc = GzEncoder::new(tar_gz, Compression::default()); let mut tar = tar::Builder::new(enc); // append recursively all the runs into the archive that is first created in the @@ -223,40 +275,6 @@ pub fn find_nan(val: f32) -> anyhow::Result { Ok(val) } -/// Trait to write the data to file -#[enum_dispatch] -pub trait ToFile { - fn save(&self, path2file: &Path) -> anyhow::Result<()>; -} - -pub fn write2file( - data: &[T], - path: &Path, - header: Option<&str>, -) -> anyhow::Result<()> { - //! Write vector of float into new file with a precision of 4 decimals. - //! Write NAN if the slice to write to file is empty. - fs::create_dir_all(path.parent().unwrap()).expect("Cannot create dir"); - let f = fs::OpenOptions::new() - .read(true) - .append(true) - .create(true) - .open(path)?; - let mut buffer = BufWriter::new(f); - if !data.is_empty() { - if let Some(h) = header { - writeln!(buffer, "{}", h)?; - } - write!(buffer, "{:.4}", data.first().unwrap())?; - for ele in data.iter().skip(1) { - write!(buffer, ",{:.4}", ele)?; - } - } else { - write!(buffer, "{}", f32::NAN)?; - } - Ok(()) -} - #[cfg(test)] mod tests { use super::*; diff --git a/ecdna/src/app.rs b/ecdna/src/app.rs index 2e9a80e..5e6d000 100644 --- a/ecdna/src/app.rs +++ b/ecdna/src/app.rs @@ -1,7 +1,8 @@ use crate::clap_app::clap_app; use clap::ArgMatches; use ecdna_evo::abc::PatientPathsBuilder; -use ecdna_evo::{DNACopy, Dynamic, Dynamics, NbIndividuals, Parameters, Patient, Rates}; +use ecdna_evo::dynamics::{Dynamic, Dynamics}; +use ecdna_evo::{DNACopy, NbIndividuals, Parameters, Patient, Rates}; pub fn build_app() -> (Parameters, Rates, Option, Option) { //! Build the app by parsing CL arguments with `clap` to build the structs @@ -9,17 +10,18 @@ pub fn build_app() -> (Parameters, Rates, Option, Option) { let matches = clap_app().get_matches(); // Global arguments first (valid for all subcomands) - let nb_runs: usize = matches.value_of_t("runs").unwrap_or_else(|e| e.exit()); - let init_copies: DNACopy = matches - .value_of_t("init_copies") - .unwrap_or_else(|e| e.exit()); - let init_nplus: NbIndividuals = matches - .value_of_t("init_nplus") - .unwrap_or_else(|e| e.exit()); - let init_nminus: NbIndividuals = matches - .value_of_t("init_nminus") - .unwrap_or_else(|e| e.exit()); - let max_cells: NbIndividuals = matches.value_of_t("max_cells").unwrap_or_else(|e| e.exit()); + let nb_runs: usize = + matches.value_of_t("runs").unwrap_or_else(|e| e.exit()); + let init_copies: DNACopy = + matches.value_of_t("init_copies").unwrap_or_else(|e| e.exit()); + let init_nplus: NbIndividuals = + matches.value_of_t("init_nplus").unwrap_or_else(|e| e.exit()); + let init_nminus: NbIndividuals = + matches.value_of_t("init_nminus").unwrap_or_else(|e| e.exit()); + let max_cells: NbIndividuals = + matches.value_of_t("max_cells").unwrap_or_else(|e| e.exit()); + let subsample: Option = + matches.value_of_t("undersample").ok(); let max_iter: usize = 3 * max_cells as usize; let verbosity = { match matches.occurrences_of("v") { @@ -29,21 +31,22 @@ pub fn build_app() -> (Parameters, Rates, Option, Option) { } }; - let parameters = Parameters { - nb_runs, - max_cells, - max_iter, - init_copies, - init_nplus: init_nplus != 0, - init_nminus, - verbosity, - init_iter: 0usize, - init_time: 0f32, - }; - // Then other arguments based on the subcomand used - let (d, rates, patient_data) = match matches.subcommand() { + let (d, rates, patient_data, parameters) = match matches.subcommand() { Some(("simulate", simulate_matches)) => { + let parameters = Parameters { + nb_runs, + max_cells, + max_iter, + init_copies, + init_nplus: init_nplus != 0, + init_nminus, + verbosity, + init_iter: 0usize, + init_time: 0f32, + subsample, + }; + // Quantities of interest that changes for each iteration let d = create_dynamics(simulate_matches, ¶meters); // Rates of the two-type stochastic birth-death process @@ -56,16 +59,29 @@ pub fn build_app() -> (Parameters, Rates, Option, Option) { } } - (d, r, None) + (d, r, None, parameters) } Some(("abc", abc_matches)) => { + let parameters = Parameters { + nb_runs, + max_cells, + max_iter, + init_copies, + init_nplus: init_nplus != 0, + init_nminus, + verbosity, + init_iter: 0usize, + init_time: 0f32, + subsample, + }; + // Rates of the two-type stochastic birth-death process let r = rates_from_args(abc_matches); - let patient = create_patient(abc_matches, parameters.verbosity); + let patient = create_patient(abc_matches, verbosity); // cannot have quantities with abc - (None, r, patient) + (None, r, patient, parameters) } _ => unreachable!(), }; @@ -74,7 +90,10 @@ pub fn build_app() -> (Parameters, Rates, Option, Option) { (parameters, rates, d, patient_data) } -pub fn create_dynamics(matches: &ArgMatches, parameters: &Parameters) -> Option { +pub fn create_dynamics( + matches: &ArgMatches, + parameters: &Parameters, +) -> Option { //! Create dynamics parsing the CL arguments. Modify this function when //! adding new type of `Dynamic`. if parameters.verbosity > 1 { @@ -131,42 +150,38 @@ pub fn create_patient(matches: &ArgMatches, verbosity: u8) -> Option { paths.entropy(e); } - if let Some(_exp) = matches.value_of("exp_input") { - if verbosity > 1 { - println!("Add exponential to paths"); - } - todo!() - // paths.exponential(exp); - } + // if let Some(_exp) = matches.value_of("exp_input") { + // if verbosity > 1 { + // println!("Add exponential to paths"); + // } + // todo!() + // } if verbosity > 0 { println!("Paths to patient's input data: {:#?}", paths); } - Some(Patient::load(paths.build().unwrap(), verbosity)) + let name: String = matches.value_of_t("name").unwrap_or_else(|e| e.exit()); + let patient = Patient::load(paths.build().unwrap(), &name, verbosity); + if verbosity > 0 { + println!("Patient: {:#?}", patient); + } + + Some(patient) } fn rates_from_args(matches: &ArgMatches) -> Rates { // Proliferation rate of the cells w/ ecDNA - let fitness1: Vec = matches.values_of_t("selection").unwrap_or_else(|_| { - matches - .values_of_t("selection_range") - .unwrap_or_else(|e| e.exit()) - }); + let fitness1: Vec = + matches.values_of_t("selection").unwrap_or_else(|e| e.exit()); // Proliferation rate of the cells w/o ecDNA let fitness2 = vec![1f32]; // Death rate of cells w/ ecDNA - let death1: Vec = matches.values_of_t("death1").unwrap_or_else(|_| { - matches - .values_of_t("death1_range") - .unwrap_or_else(|e| e.exit()) - }); - let death2: Vec = matches.values_of_t("death2").unwrap_or_else(|_| { - matches - .values_of_t("death2_range") - .unwrap_or_else(|e| e.exit()) - }); + let death1: Vec = + matches.values_of_t("death1").unwrap_or_else(|e| e.exit()); + let death2: Vec = + matches.values_of_t("death2").unwrap_or_else(|e| e.exit()); Rates::new(&fitness1, &fitness2, &death1, &death2) } diff --git a/ecdna/src/clap_app.rs b/ecdna/src/clap_app.rs index cacf5f9..8f218ea 100644 --- a/ecdna/src/clap_app.rs +++ b/ecdna/src/clap_app.rs @@ -44,7 +44,16 @@ pub fn clap_app() -> App<'static> { .global(true) .takes_value(true), ) + .arg(Arg::new("undersample") + .long("undersample") + .short('u') + .help("Specify how many cells to consider while subsampling the ecDNA distribution") + .takes_value(true) + .global(true) + .validator(|cells| { if cells.parse::().unwrap() > 10000_usize { return Err(String::from("The maximal number of cells allowed to perform the ecDNA distribution subsampling must be smaller than 10001")) } Ok(()) } ), + ) .arg(Arg::new("v") + .long("verbosity") .short('v') .max_occurrences(3) .global(true) @@ -85,21 +94,27 @@ pub fn clap_app() -> App<'static> { ) .subcommand(App::new("abc") .about("Infer the most probable fitness and death coefficients from the real data using approximate Bayesian computation (ABC)") - .arg(Arg::new("selection_range") + .arg(Arg::new("name") + .long("name") + .required(true) + .help("Name of the experiment where to save the results") + .takes_value(true), + ) + .arg(Arg::new("selection") .long("selection-range") .max_values(2) .required(true) .help("If one value given, `abc` would not optimize over fitness hence simulations will have fixed fitness parameter. If two values are given, they represent the minimal and maximal value of fitness to test; 1 corresponds to neutral evolution") .takes_value(true), ) - .arg(Arg::new("death1_range") + .arg(Arg::new("death1") .long("death1-range") .max_values(2) .required(true) .help("If one value given, `abc` would not optimize over death1 hence simulations will have fixed death1 parameter. If two values are given, they represent the minimal and maximal value of death rate for the NPlus cells w/ ecdna") .takes_value(true), ) - .arg(Arg::new("death2_range") + .arg(Arg::new("death2") .long("death2-range") .max_values(2) .required(true) @@ -107,32 +122,32 @@ pub fn clap_app() -> App<'static> { .takes_value(true), ) .arg(Arg::new("distribution_input") - .long("distribution-input") + .long("distribution") .value_name("FILE") .help("Distribution of the copy number ecDNA of the patient, input FILE") .takes_value(true) .required(false), ) .arg(Arg::new("mean_input") - .long("mean-input") + .long("mean") .value_name("FILE") .help("Mean copy number ecDNA of the patient, input FILE") .takes_value(true) - .required(false), + .required_unless_present_any(&["distribution_input", "frequency_input", "entropy_input"]), ) .arg(Arg::new("frequency_input") - .long("frequency-input") + .long("frequency") .value_name("FILE") .help("Frequency of ecDNA carrying cells of the patient, input FILE") .takes_value(true) - .required(false), + .required_unless_present_any(&["distribution_input", "mean_input", "entropy_input"]), ) .arg(Arg::new("entropy_input") - .long("entropy-input") + .long("entropy") .value_name("FILE") .help("Entropy of ecDNA distribution considering cells w/o any ecDNA copies, input FILE") .takes_value(true) - .required(false), - ) + .required_unless_present_any(&["distribution_input", "mean_input", "frequency_input"]), + ), ) } diff --git a/ecdnaevo/app.py b/ecdnaevo/app.py index 6dc35f8..948749f 100644 --- a/ecdnaevo/app.py +++ b/ecdnaevo/app.py @@ -2,90 +2,137 @@ # coding: utf-8 import argparse +import pandas as pd +from collections import UserDict +from itertools import repeat from pathlib import Path +from dataclasses import dataclass +from ecdnaevo import plots -def build_app() -> (Path, Path, Path, int): - """Parse and returns the paths to: - 1. rates.tar.gz: the tarball where the proliferation and death rates are - stored only for the ABC runs that match the patient's data - 2. all_rates.tar.gz: same as above, but for all runs (not only the - matching ones) - 3. values.tar.gz: the tarball where the distances between the patient's - data and the runs are stored, for all runs - Returns also an integer representing the percentage for the threshold for - the runs accepted (e.g. 5 means plot rates for runs that have - a relative distance, stored in values.tar.gz, smaller or equal than 5%) +@dataclass +class App: """ + paths: MyPaths + thresholds: dict with the statistics as keys ("ecdna", "mean", "frequency", "entropy") and the values are the thresholds. A threshold indicate the minimal required difference between the run and the patient's data to accept the run in ABC. The thresholds for "mean", "frequency" and "entropy" are relative differences (abs(x - x_sim) / x), whereas the ks distance of the ecDNA distribution is absolute. + stats: plot several subplots for combinations of statistics + """ + + abc: Path + thresholds: plots.Thresholds + path2save: Path + stats: bool + verbosity: bool + + +def load(path2abc: Path, verbosity: bool) -> pd.DataFrame: + abc = pd.read_csv(path2abc, header=0, low_memory=False) + abc.drop(abc[abc.idx == "idx"].index, inplace=True) + abc.dropna(how="all", inplace=True) + abc.loc[:, "idx"] = abc.idx.astype("uint32") + # print(abc.loc[abc[abc.columns[0]].isna().index, :]) + # abc.loc[:, abc.columns[0]] = abc[abc.columns[0]].astype("uint32") + for col in abc.columns[2:]: + abc[col] = abc[col].astype("float") + if verbosity: + print(abc.head()) + return abc + + +def create_path2save(path2dir: Path, filename: Path) -> Path: + assert path2dir.is_dir() + path = path2dir / filename + path.touch(exist_ok=False) + return path + + +def run(app: App): + """Plot posterior distribution of the fitness coefficient""" + abc = load(app.abc, app.verbosity) + + if app.stats: + path2save = create_path2save( + app.path2save, + Path( + "{}relative_{}ecdna_fitness_raw_subplots.pdf".format( + app.thresholds["mean"], app.thresholds["ecdna"] + ) + ), + ) + plots.plot_posterior_per_stats(app.thresholds, abc, path2save) + else: + path2save = create_path2save( + app.path2save, + Path( + "{}relative_{}ecdna_fitness_raw.pdf".format( + app.thresholds["mean"], app.thresholds["ecdna"] + ) + ), + ) + plots.plot_post(app.thresholds, abc, path2save) + + +def build_app() -> App: + """Parse and returns the app""" # create the top-level parser parser = argparse.ArgumentParser( description="Plot posterior distribution (histograms) for ABC inference." ) + parser.add_argument( - "path2rates", - action="store", - metavar="FILE", - type=str, - help="Path to tarball file rates.tar.gz", + "--stats", + dest="stats", + required=False, + action="store_true", + default=False, + help="Plot posterior for combinations of statistics", ) - subparsers = parser.add_subparsers(title="subcomand") - - # create the parser for the "stats" command - parser_stats = subparsers.add_parser( - "stats", - help="Plot posterior distribution of the fitness coefficient for all combinations of statistics given a relative threshold", + parser.add_argument( + "threshold_relative", + type=int, + help="Integer specifying the relative difference percentage threshold for which the posterior will be plotted. This will will be used for the mean, frequency and the entropy not for the ecDNA distribution.", ) - parser_stats.add_argument( - "threshold", - type=int, - help="integer specifying the relative difference percentage threshold for which the posterior will be plotted", + parser.add_argument( + "threshold_ecdna", + type=float, + help="Float specifying the distance used to accept runs based on the ks distance of the ecDNA distribution", ) - parser_stats.add_argument( - "--all-rates", + parser.add_argument( + "--abc", metavar="FILE", - dest="path2all_rates", + dest="path2abc", required=True, type=str, - help="Path to tarball file all_rates.tar.gz", + help="Path to tarball file abc.tar.gz, where the output of the ABC inference can be found (csv files for each run)", ) - parser_stats.add_argument( - "--values", - dest="path2values", - metavar="FILE", - type=str, - required=True, - help="Path to tarball file values.tar.gz", + parser.add_argument( + "-v", + "--verbosity", + action="store_true", + default=False, + help="increase output verbosity", ) - # TODO remove? - # parser_stats.add_argument( - # "--metadata", - # dest="path2metadata", - # metavar="FILE", - # type=str, - # required=False, - # help="Path to tarball file metadata.tar.gz", - # ) - args = vars(parser.parse_args()) - try: - path2all_rates = Path(args["path2all_rates"]) - except KeyError: - return Path(args["path2rates"]), None, None, None - - try: - path2values = Path(args["path2values"]) - except KeyError: - raise parser.error("When `stats` is present, `--values` is required") - - try: - threshold = int(args["threshold"]) - except KeyError: - raise parser.error("When `stats` is present, the threshold must be specified") + thresholds = UserDict( + { + k: float(val / 100) + for (k, val) in zip( + {"mean", "frequency", "entropy"}, + repeat(int(args["threshold_relative"])), + ) + } + ) + thresholds["ecdna"] = float(args["threshold_ecdna"]) + abc = Path(args["path2abc"]) + stats = args["stats"] + assert abc.parent.is_dir() + # all thresholds are the same except for ecdna distribution + assert len({th for (k, th) in thresholds.items() if k != "ecdna"}) == 1 - return Path(args["path2rates"]), path2all_rates, path2values, threshold + return App(abc, plots.Thresholds(thresholds), abc.parent, stats, args["verbosity"]) diff --git a/ecdnaevo/load.py b/ecdnaevo/load.py deleted file mode 100644 index dc9ae65..0000000 --- a/ecdnaevo/load.py +++ /dev/null @@ -1,90 +0,0 @@ -import re -import tarfile -from pathlib import Path, PurePath -import pandas as pd - - -def find_metadata_files(path: Path) -> [Path]: - """From parent directory `path`, find all metdata tarballs in all subdirectories""" - try: - return find_files(path, "metadata.tar.gz") - except AssertionError as e: - raise ValueError(e) - - -def find_all_rates_files(path: Path) -> [Path]: - """From parent directory `path`, find all all_rates tarballs in all subdirectories""" - try: - return find_files(path, "all_rates.tar.gz") - except AssertionError as e: - raise ValueError(e) - - -def find_files(path: Path, filename: str) -> [Path]: - assert path.is_dir(), "{} is not a dir".format(path) - return [Path(adir) / filename for adir in path.iterdir()] - - -def infer_runs(path2file: str) -> int: - pattern_runs = re.compile(r"(\d+)runs_") - try: - return re.search(pattern_runs, path2file).group(1) - except AttributeError: - raise ValueError("Cannot infer the nb of runs from file '{}'".format(path2file)) - - -def infer_ground_truth_idx(path2file: str) -> int: - """From path2file extract the integer representing the idx of the ground truth run""" - pattern_runs = re.compile(r"_(\d+)idx") - processed_path = PurePath(path2file).parent.stem - try: - return int(re.search(pattern_runs, processed_path).group(1)) - except AttributeError: - raise ValueError( - "Cannot infer the idx of the ground truth from file '{}'".format(path2file) - ) - - -def load(path2data: Path, dtype: str): - """Create a dataframe with idx representing the runs idx of the ground truth used and the entries can be anything. Returns also the ground truth idx that is found in `path2data`. - `path2data` must end with _Xidx where X is any int - """ - return extract_tarball(path2data, dtype, True) - - -def extract_tarball(path2data: Path, dtype: str, header: bool = False): - """Create dataframe from tarball in `path2data`, where each member is a file with or without any `header`. All files (members) must have the same format (nb columns) and must only have at max 2 rows (in this case `header` must be true).""" - idx, data = [], [] - with tarfile.open(path2data, "r:gz") as tar: - for i, member in enumerate(tar.getmembers()): - # name of the member (filename) - - if member.isfile(): - run_idx = int(Path(member.name).stem) - idx.append(run_idx) - - f = tar.extractfile(member) - lines = f.readlines() - if header: - data.append( - { - k: float(val) - for (k, val) in zip( - lines[0].decode().strip("\n").split(","), - lines[1].decode().strip("\n").split(","), - ) - } - ) - else: - data.append(lines[0].decode().strip("\n").split(",")) - - print("Loaded {} files".format(i)) - idx = pd.Series(idx, dtype=int) - data = pd.DataFrame.from_records(data, index=idx).astype(dtype=dtype) - data.index.rename("run", inplace=True) - try: - gr_truth = infer_ground_truth_idx(path2data) - except ValueError: - gr_truth = None - - return data, gr_truth diff --git a/ecdnaevo/plots.py b/ecdnaevo/plots.py index 62c0d70..57c8be2 100644 --- a/ecdnaevo/plots.py +++ b/ecdnaevo/plots.py @@ -1,10 +1,16 @@ #!/usr/bin/env python # coding: utf-8 -from ecdnaevo import load +from typing import Tuple import matplotlib.pyplot as plt import numpy as np +from collections import UserDict +from typing import NewType from itertools import combinations -from pathlib import Path + +PLOTTING_STATS = {"ecdna": "D", "mean": "M", "frequency": "F", "entropy": "E"} +MYSHAPE = (len(PLOTTING_STATS), len(PLOTTING_STATS)) +# {"ecdna": float, "mean": float, "frequency", float, "entropy": float} +Thresholds = NewType("Thresholds", UserDict[str, float]) def accuracy(df, stats: list): @@ -22,98 +28,53 @@ def accuracy(df, stats: list): def plot_fitness(fitness, ax, title=None): - plot_rates(fitness, (1, 3), 20, ax, title) + plot_rates(fitness, (0.9, 3.1), 120, ax, title) def plot_death1(death1, ax, title=None): - plot_rates(death1, (0, 1), 20, ax, title) + plot_rates(death1, (0, 1.1), 40, ax, title) def plot_death2(death2, ax, title=None): - # path2save = load.path2save_from_rates(path2rates, "death2", title) - plot_rates(death2, (0, 1), 20, ax, title) + plot_rates(death2, (0, 1.1), 40, ax, title) -def plot_rates(rates, range_hist: (float, float), bins: int, ax, title=None): +def plot_rates(rates, range_hist: Tuple[float, float], bins: int, ax, title=None): """Modify inplace the ax by plotting the hist of the posterior distribution `rates`. Here `title` is the title of the axis `ax`.""" ax.hist(rates, bins=bins, range=range_hist, align="mid") ax.set_title(title) -def plot_posterior_per_stats(values, all_rates, thresholds, path2save_dir: Path): +def plot_posterior_per_stats(thresholds: Thresholds, abc, path2save): """Plot the posterior distribution for the fitness coefficient for each combination of statistics. - values: DataFrame with the runs idx as idx and the statistics as columns. + abc: DataFrame The entries are the values of the distance between the patient's data and the run, for each run. Distance is the (for now) relative difference - for mean, frequency, and entropy (abs(x - x_sim) / x), and for the - ecdna is the ks distance. - all_rates: DataFrame with runs idx as idx and the rates (f1, f2, d1, d2) - as columns. The entries are the values of the rates for each run. - thresholds: Dict with statistics as keys and values thresholds. The keys - must be the same as the columns of values - + for mean, frequency, and entropy, and for the ecdna is the ks distance. """ - assert not set(values.index) - set( - all_rates.index - ), "Found different idx in values and all_rates" - assert not set(thresholds.keys()) - set( - values.columns - ), "Thresholds keys do not match values columns" - assert path2save_dir.is_dir(), "`path2save` must be a directory" - - # assume all thresholds are the same except for ecdna distribution - relative_threshold = thresholds["mean"] - my_query = "" - stats = [] - - path2save_subplots = path2save_dir / "statistics_subplots" - if not path2save_subplots.exists(): - path2save_subplots.mkdir() - - myshape = (4, 4) - fig_tot, axes = plt.subplots(*myshape) - path2save_tot = path2save_dir / str( - "{}relative_fitness_raw_subplots".format(relative_threshold) + ".pdf" - ) + fig_tot, axes = plt.subplots(*MYSHAPE, sharex=True) + axes[0, 0].set_xlim([0.9, 3.1]) i = 0 - plotting_stats = {"ecdna": "D", "mean": "M", "frequency": "F", "entropy": "E"} # combinations of statistics for r in range(1, 5): # iter over the combinations for the_thresholds in combinations(thresholds.items(), r): # iter over the (stats, threshold) for each combination - for threshold in the_thresholds: - my_query += "({} < {}) & ".format(*threshold) - stats.append(threshold[0]) - my_query = my_query.rstrip(" & ") - print(my_query) - to_plot = all_rates.loc[values[stats].query(my_query).index].f1 - path2save = path2save_subplots / str( - "{}relative_fitness_raw_".format(relative_threshold) - + "_".join(list(stats)) - + ".pdf" - ) - - fig, ax = plt.subplots(1, 1) - plot_fitness(to_plot, ax, list(stats)) - fig.axes.append(ax) - fig.savefig(fname=path2save) - - ax = axes[np.unravel_index(i, myshape)] - plot_fitness(to_plot, ax, list(stats)) + ax = axes[np.unravel_index(i, MYSHAPE)] + plot_posterior(the_thresholds, abc, ax) clean = ",".join( [ - plotting_stats[stat] + PLOTTING_STATS[stat] for stat in "".join( [e for e in ax.get_title() if e not in {"[", "]", "'", " "}] ).split(",") ] ) - ax.set_title(clean, {"fontsize": 10}) + ax.tick_params(axis="both", size=2, which="major", labelsize=4) ax.tick_params( axis="both", size=2, @@ -123,64 +84,40 @@ def plot_posterior_per_stats(values, all_rates, thresholds, path2save_dir: Path) fig_tot.axes.append(ax) i += 1 - my_query = "" - stats = [] # last axis - ax = axes[np.unravel_index(i, myshape)] - ax.tick_params(axis="both", size=2, which="major", labelsize=4) + ax = axes[np.unravel_index(i, MYSHAPE)] + ax.axis("off") fig_tot.axes.append(ax) fig_tot.subplots_adjust(hspace=0.5) - fig_tot.savefig(fname=path2save_tot, bbox_inches="tight") + print("Saving figure to", path2save) + fig_tot.savefig(fname=path2save, bbox_inches="tight") + plt.close(fig_tot) + + +def plot_posterior(thresholds, abc, ax): + my_query = "" + stats = [] + for threshold in thresholds: + my_query += "(`{}` < {}) & ".format(*threshold) + stats.append(threshold[0]) + my_query = my_query.rstrip(" & ") + print(my_query) + to_plot = abc.loc[abc[stats].query(my_query).index].f1 + plot_fitness(to_plot, ax, list(stats)) + + +def plot_post(thresholds: Thresholds, abc, path2save): + fig, ax = plt.subplots(1, 1) + plot_posterior(thresholds.items(), abc, ax) + # ax.tick_params(axis="both", size=2, which="major", labelsize=4) + fig.axes.append(ax) + fig.subplots_adjust(hspace=0.5) + print("Saving figure to", path2save) + fig.savefig(fname=path2save, bbox_inches="tight") + plt.close(fig) if __name__ == "__main__": from ecdnaevo import app - path2rates, path2all_rates, path2values, threshold = app.build_app() - - # load ABC results - (rates, _) = load.extract_tarball(path2rates, float, False) - try: - (all_rates, _) = load.extract_tarball(path2all_rates, float, True) - except ValueError: - assert path2all_rates is None - try: - (values, _) = load.extract_tarball(path2values, float, True) - except ValueError: - assert path2all_rates is None - - # Plots - if path2values is not None: - if path2all_rates is not None: - relative_thr = threshold / 100 - thresholds = { - "ecdna": 0.02, - "entropy": relative_thr, - "mean": relative_thr, - "frequency": relative_thr, - } - plot_posterior_per_stats(values, all_rates, thresholds, path2rates.parent) - else: - raise ValueError( - "When path2values is set, path2all_rates should be set as well, found None" - ) - else: - path = Path(path2rates.parent / "fitness_raw.pdf") - print("Plotting fitness_raw and saving in", path) - fig, ax = plt.subplots(1, 1) - plt.savefig(fname=path) - plot_fitness(rates[0], ax, "fitness_raw") - fig.axes.append(ax) - fig.savefig(fname=path) - - path = path2rates.parent / "death1.pdf" - print("Plotting death1 and save in", path) - fig, ax = plt.subplots(1, 1) - plot_death1(rates[2], ax, "death1") - fig.savefig(fname=path) - - path = path2rates.parent / "death2.pdf" - print("Plotting death2 and saving in", path) - fig, ax = plt.subplots(1, 1) - plot_death2(rates[3], ax, "death2") - fig.savefig(fname=path) + app.run(app.build_app()) diff --git a/requirements.txt b/requirements.txt index 4a8880f..91699f9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,10 @@ pandas==1.3.5 python-dateutil==2.8.2 pytz==2021.3 six==1.16.0 +cycler==0.11.0 +fonttools==4.29.1 +kiwisolver==1.3.2 +matplotlib==3.5.1 +packaging==21.3 +Pillow==9.0.1 +pyparsing==3.0.7 diff --git a/results/README.md b/results/README.md deleted file mode 100644 index f9bdc21..0000000 --- a/results/README.md +++ /dev/null @@ -1 +0,0 @@ -Folder where to store the results of the simulations. diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..aa37a21 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,2 @@ +max_width = 79 +use_small_heuristics = "max"