Skip to content

Commit

Permalink
Add ambiguity expansion iterator (#46)
Browse files Browse the repository at this point in the history
  • Loading branch information
swooster authored Oct 17, 2023
1 parent cb2ee54 commit de251c7
Show file tree
Hide file tree
Showing 5 changed files with 459 additions and 0 deletions.
5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,8 @@ default = ["python-support"]
[[bench]]
name = "all_windows"
harness = false


[[bench]]
name = "expansions"
harness = false
153 changes: 153 additions & 0 deletions benches/expansions.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
// Copyright 2021-2023 SecureDNA Stiftung (SecureDNA Foundation) <[email protected]>
// SPDX-License-Identifier: MIT OR Apache-2.0

use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use quickdna::{
expansions::Expansions, BaseSequence, DnaSequenceStrict, Nucleotide, NucleotideAmbiguous,
};
use rand::{rngs::OsRng, seq::SliceRandom};

// Vec-based expansions code taken from DNA screening tools

const MAX_EXPANSIONS_FOR_WINDOW: usize = 16;

fn vec_based_expansions<const WINDOW_SIZE: usize>(
wildcards: [NucleotideAmbiguous; WINDOW_SIZE],
) -> Vec<[Nucleotide; WINDOW_SIZE]> {
if !are_expansions_feasible(wildcards) {
panic!(
"Too many possible expansions of window {:?}; ignoring window",
wildcards
);
}

let mut expanded_sequence_buffers: Vec<DnaSequenceStrict> =
vec![DnaSequenceStrict::new(vec![])];

for wildcard in wildcards {
let expansions = wildcard.possibilities();
if expansions.len() == 1 {
for sequence_buffer in expanded_sequence_buffers.iter_mut() {
sequence_buffer.push(expansions[0])
}
} else {
let mut new_buffers = Vec::new();
for nucleotide_from_expansion in expansions.iter() {
for existing_sequence_buffer in expanded_sequence_buffers.iter() {
let mut new_sequence_bufer = existing_sequence_buffer.clone();
new_sequence_bufer.push(*nucleotide_from_expansion);
new_buffers.push(new_sequence_bufer);
}
}
expanded_sequence_buffers = new_buffers;
}
}

expanded_sequence_buffers
.into_iter()
.map(|s| s.as_slice().try_into().unwrap())
.collect()
}

/// each expansion creates n variants
/// for example AAR becomes [AAA, AAG]
/// these expansions are exponential
/// AARR becomes [AAAA, AAAG, AAGA, AAGG] and so on
/// we do not want to generate more than a limited number of these
fn are_expansions_feasible(ws: impl IntoIterator<Item = NucleotideAmbiguous>) -> bool {
let mut acc = 1;
for w in ws {
acc *= w.possibilities().len();
if acc > MAX_EXPANSIONS_FOR_WINDOW {
return false;
}
}
acc <= MAX_EXPANSIONS_FOR_WINDOW
}

fn semi_ambiguous_dna(dna_len: usize, num_ambiguities: usize) -> Vec<NucleotideAmbiguous> {
let nucleotides = Nucleotide::ALL.map(|nuc| nuc.into());
let ambiguous_nucleotides = [
NucleotideAmbiguous::W,
NucleotideAmbiguous::M,
NucleotideAmbiguous::Y,
NucleotideAmbiguous::H,
NucleotideAmbiguous::R,
NucleotideAmbiguous::K,
NucleotideAmbiguous::D,
NucleotideAmbiguous::S,
NucleotideAmbiguous::V,
NucleotideAmbiguous::B,
NucleotideAmbiguous::N,
];
let mut dna: Vec<_> = (0..dna_len)
.map(|_| *nucleotides.choose(&mut OsRng).unwrap())
.collect();
for i in rand::seq::index::sample(&mut OsRng, dna_len, num_ambiguities) {
dna[i] = *ambiguous_nucleotides.choose(&mut OsRng).unwrap();
}
dna
}

pub fn criterion_benchmark(c: &mut Criterion) {
let num_windows = 1000;
let num_ambiguities = 2;
const WINDOW_LEN: usize = 42;
let windows: Vec<[_; WINDOW_LEN]> = (0..num_windows)
.map(|_| {
semi_ambiguous_dna(WINDOW_LEN, num_ambiguities)
.try_into()
.unwrap()
})
.collect();

let num_windows_desc = format!("{num_windows} windows");

let mut group = c.benchmark_group("Expansion window content");
group.throughput(Throughput::Elements(num_windows));
group.bench_with_input(
BenchmarkId::new("vec-based", &num_windows_desc),
&windows,
|b, windows| {
b.iter(|| {
for window in windows {
for expansion in vec_based_expansions(*window) {
black_box(expansion);
}
}
})
},
);
group.bench_with_input(
BenchmarkId::new("iter-based", &num_windows_desc),
&windows,
|b, windows| {
b.iter(|| {
for window in windows {
for expansion in Expansions::new(window) {
black_box(expansion);
}
}
})
},
);
group.finish();

let mut group = c.benchmark_group("Expansion window size-hint");
group.throughput(Throughput::Elements(num_windows));
group.bench_with_input(
BenchmarkId::new("iter-based", &num_windows_desc),
&windows,
|b, windows| {
b.iter(|| {
for window in windows {
black_box(Expansions::new(window).size_hint());
}
})
},
);
group.finish();
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
Loading

0 comments on commit de251c7

Please sign in to comment.