Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

user-selectable error k-mer percentile #506

Merged
merged 1 commit into from
Oct 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion metagraph/src/cli/clean.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ int clean_graph(Config *config) {

uint64_t cutoff
= estimate_min_kmer_abundance(*_graph, *node_weights,
config->num_singleton_kmers);
config->num_singleton_kmers,
config->cleaning_threshold_percentile);

if (cutoff != static_cast<uint64_t>(-1)) {
config->min_unitig_median_kmer_abundance = cutoff;
Expand Down
3 changes: 3 additions & 0 deletions metagraph/src/cli/config/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,8 @@ Config::Config(int argc, char *argv[]) {
min_tip_size = atoi(get_value(i++));
} else if (!strcmp(argv[i], "--prune-unitigs")) {
min_unitig_median_kmer_abundance = atoi(get_value(i++));
} else if (!strcmp(argv[i], "--cleaning-threshold-percentile")) {
fallback_abundance_cutoff = std::stod(get_value(i++));
} else if (!strcmp(argv[i], "--fallback")) {
fallback_abundance_cutoff = atoi(get_value(i++));
} else if (!strcmp(argv[i], "--smoothing-window")) {
Expand Down Expand Up @@ -1000,6 +1002,7 @@ if (advanced) {
fprintf(stderr, "\t --prune-tips [INT] \t\tprune all dead ends shorter than this value [1]\n");
fprintf(stderr, "\t --prune-unitigs [INT] \tprune all unitigs with median k-mer counts smaller\n"
"\t \t\tthan this value (0: auto) [1]\n");
fprintf(stderr, "\t --cleaning-theshold-percentile [FLOAT] the percentile of the k-mer count distribution to set as the cleaning threshold [0.001]\n");
fprintf(stderr, "\t --fallback [INT] \t\tfallback threshold if the automatic one cannot be\n"
"\t \t\tdetermined (-1: disables fallback) [1]\n");
fprintf(stderr, "\n");
Expand Down
1 change: 1 addition & 0 deletions metagraph/src/cli/config/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ class Config {
double alignment_min_exact_match = 0.7;
double min_fraction = 0.0;
double max_fraction = 1.0;
double cleaning_threshold_percentile = 0.001;
std::vector<double> count_slice_quantiles;
std::vector<double> count_quantiles;

Expand Down
9 changes: 7 additions & 2 deletions metagraph/src/graph/graph_cleaning.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,15 @@ bool is_unreliable_unitig(const std::vector<SequenceGraph::node_index> &path,


int cleaning_pick_kmer_threshold(const uint64_t *kmer_covg, size_t arrlen,
double fdr_thres,
double *alpha_est_ptr, double *beta_est_ptr,
double *false_pos_ptr, double *false_neg_ptr);

// returns -1 if the automatic estimation fails
uint64_t estimate_min_kmer_abundance(const DeBruijnGraph &graph,
const NodeWeights &node_weights,
uint64_t num_singleton_kmers) {
uint64_t num_singleton_kmers,
double cleaning_threshold_percentile) {
std::vector<uint64_t> hist;
graph.call_nodes([&](auto i) {
uint64_t kmer_count = node_weights[i];
Expand All @@ -60,6 +62,7 @@ uint64_t estimate_min_kmer_abundance(const DeBruijnGraph &graph,

double alpha_est_ptr, beta_est_ptr, false_pos_ptr, false_neg_ptr;
return cleaning_pick_kmer_threshold(hist.data(), hist.size(),
cleaning_threshold_percentile,
&alpha_est_ptr, &beta_est_ptr,
&false_pos_ptr, &false_neg_ptr);
}
Expand Down Expand Up @@ -201,11 +204,13 @@ static inline bool is_cutoff_good(const uint64_t *kmer_covg, size_t arrlen,
*
* @param kmer_covg Histogram of kmer counts at coverages 1,2,.. arrlen-1
* @param arrlen Length of array kmer_covg
* @param fdr_thes Percentile of the kmer coverage histogram to use as the error threshold
* @param alpha_est_ptr If not NULL, used to return estimate for alpha
* @param beta_est_ptr If not NULL, used to return estimate for beta
* @return -1 if no cut-off satisfies FDR, otherwise returns coverage cutoff
*/
int cleaning_pick_kmer_threshold(const uint64_t *kmer_covg, size_t arrlen,
double fdr_thres,
double *alpha_est_ptr, double *beta_est_ptr,
double *false_pos_ptr, double *false_neg_ptr)
{
Expand Down Expand Up @@ -285,7 +290,7 @@ int cleaning_pick_kmer_threshold(const uint64_t *kmer_covg, size_t arrlen,

// Find cutoff by finding first coverage level where errors make up less than
// 0.1% of total coverage
cutoff = pick_cutoff_with_fdr_thresh(e_covg, kmer_covg, arrlen, 0.001);
cutoff = pick_cutoff_with_fdr_thresh(e_covg, kmer_covg, arrlen, fdr_thres);
// printf("A cutoff: %i\n", cutoff);

// Pick highest cutoff that keeps FP < FN
Expand Down
3 changes: 2 additions & 1 deletion metagraph/src/graph/graph_cleaning.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ bool is_unreliable_unitig(const std::vector<SequenceGraph::node_index> &path,

uint64_t estimate_min_kmer_abundance(const DeBruijnGraph &graph,
const NodeWeights &node_weights,
uint64_t num_singleton_kmers = 0);
uint64_t num_singleton_kmers = 0,
double cleaning_threshold_percentile = 0.001);

} // namespace graph
} // namespace mtg
Expand Down
Loading