Skip to content

Commit

Permalink
user-selectable error k-mer percentile (#506)
Browse files Browse the repository at this point in the history
  • Loading branch information
hmusta authored Oct 25, 2024
1 parent 5af4aca commit 70fb7ff
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 4 deletions.
3 changes: 2 additions & 1 deletion metagraph/src/cli/clean.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ int clean_graph(Config *config) {

uint64_t cutoff
= estimate_min_kmer_abundance(*_graph, *node_weights,
config->num_singleton_kmers);
config->num_singleton_kmers,
config->cleaning_threshold_percentile);

if (cutoff != static_cast<uint64_t>(-1)) {
config->min_unitig_median_kmer_abundance = cutoff;
Expand Down
3 changes: 3 additions & 0 deletions metagraph/src/cli/config/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,8 @@ Config::Config(int argc, char *argv[]) {
min_tip_size = atoi(get_value(i++));
} else if (!strcmp(argv[i], "--prune-unitigs")) {
min_unitig_median_kmer_abundance = atoi(get_value(i++));
} else if (!strcmp(argv[i], "--cleaning-threshold-percentile")) {
fallback_abundance_cutoff = std::stod(get_value(i++));
} else if (!strcmp(argv[i], "--fallback")) {
fallback_abundance_cutoff = atoi(get_value(i++));
} else if (!strcmp(argv[i], "--smoothing-window")) {
Expand Down Expand Up @@ -1000,6 +1002,7 @@ if (advanced) {
fprintf(stderr, "\t --prune-tips [INT] \t\tprune all dead ends shorter than this value [1]\n");
fprintf(stderr, "\t --prune-unitigs [INT] \tprune all unitigs with median k-mer counts smaller\n"
"\t \t\tthan this value (0: auto) [1]\n");
fprintf(stderr, "\t --cleaning-theshold-percentile [FLOAT] the percentile of the k-mer count distribution to set as the cleaning threshold [0.001]\n");
fprintf(stderr, "\t --fallback [INT] \t\tfallback threshold if the automatic one cannot be\n"
"\t \t\tdetermined (-1: disables fallback) [1]\n");
fprintf(stderr, "\n");
Expand Down
1 change: 1 addition & 0 deletions metagraph/src/cli/config/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ class Config {
double alignment_min_exact_match = 0.7;
double min_fraction = 0.0;
double max_fraction = 1.0;
double cleaning_threshold_percentile = 0.001;
std::vector<double> count_slice_quantiles;
std::vector<double> count_quantiles;

Expand Down
9 changes: 7 additions & 2 deletions metagraph/src/graph/graph_cleaning.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,15 @@ bool is_unreliable_unitig(const std::vector<SequenceGraph::node_index> &path,


int cleaning_pick_kmer_threshold(const uint64_t *kmer_covg, size_t arrlen,
double fdr_thres,
double *alpha_est_ptr, double *beta_est_ptr,
double *false_pos_ptr, double *false_neg_ptr);

// returns -1 if the automatic estimation fails
uint64_t estimate_min_kmer_abundance(const DeBruijnGraph &graph,
const NodeWeights &node_weights,
uint64_t num_singleton_kmers) {
uint64_t num_singleton_kmers,
double cleaning_threshold_percentile) {
std::vector<uint64_t> hist;
graph.call_nodes([&](auto i) {
uint64_t kmer_count = node_weights[i];
Expand All @@ -60,6 +62,7 @@ uint64_t estimate_min_kmer_abundance(const DeBruijnGraph &graph,

double alpha_est_ptr, beta_est_ptr, false_pos_ptr, false_neg_ptr;
return cleaning_pick_kmer_threshold(hist.data(), hist.size(),
cleaning_threshold_percentile,
&alpha_est_ptr, &beta_est_ptr,
&false_pos_ptr, &false_neg_ptr);
}
Expand Down Expand Up @@ -201,11 +204,13 @@ static inline bool is_cutoff_good(const uint64_t *kmer_covg, size_t arrlen,
*
* @param kmer_covg Histogram of kmer counts at coverages 1,2,.. arrlen-1
* @param arrlen Length of array kmer_covg
* @param fdr_thes Percentile of the kmer coverage histogram to use as the error threshold
* @param alpha_est_ptr If not NULL, used to return estimate for alpha
* @param beta_est_ptr If not NULL, used to return estimate for beta
* @return -1 if no cut-off satisfies FDR, otherwise returns coverage cutoff
*/
int cleaning_pick_kmer_threshold(const uint64_t *kmer_covg, size_t arrlen,
double fdr_thres,
double *alpha_est_ptr, double *beta_est_ptr,
double *false_pos_ptr, double *false_neg_ptr)
{
Expand Down Expand Up @@ -285,7 +290,7 @@ int cleaning_pick_kmer_threshold(const uint64_t *kmer_covg, size_t arrlen,

// Find cutoff by finding first coverage level where errors make up less than
// 0.1% of total coverage
cutoff = pick_cutoff_with_fdr_thresh(e_covg, kmer_covg, arrlen, 0.001);
cutoff = pick_cutoff_with_fdr_thresh(e_covg, kmer_covg, arrlen, fdr_thres);
// printf("A cutoff: %i\n", cutoff);

// Pick highest cutoff that keeps FP < FN
Expand Down
3 changes: 2 additions & 1 deletion metagraph/src/graph/graph_cleaning.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ bool is_unreliable_unitig(const std::vector<SequenceGraph::node_index> &path,

uint64_t estimate_min_kmer_abundance(const DeBruijnGraph &graph,
const NodeWeights &node_weights,
uint64_t num_singleton_kmers = 0);
uint64_t num_singleton_kmers = 0,
double cleaning_threshold_percentile = 0.001);

} // namespace graph
} // namespace mtg
Expand Down

0 comments on commit 70fb7ff

Please sign in to comment.