From 05c4054b2f16ea49c04df767bbb875212f6001e9 Mon Sep 17 00:00:00 2001
From: Oliver Cheng
Date: Thu, 2 Jan 2025 10:47:04 +0800
Subject: [PATCH] Added filter settings to `index`

---
 Cargo.toml              |   1 +
 src/cli.rs              | 109 ++++++++++++++++++++++++++++++++++++----
 src/file.rs             |   1 +
 src/filter.rs           |  11 ++++
 src/index.rs            |  96 +++++++++++++++++++++++------------
 src/main.rs             |  17 ++++++-
 tests/correct/index.tsv |   3 +-
 7 files changed, 192 insertions(+), 46 deletions(-)
 create mode 100644 src/filter.rs

diff --git a/Cargo.toml b/Cargo.toml
index ad02a29..99dd4b3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -28,6 +28,7 @@ tempfile = "3.14.0"
 thiserror = "1.0.64"
 predicates = "3.1.2"
 itertools = "0.13.0"
+indoc = "2.0.5"
 
 [[bin]]
 name = "nailpolish"
diff --git a/src/cli.rs b/src/cli.rs
index a58bcbc..b0e365a 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -45,25 +45,45 @@ pub enum Commands {
         #[arg(long, default_value = "index.tsv")]
         index: String,
 
-        /// whether to use a file containing pre-clustered reads, with every line in one of two
-        /// formats:
-        /// READ_ID;BARCODE or,
-        /// READ_ID;BARCODE;UMI
-        #[arg(long)]
+        /// whether to use a file containing pre-clustered reads, with every line in one of two formats:
+        ///   1. READ_ID;BARCODE
+        ///   2. READ_ID;BARCODE;UMI
+        #[arg(long, verbatim_doc_comment)]
         clusters: Option<String>,
 
-        /// barcode regex format type, for custom header styles.
-        /// This will override the preset given. For example:
-        /// ^@([ATCG]{16})_([ATCG]{12})
-        /// for the BC-UMI preset.
-        #[arg(long)]
+        /// barcode regex format type, for custom header styles. this will override the preset given.
+        /// for example, for the `bc-umi` preset:
+        ///   ^([ATCG]{16})_([ATCG]{12})
+        #[arg(long, verbatim_doc_comment)]
         barcode_regex: Option<String>,
 
         /// skip, instead of error, on reads which are not accounted for:
         /// - if a cluster file is passed, any reads which are not in any cluster
         /// - if a barcode regex or preset is used (default), any reads which do not match the regex
-        #[arg(long)]
+        #[arg(long, verbatim_doc_comment)]
         skip_unmatched: bool,
+
+        /// filter read lengths to within the given float interval [a,b].
+        /// a is the minimum and b is the maximum (both inclusive).
+        /// alternatively, a can be `-inf` and b can be `inf`.
+        /// an unbounded interval (i.e. no length filter) is given by `0,inf`.
+        #[arg(
+            long,
+            value_parser = |x: &str| ArgInterval::try_from(x),
+            default_value = "0,15000",
+            verbatim_doc_comment
+        )]
+        len: ArgInterval,
+
+        /// filter average read quality to within the given float interval [a,b].
+        /// see the docs for `--len` for how to specify the interval.
+        #[arg(
+            long,
+            value_parser = |x: &str| ArgInterval::try_from(x),
+            default_value = "0,inf",
+            verbatim_doc_comment
+        )]
+        qual: ArgInterval,
     },
 
     /// Generate a summary of duplicate statistics from an index file
@@ -121,3 +141,70 @@ pub enum Commands {
         output: Option<String>,
     },
 }
+
+#[derive(Copy, Clone, Debug)]
+pub struct ArgInterval {
+    pub min: f64,
+    pub max: f64,
+}
+
+/// Error type for parsing an interval string.
+#[derive(Debug)]
+pub struct ParseIntervalErr(String);
+
+impl std::fmt::Display for ParseIntervalErr {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Invalid interval format: {}", self.0)
+    }
+}
+
+impl std::error::Error for ParseIntervalErr {}
+
+impl<'a> TryFrom<&'a str> for ArgInterval {
+    type Error = ParseIntervalErr;
+
+    fn try_from(arg: &'a str) -> Result<Self, Self::Error> {
+        let arg_lc = arg.to_lowercase();
+        let parts: Vec<&str> = arg_lc.split(',').collect();
+
+        if parts.len() != 2 {
+            return Err(ParseIntervalErr(indoc::formatdoc! {"
+                Expected format '<min>,<max>', got '{arg}'. The expected format is \
+                `a,b`, as in:
+                --len 0,15000
+                --len 0,inf
+                --len 100,15000
+            "}));
+        }
+
+        // Try to parse the minimum and maximum, handling unbounded cases.
+        let min = match parts[0].trim() {
+            "-inf" => f64::NEG_INFINITY,
+            s => s.parse::<f64>().map_err(|_| {
+                ParseIntervalErr(format!(
+                    "Invalid minimum value: '{}' (should be any float or `-inf`)",
+                    parts[0].trim()
+                ))
+            })?,
+        };
+
+        let max = match parts[1].trim() {
+            "inf" => f64::INFINITY,
+            s => s.parse::<f64>().map_err(|_| {
+                ParseIntervalErr(format!(
+                    "Invalid maximum value: '{}' (should be any float or `inf`)",
+                    parts[1].trim()
+                ))
+            })?,
+        };
+
+        Ok(ArgInterval { min, max })
+    }
+}
+
+impl ArgInterval {
+    pub fn contains(&self, v: f64) -> bool {
+        // both bounds are inclusive, matching the `--len`/`--qual` docs
+        (self.min <= v) && (v <= self.max)
+    }
+}
diff --git a/src/file.rs b/src/file.rs
index e49dc2e..217eefe 100644
--- a/src/file.rs
+++ b/src/file.rs
@@ -12,4 +12,5 @@ pub struct FastqFile {
     pub read_count: usize,
     pub avg_qual: f64,
     pub avg_len: f64,
+    pub filtered_reads: usize,
 }
diff --git a/src/filter.rs b/src/filter.rs
new file mode 100644
index 0000000..bf05f1f
--- /dev/null
+++ b/src/filter.rs
@@ -0,0 +1,11 @@
+use crate::cli::ArgInterval;
+use crate::io::Record;
+
+pub struct FilterOpts {
+    pub len: ArgInterval,
+    pub quality: ArgInterval,
+}
+
+pub fn filter(read: &Record, opts: &FilterOpts) -> bool {
+    opts.len.contains(read.len() as f64) && opts.quality.contains(read.phred_quality_avg())
+}
diff --git a/src/index.rs b/src/index.rs
index 8d9a9d8..6da5f4d 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -14,6 +14,7 @@ use thiserror::Error;
 
 use crate::duplicates::RecordIdentifier;
 use crate::file::FastqFile;
+use crate::filter::{filter, FilterOpts};
 use crate::io::Record;
 
 use tempfile::tempfile_in;
@@ -102,6 +103,7 @@ fn iter_lines_with_regex(
     re: &Regex,
     skip_invalid_ids: bool,
     mut info: FastqFile,
+    filter_opts: FilterOpts,
 ) -> Result<FastqFile> {
     // expected_len is used to ensure that every read has the same format
     let mut expected_len: Option<usize> = None;
@@ -122,34 +124,44 @@ fn iter_lines_with_regex(
         let file_len = sequence_rec.all().len() + 1;
         let mut rec = Record::try_from(sequence_rec)?;
 
-        match extract_bc_from_header(&rec.id, re, position) {
-            Ok((len, identifier)) => {
-                let expected_len = *expected_len.get_or_insert(len);
-
-                if expected_len != len {
-                    bail!(IndexGenerationErr::DifferentMatchCounts {
-                        header: rec.id,
-                        re: re.clone(),
-                        pos: position,
-                        count: len,
-                        expected: expected_len
-                    })
-                }
-
-                rec.id = identifier.to_string();
-
-                rec.write_index(wtr, position, file_len)?;
-                total_quality += rec.phred_quality_total();
-                total_len += rec.len();
-                info.matched_read_count += 1;
-            }
-            Err(e) => {
-                if !skip_invalid_ids {
-                    bail!(e)
-                }
-                info.unmatched_read_count += 1;
+        // apply any filters
+        let should_keep = filter(&rec, &filter_opts);
+        if !should_keep {
+            info.filtered_reads += 1;
+            continue;
+        }
+
+        let bc = extract_bc_from_header(&rec.id, re, position);
+
+        // if this did not succeed...
+        if let Err(e) = bc {
+            if !skip_invalid_ids {
+                bail!(e)
             }
-        };
+            info.unmatched_read_count += 1;
+            continue;
+        }
+
+        let (len, identifier) = bc?;
+
+        // check that the number of barcode groups is the same
+        let expected_len = *expected_len.get_or_insert(len);
+        if expected_len != len {
+            bail!(IndexGenerationErr::DifferentMatchCounts {
+                header: rec.id,
+                re: re.clone(),
+                pos: position,
+                count: len,
+                expected: expected_len
+            })
+        }
+
+        rec.id = identifier.to_string();
+
+        rec.write_index(wtr, position, file_len)?;
+        total_quality += rec.phred_quality_total();
+        total_len += rec.len();
+        info.matched_read_count += 1;
     }
 
     wtr.flush()?;
@@ -187,6 +199,7 @@ fn iter_lines_with_cluster_file(
     clusters: &mut Reader<File>,
     skip_invalid_ids: bool,
     mut info: FastqFile,
+    filter_opts: FilterOpts,
 ) -> Result<FastqFile> {
     // first, we will read the clusters file
     info!("Reading identifiers from clusters file...");
@@ -236,6 +249,13 @@ fn iter_lines_with_cluster_file(
         let file_len = sequence_rec.all().len() + 1;
         let mut rec = Record::try_from(sequence_rec)?;
+        // apply any filters
+        let should_keep = filter(&rec, &filter_opts);
+        if !should_keep {
+            info.filtered_reads += 1;
+            continue;
+        }
+
         let Some(identifier) = cluster_map.get(&rec.id) else {
             if !skip_invalid_ids {
                 bail!(RowNotInClusters { header: rec.id })
             }
@@ -335,6 +355,7 @@ pub fn construct_index(
     barcode_regex: &str,
     skip_unmatched: bool,
     clusters: &Option<String>,
+    filter_opts: FilterOpts,
 ) -> Result<()> {
     // time everything!
     let now = std::time::Instant::now();
@@ -374,7 +395,14 @@ pub fn construct_index(
     let re = Regex::new(barcode_regex)?;
     let mut result = match clusters {
         // no cluster file has been used
-        None => iter_lines_with_regex(reader, &mut wtr, &re, skip_unmatched, file_info),
+        None => iter_lines_with_regex(
+            reader,
+            &mut wtr,
+            &re,
+            skip_unmatched,
+            file_info,
+            filter_opts,
+        ),
 
         // cluster file is being used
         Some(filepath) => {
@@ -389,6 +417,7 @@ pub fn construct_index(
                 &mut cluster_rdr,
                 skip_unmatched,
                 file_info,
+                filter_opts,
             )
         }
     }?;
@@ -399,13 +428,16 @@ pub fn construct_index(
 
     // report results
     if skip_unmatched {
         info!(
-            "Stats: {} matched reads, {} unmatched reads, {:.1}s runtime",
-            result.matched_read_count, result.unmatched_read_count, result.elapsed,
+            "Stats: {} matched reads, {} unmatched reads, {} filtered reads, {:.1}s runtime",
+            result.matched_read_count,
+            result.unmatched_read_count,
+            result.filtered_reads,
+            result.elapsed,
         )
     } else {
         info!(
-            "Stats: {} reads, {:.1}s runtime",
-            result.matched_read_count, result.elapsed
+            "Stats: {} reads, {} filtered reads, {:.1}s runtime",
+            result.matched_read_count, result.filtered_reads, result.elapsed
         )
     }
diff --git a/src/main.rs b/src/main.rs
index d292b4d..7f2bdcc 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -17,6 +17,7 @@ mod call;
 mod cli;
 mod duplicates;
 mod file;
+mod filter;
 mod group;
 mod index;
 mod io;
@@ -70,6 +71,8 @@ fn try_main() -> Result<()> {
             barcode_regex,
             clusters,
             skip_unmatched,
+            len,
+            qual,
         } => {
             let barcode_regex = match barcode_regex {
                 Some(v) => {
@@ -83,7 +86,19 @@ fn try_main() -> Result<()> {
                 }
             };
 
-            index::construct_index(file, index, &barcode_regex, *skip_unmatched, clusters)?;
+            let filter_opts = filter::FilterOpts {
+                len: len.clone(),
+                quality: qual.clone(),
+            };
+
+            index::construct_index(
+                file,
+                index,
+                &barcode_regex,
+                *skip_unmatched,
+                clusters,
+                filter_opts,
+            )?;
             info!("Completed index generation to {index}");
         }
         Commands::Call {
diff --git a/tests/correct/index.tsv b/tests/correct/index.tsv
index afbea74..89cc9fb 100644
--- a/tests/correct/index.tsv
+++ b/tests/correct/index.tsv
@@ -1,4 +1,4 @@
-#{"nailpolish_version":"0.1.0","file_path":"/Users/olliecheng/Developer/WEHI/consensus-call/nailpolish/tests/data/scmixology2_sample.fastq","index_date":"2025-01-01T21:46:26.373596+08:00","elapsed":0.021128792,"gb":0.02817634679377079,"matched_read_count":14143,"unmatched_read_count":0,"read_count":14143,"avg_qual":22967.658700417167,"avg_len":1030.5793678851728}
+#{"nailpolish_version":"0.1.0","file_path":"/Users/olliecheng/Developer/WEHI/consensus-call/nailpolish/tests/data/scmixology2_sample.fastq","index_date":"2025-01-02T10:46:32.316021+08:00","elapsed":0.526168292,"gb":0.02817634679377079,"matched_read_count":14142,"unmatched_read_count":0,"read_count":14143,"avg_qual":21274.992292462168,"avg_len":1009.505232640362,"filtered_reads":1}
 id pos avg_qual n_bases rec_len
 TCTGGCTCATTCTCCG_GCAGCGAAGCCC 0 19.47 593 1264
 GCACTAAGTTGAAGTA_ATCAATGTATTG 1264 9.52 1655 3388
@@ -10862,7 +10862,6 @@ GATTCGATCAAGCCTA_TCCTCATTTGTC 22992098 18.4 1405 2888
 GATAGCTAGCAACAAT_TACCCTTTTCGT 22994986 23.55 2100 4278
 TTGAGTGCAGGGCTTC_AAGAAGTATTTC 22999264 21.21 772 1622
 GCCATGGTCGTCACCT_TTTACCGGAGTT 23000886 25.0 2720 5518
-TTTAGTCCAGCGGATA_AATACACGGTGT 23006404 80.12 299061 598200
 TTTAGTCCAGCGGATA_AATACACGGTGT 23604604 15.35 1026 2130
 GGAGGATTCTTCTAAC_TGTTCTTGAAGC 23606734 16.56 1128 2334
 GGAGGATTCTTCTAAC_TGTTCTTGAAGC 23609068 17.3 1178 2434
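
Not part of the patch: a minimal test sketch (module and test names are placeholders)
exercising the new `ArgInterval` parser and its inclusive `contains` check, matching the
`--len`/`--qual` interval semantics documented in `src/cli.rs`. It could sit at the bottom
of `src/cli.rs`; the explicit `TryFrom` import is only there in case the crate is not on
the 2021 edition.

#[cfg(test)]
mod arg_interval_tests {
    use super::ArgInterval;
    use std::convert::TryFrom;

    #[test]
    fn parses_bounded_and_unbounded_intervals() {
        // both endpoints are inclusive, so the default `--len 0,15000` keeps a
        // read that is exactly 15000 bases long
        let len = ArgInterval::try_from("100,15000").unwrap();
        assert!(len.contains(100.0));
        assert!(len.contains(15000.0));
        assert!(!len.contains(15000.5));

        // `inf` and `-inf` lift the upper and lower bounds respectively
        let qual = ArgInterval::try_from("0,inf").unwrap();
        assert!(qual.contains(1e9));

        let any = ArgInterval::try_from("-inf,inf").unwrap();
        assert!(any.contains(-42.0));
    }

    #[test]
    fn rejects_malformed_intervals() {
        // anything that is not exactly two comma-separated floats is an error
        assert!(ArgInterval::try_from("100").is_err());
        assert!(ArgInterval::try_from("low,high").is_err());
        assert!(ArgInterval::try_from("1,2,3").is_err());
    }
}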