From 05c4054b2f16ea49c04df767bbb875212f6001e9 Mon Sep 17 00:00:00 2001
From: Oliver Cheng
Date: Thu, 2 Jan 2025 10:47:04 +0800
Subject: [PATCH] Added filter settings to `index`

---
 Cargo.toml              |   1 +
 src/cli.rs              | 109 ++++++++++++++++++++++++++++++++++++----
 src/file.rs             |   1 +
 src/filter.rs           |  11 ++++
 src/index.rs            |  96 +++++++++++++++++++++++------------
 src/main.rs             |  17 ++++++-
 tests/correct/index.tsv |   3 +-
 7 files changed, 192 insertions(+), 46 deletions(-)
 create mode 100644 src/filter.rs

diff --git a/Cargo.toml b/Cargo.toml
index ad02a29..99dd4b3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -28,6 +28,7 @@ tempfile = "3.14.0"
 thiserror = "1.0.64"
 predicates = "3.1.2"
 itertools = "0.13.0"
+indoc = "2.0.5"
 
 [[bin]]
 name = "nailpolish"
diff --git a/src/cli.rs b/src/cli.rs
index a58bcbc..b0e365a 100644
--- a/src/cli.rs
+++ b/src/cli.rs
@@ -45,25 +45,45 @@ pub enum Commands {
         #[arg(long, default_value = "index.tsv")]
         index: String,
 
-        /// whether to use a file containing pre-clustered reads, with every line in one of two
-        /// formats:
-        /// READ_ID;BARCODE or,
-        /// READ_ID;BARCODE;UMI
-        #[arg(long)]
+        /// whether to use a file containing pre-clustered reads, with every line in one of two formats:
+        ///   1. READ_ID;BARCODE
+        ///   2. READ_ID;BARCODE;UMI
+        #[arg(long, verbatim_doc_comment)]
         clusters: Option<String>,
 
-        /// barcode regex format type, for custom header styles.
-        /// This will override the preset given. For example:
-        /// ^@([ATCG]{16})_([ATCG]{12})
-        /// for the BC-UMI preset.
-        #[arg(long)]
+        /// barcode regex format type, for custom header styles. this will override the preset given.
+        /// for example, for the `bc-umi` preset:
+        ///   ^([ATCG]{16})_([ATCG]{12})
+        #[arg(long, verbatim_doc_comment)]
         barcode_regex: Option<String>,
 
         /// skip, instead of error, on reads which are not accounted for:
         /// - if a cluster file is passed, any reads which are not in any cluster
         /// - if a barcode regex or preset is used (default), any reads which do not match the regex
-        #[arg(long)]
+        #[arg(long, verbatim_doc_comment)]
         skip_unmatched: bool,
+
+        /// filter read lengths to within the given float interval [a,b].
+        /// a is the minimum and b is the maximum (both inclusive).
+        /// alternatively, a can be `-inf` and b can be `inf`.
+        /// an unbounded interval (i.e. no length filter) is given by `0,inf`.
+        #[arg(
+            long,
+            value_parser = |x: &str| ArgInterval::try_from(x),
+            default_value = "0,15000",
+            verbatim_doc_comment
+        )]
+        len: ArgInterval,
+
+        /// filter average read quality to within the given float interval [a,b].
+        /// see the docs for `--len` for how to specify the interval.
+        #[arg(
+            long,
+            value_parser = |x: &str| ArgInterval::try_from(x),
+            default_value = "0,inf",
+            verbatim_doc_comment
+        )]
+        qual: ArgInterval,
     },
 
     /// Generate a summary of duplicate statistics from an index file
@@ -121,3 +141,70 @@ pub enum Commands {
         output: Option<String>,
     },
 }
+
+#[derive(Copy, Clone, Debug)]
+pub struct ArgInterval {
+    pub min: f64,
+    pub max: f64,
+}
+
+/// Error type for parsing an interval string.
+#[derive(Debug)]
+pub struct ParseIntervalErr(String);
+
+impl std::fmt::Display for ParseIntervalErr {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Invalid interval format: {}", self.0)
+    }
+}
+
+impl std::error::Error for ParseIntervalErr {}
+
+impl<'a> TryFrom<&'a str> for ArgInterval {
+    type Error = ParseIntervalErr;
+
+    fn try_from(arg: &'a str) -> Result<Self, Self::Error> {
+        let arg_lc = arg.to_lowercase();
+        let parts: Vec<&str> = arg_lc.split(',').collect();
+
+        if parts.len() != 2 {
+            return Err(ParseIntervalErr(indoc::formatdoc! {"
+                Expected format '<min>,<max>', got '{arg}'. The expected format is \
+                `a,b`, as in:
+                --len 0,15000
+                --len 0,inf
+                --len 100,15000
+            "}));
+        }
+
+        // Try to parse the minimum and maximum, handling unbounded cases.
+        let min = match parts[0].trim() {
+            "-inf" => f64::NEG_INFINITY,
+            s => s.parse::<f64>().map_err(|_| {
+                ParseIntervalErr(format!(
+                    "Invalid minimum value: '{}' (should be any float or `-inf`)",
+                    parts[0].trim()
+                ))
+            })?,
+        };
+
+        let max = match parts[1].trim() {
+            "inf" => f64::INFINITY,
+            s => s.parse::<f64>().map_err(|_| {
+                ParseIntervalErr(format!(
+                    "Invalid maximum value: '{}' (should be any float or `inf`)",
+                    parts[1].trim()
+                ))
+            })?,
+        };
+
+        Ok(ArgInterval { min, max })
+    }
+}
+
+impl ArgInterval {
+    pub fn contains(&self, v: f64) -> bool {
+        // both bounds are inclusive, matching the `--len`/`--qual` docs
+        (self.min <= v) && (v <= self.max)
+    }
+}
diff --git a/src/file.rs b/src/file.rs
index e49dc2e..217eefe 100644
--- a/src/file.rs
+++ b/src/file.rs
@@ -12,4 +12,5 @@ pub struct FastqFile {
     pub read_count: usize,
     pub avg_qual: f64,
     pub avg_len: f64,
+    pub filtered_reads: usize,
 }
diff --git a/src/filter.rs b/src/filter.rs
new file mode 100644
index 0000000..bf05f1f
--- /dev/null
+++ b/src/filter.rs
@@ -0,0 +1,11 @@
+use crate::cli::ArgInterval;
+use crate::io::Record;
+
+pub struct FilterOpts {
+    pub len: ArgInterval,
+    pub quality: ArgInterval,
+}
+
+pub fn filter(read: &Record, opts: &FilterOpts) -> bool {
+    opts.len.contains(read.len() as f64) && opts.quality.contains(read.phred_quality_avg())
+}
diff --git a/src/index.rs b/src/index.rs
index 8d9a9d8..6da5f4d 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -14,6 +14,7 @@ use thiserror::Error;
 
 use crate::duplicates::RecordIdentifier;
 use crate::file::FastqFile;
+use crate::filter::{filter, FilterOpts};
 use crate::io::Record;
 
 use tempfile::tempfile_in;
@@ -102,6 +103,7 @@ fn iter_lines_with_regex(
     re: &Regex,
     skip_invalid_ids: bool,
     mut info: FastqFile,
+    filter_opts: FilterOpts,
 ) -> Result<FastqFile> {
     // expected_len is used to ensure that every read has the same format
     let mut expected_len: Option<usize> = None;
@@ -122,34 +124,44 @@ fn iter_lines_with_regex(
         let file_len = sequence_rec.all().len() + 1;
         let mut rec = Record::try_from(sequence_rec)?;
 
-        match extract_bc_from_header(&rec.id, re, position) {
-            Ok((len, identifier)) => {
-                let expected_len = *expected_len.get_or_insert(len);
-
-                if expected_len != len {
-                    bail!(IndexGenerationErr::DifferentMatchCounts {
-                        header: rec.id,
-                        re: re.clone(),
-                        pos: position,
-                        count: len,
-                        expected: expected_len
-                    })
-                }
-
-                rec.id = identifier.to_string();
-
-                rec.write_index(wtr, position, file_len)?;
-                total_quality += rec.phred_quality_total();
-                total_len += rec.len();
-                info.matched_read_count += 1;
-            }
-            Err(e) => {
-                if !skip_invalid_ids {
-                    bail!(e)
-                }
-                info.unmatched_read_count += 1;
+        // apply any filters
+        let should_keep = filter(&rec, &filter_opts);
+        if !should_keep {
+            info.filtered_reads += 1;
+            continue;
+        }
+
+        let bc = extract_bc_from_header(&rec.id, re, position);
+
+        // if this did not succeed...
+        if let Err(e) = bc {
+            if !skip_invalid_ids {
+                bail!(e)
             }
-        };
+            info.unmatched_read_count += 1;
+            continue;
+        }
+
+        let (len, identifier) = bc?;
+
+        // check that the number of barcode groups is the same
+        let expected_len = *expected_len.get_or_insert(len);
+        if expected_len != len {
+            bail!(IndexGenerationErr::DifferentMatchCounts {
+                header: rec.id,
+                re: re.clone(),
+                pos: position,
+                count: len,
+                expected: expected_len
+            })
+        }
+
+        rec.id = identifier.to_string();
+
+        rec.write_index(wtr, position, file_len)?;
+        total_quality += rec.phred_quality_total();
+        total_len += rec.len();
+        info.matched_read_count += 1;
     }
 
     wtr.flush()?;
@@ -187,6 +199,7 @@ fn iter_lines_with_cluster_file(
     clusters: &mut Reader<File>,
     skip_invalid_ids: bool,
     mut info: FastqFile,
+    filter_opts: FilterOpts,
 ) -> Result<FastqFile> {
     // first, we will read the clusters file
     info!("Reading identifiers from clusters file...");
@@ -236,6 +249,13 @@ fn iter_lines_with_cluster_file(
         let file_len = sequence_rec.all().len() + 1;
         let mut rec = Record::try_from(sequence_rec)?;
+        // apply any filters
+        let should_keep = filter(&rec, &filter_opts);
+        if !should_keep {
+            info.filtered_reads += 1;
+            continue;
+        }
+
         let Some(identifier) = cluster_map.get(&rec.id) else {
             if !skip_invalid_ids {
                 bail!(RowNotInClusters { header: rec.id })
             }
@@ -335,6 +355,7 @@ pub fn construct_index(
     barcode_regex: &str,
     skip_unmatched: bool,
     clusters: &Option<String>,
+    filter_opts: FilterOpts,
 ) -> Result<()> {
     // time everything!
     let now = std::time::Instant::now();
@@ -374,7 +395,14 @@ pub fn construct_index(
     let re = Regex::new(barcode_regex)?;
     let mut result = match clusters {
         // no cluster file has been used
-        None => iter_lines_with_regex(reader, &mut wtr, &re, skip_unmatched, file_info),
+        None => iter_lines_with_regex(
+            reader,
+            &mut wtr,
+            &re,
+            skip_unmatched,
+            file_info,
+            filter_opts,
+        ),
 
         // cluster file is being used
         Some(filepath) => {
@@ -389,6 +417,7 @@ pub fn construct_index(
                 &mut cluster_rdr,
                 skip_unmatched,
                 file_info,
+                filter_opts,
             )
         }
     }?;
@@ -399,13 +428,16 @@ pub fn construct_index(
 
     // report results
     if skip_unmatched {
         info!(
-            "Stats: {} matched reads, {} unmatched reads, {:.1}s runtime",
-            result.matched_read_count, result.unmatched_read_count, result.elapsed,
+            "Stats: {} matched reads, {} unmatched reads, {} filtered reads, {:.1}s runtime",
+            result.matched_read_count,
+            result.unmatched_read_count,
+            result.filtered_reads,
+            result.elapsed,
         )
     } else {
         info!(
-            "Stats: {} reads, {:.1}s runtime",
-            result.matched_read_count, result.elapsed
+            "Stats: {} reads, {} filtered reads, {:.1}s runtime",
+            result.matched_read_count, result.filtered_reads, result.elapsed
         )
     }
diff --git a/src/main.rs b/src/main.rs
index d292b4d..7f2bdcc 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -17,6 +17,7 @@ mod call;
 mod cli;
 mod duplicates;
 mod file;
+mod filter;
 mod group;
 mod index;
 mod io;
@@ -70,6 +71,8 @@ fn try_main() -> Result<()> {
             barcode_regex,
             clusters,
             skip_unmatched,
+            len,
+            qual,
         } => {
             let barcode_regex = match barcode_regex {
                 Some(v) => {
@@ -83,7 +86,19 @@ fn try_main() -> Result<()> {
                 }
             };
 
-            index::construct_index(file, index, &barcode_regex, *skip_unmatched, clusters)?;
+            let filter_opts = filter::FilterOpts {
+                len: len.clone(),
+                quality: qual.clone(),
+            };
+
+            index::construct_index(
+                file,
+                index,
+                &barcode_regex,
+                *skip_unmatched,
+                clusters,
+                filter_opts,
+            )?;
             info!("Completed index generation to {index}");
         }
         Commands::Call {
diff --git a/tests/correct/index.tsv b/tests/correct/index.tsv
index afbea74..89cc9fb 100644
--- a/tests/correct/index.tsv
+++ b/tests/correct/index.tsv
@@ -1,4 +1,4 @@
-#{"nailpolish_version":"0.1.0","file_path":"/Users/olliecheng/Developer/WEHI/consensus-call/nailpolish/tests/data/scmixology2_sample.fastq","index_date":"2025-01-01T21:46:26.373596+08:00","elapsed":0.021128792,"gb":0.02817634679377079,"matched_read_count":14143,"unmatched_read_count":0,"read_count":14143,"avg_qual":22967.658700417167,"avg_len":1030.5793678851728}
+#{"nailpolish_version":"0.1.0","file_path":"/Users/olliecheng/Developer/WEHI/consensus-call/nailpolish/tests/data/scmixology2_sample.fastq","index_date":"2025-01-02T10:46:32.316021+08:00","elapsed":0.526168292,"gb":0.02817634679377079,"matched_read_count":14142,"unmatched_read_count":0,"read_count":14143,"avg_qual":21274.992292462168,"avg_len":1009.505232640362,"filtered_reads":1}
 id pos avg_qual n_bases rec_len
 TCTGGCTCATTCTCCG_GCAGCGAAGCCC 0 19.47 593 1264
 GCACTAAGTTGAAGTA_ATCAATGTATTG 1264 9.52 1655 3388
@@ -10862,7 +10862,6 @@ GATTCGATCAAGCCTA_TCCTCATTTGTC 22992098 18.4 1405 2888
 GATAGCTAGCAACAAT_TACCCTTTTCGT 22994986 23.55 2100 4278
 TTGAGTGCAGGGCTTC_AAGAAGTATTTC 22999264 21.21 772 1622
 GCCATGGTCGTCACCT_TTTACCGGAGTT 23000886 25.0 2720 5518
-TTTAGTCCAGCGGATA_AATACACGGTGT 23006404 80.12 299061 598200
 TTTAGTCCAGCGGATA_AATACACGGTGT 23604604 15.35 1026 2130
 GGAGGATTCTTCTAAC_TGTTCTTGAAGC 23606734 16.56 1128 2334
 GGAGGATTCTTCTAAC_TGTTCTTGAAGC 23609068 17.3 1178 2434
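
Not part of the patch: a minimal test sketch (module and test names are placeholders)
exercising the new `ArgInterval` parser and its inclusive `contains` check, matching the
`--len`/`--qual` interval semantics documented in `src/cli.rs`. It could sit at the bottom
of `src/cli.rs`; the explicit `TryFrom` import is only there in case the crate is not on
the 2021 edition.

#[cfg(test)]
mod arg_interval_tests {
    use super::ArgInterval;
    use std::convert::TryFrom;

    #[test]
    fn parses_bounded_and_unbounded_intervals() {
        // both endpoints are inclusive, so the default `--len 0,15000` keeps a
        // read that is exactly 15000 bases long
        let len = ArgInterval::try_from("100,15000").unwrap();
        assert!(len.contains(100.0));
        assert!(len.contains(15000.0));
        assert!(!len.contains(15000.5));

        // `inf` and `-inf` lift the upper and lower bounds respectively
        let qual = ArgInterval::try_from("0,inf").unwrap();
        assert!(qual.contains(1e9));

        let any = ArgInterval::try_from("-inf,inf").unwrap();
        assert!(any.contains(-42.0));
    }

    #[test]
    fn rejects_malformed_intervals() {
        // anything that is not exactly two comma-separated floats is an error
        assert!(ArgInterval::try_from("100").is_err());
        assert!(ArgInterval::try_from("low,high").is_err());
        assert!(ArgInterval::try_from("1,2,3").is_err());
    }
}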