Skip to content

Commit

Permalink
Added filter settings to index
Browse files Browse the repository at this point in the history
  • Loading branch information
olliecheng committed Jan 2, 2025
1 parent 425739f commit 05c4054
Show file tree
Hide file tree
Showing 7 changed files with 192 additions and 46 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ tempfile = "3.14.0"
thiserror = "1.0.64"
predicates = "3.1.2"
itertools = "0.13.0"
indoc = "2.0.5"

[[bin]]
name = "nailpolish"
Expand Down
109 changes: 98 additions & 11 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,25 +45,45 @@ pub enum Commands {
#[arg(long, default_value = "index.tsv")]
index: String,

/// whether to use a file containing pre-clustered reads, with every line in one of two
/// formats:
/// READ_ID;BARCODE or,
/// READ_ID;BARCODE;UMI
#[arg(long)]
/// whether to use a file containing pre-clustered reads, with every line in one of two formats:
/// 1. READ_ID;BARCODE
/// 2. READ_ID;BARCODE;UMI
#[arg(long, verbatim_doc_comment)]
clusters: Option<String>,

/// barcode regex format type, for custom header styles.
/// This will override the preset given. For example:
/// ^@([ATCG]{16})_([ATCG]{12})
/// for the BC-UMI preset.
#[arg(long)]
/// barcode regex format type, for custom header styles. this will override the preset given.
/// for example, for the `bc-umi` preset:
/// ^([ATCG]{16})_([ATCG]{12})
#[arg(long, verbatim_doc_comment)]
barcode_regex: Option<String>,

/// skip, instead of error, on reads which are not accounted for:
/// - if a cluster file is passed, any reads which are not in any cluster
/// - if a barcode regex or preset is used (default), any reads which do not match the regex
#[arg(long)]
#[arg(long, verbatim_doc_comment)]
skip_unmatched: bool,

/// filter lengths to a value within the given float interval [a,b].
/// a is the minimum, and b is the maximum (both inclusive).
    /// alternatively, a can be `-inf` and b can be `inf`.
/// an unbounded interval (i.e. no length filter) is given by `0,inf`.
#[arg(
long,
value_parser = |x: &str| ArgInterval::try_from(x),
default_value = "0,15000",
verbatim_doc_comment
)]
len: ArgInterval,

/// filter average read quality to a value within the given float interval [a,b].
/// see the docs for `--len` for documentation on how to use the interval.
#[arg(
long,
value_parser = |x: &str| ArgInterval::try_from(x),
default_value = "0,inf",
verbatim_doc_comment
)]
qual: ArgInterval,
},

/// Generate a summary of duplicate statistics from an index file
Expand Down Expand Up @@ -121,3 +141,70 @@ pub enum Commands {
output: Option<String>,
},
}

/// A closed numeric interval `[min, max]` parsed from a CLI argument such as
/// `--len 0,15000` or `--qual 0,inf`.
///
/// Either bound may be infinite (`-inf` / `inf`) to leave that side
/// unbounded. Both endpoints are inclusive.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct ArgInterval {
    // Lower bound (inclusive); `f64::NEG_INFINITY` means unbounded below.
    pub min: f64,
    // Upper bound (inclusive); `f64::INFINITY` means unbounded above.
    pub max: f64,
}

/// Error type for parsing an interval string.
///
/// Wraps a human-readable description of what was wrong with the input;
/// the message is surfaced to the user through the `Display` impl
/// (prefixed with "Invalid interval format:").
#[derive(Debug)]
pub struct ParseIntervalErr(String);

impl std::fmt::Display for ParseIntervalErr {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Invalid interval format: {}", self.0)
}
}

impl std::error::Error for ParseIntervalErr {}

impl<'a> TryFrom<&'a str> for ArgInterval {
    type Error = ParseIntervalErr;

    /// Parse an interval from a `<min>,<max>` string (case-insensitive).
    ///
    /// Each bound is trimmed and parsed as an `f64`; `-inf` is accepted for
    /// the minimum and `inf` for the maximum to express an unbounded side.
    /// Anything other than exactly two comma-separated parts is rejected
    /// with a usage message.
    fn try_from(arg: &'a str) -> Result<ArgInterval, Self::Error> {
        let lowered = arg.to_lowercase();
        let mut pieces = lowered.split(',');

        // Exactly two parts: two `Some`s followed by exhaustion.
        let (raw_min, raw_max) = match (pieces.next(), pieces.next(), pieces.next()) {
            (Some(lo), Some(hi), None) => (lo, hi),
            _ => {
                return Err(ParseIntervalErr(indoc::formatdoc! {"
                    Expected format '<min>,<max>', got '{arg}'. The expected format is \
                    `a,b`, as in:
                    --len 0,15000
                    --len 0,inf
                    --len 100,15000
                "}));
            }
        };

        // Shared bound parser: recognises the unbounded token, otherwise
        // falls back to a plain float parse with a bound-specific message.
        let parse_bound = |raw: &str, token: &str, unbounded: f64, label: &str| {
            let trimmed = raw.trim();
            if trimmed == token {
                return Ok(unbounded);
            }
            trimmed.parse::<f64>().map_err(|_| {
                ParseIntervalErr(format!(
                    "Invalid {} value: '{}' (should be any float or `{}`)",
                    label, trimmed, token
                ))
            })
        };

        let min = parse_bound(raw_min, "-inf", f64::NEG_INFINITY, "minimum")?;
        let max = parse_bound(raw_max, "inf", f64::INFINITY, "maximum")?;

        Ok(ArgInterval { min, max })
    }
}

impl ArgInterval {
    /// Returns `true` if `v` lies within the closed interval `[min, max]`.
    ///
    /// Both endpoints are inclusive, matching the CLI help for `--len` and
    /// `--qual` ("both inclusive"). Infinite bounds behave as expected:
    /// every finite value lies below `f64::INFINITY`.
    pub fn contains(&self, v: f64) -> bool {
        // FIX: was `(self.min < v) && (v < self.max)` — an *exclusive*
        // interval, contradicting the documented inclusive semantics; e.g.
        // the default `--len 0,15000` silently dropped reads of exactly
        // length 15000. Also removed a redundant `let v = v as f64;` no-op.
        (self.min..=self.max).contains(&v)
    }
}
1 change: 1 addition & 0 deletions src/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@ pub struct FastqFile {
pub read_count: usize,
pub avg_qual: f64,
pub avg_len: f64,
pub filtered_reads: usize,
}
11 changes: 11 additions & 0 deletions src/filter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
use crate::cli::ArgInterval;
use crate::io::Record;

/// Thresholds used to decide whether a read is kept during indexing.
pub struct FilterOpts {
    /// Acceptable read-length interval (from the `--len` CLI option).
    pub len: ArgInterval,
    /// Acceptable average read-quality interval (from the `--qual` CLI option).
    pub quality: ArgInterval,
}

/// Returns `true` when `read` passes both the length and quality filters.
///
/// The quality check is only evaluated when the length check passes
/// (same short-circuit behaviour as the original `&&` chain).
pub fn filter(read: &Record, opts: &FilterOpts) -> bool {
    if !opts.len.contains(read.len() as f64) {
        return false;
    }
    opts.quality.contains(read.phred_quality_avg())
}
96 changes: 64 additions & 32 deletions src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use thiserror::Error;

use crate::duplicates::RecordIdentifier;
use crate::file::FastqFile;
use crate::filter::{filter, FilterOpts};
use crate::io::Record;
use tempfile::tempfile_in;

Expand Down Expand Up @@ -102,6 +103,7 @@ fn iter_lines_with_regex<W: Write>(
re: &Regex,
skip_invalid_ids: bool,
mut info: FastqFile,
filter_opts: FilterOpts,
) -> Result<FastqFile> {
// expected_len is used to ensure that every read has the same format
let mut expected_len: Option<usize> = None;
Expand All @@ -122,34 +124,44 @@ fn iter_lines_with_regex<W: Write>(
let file_len = sequence_rec.all().len() + 1;
let mut rec = Record::try_from(sequence_rec)?;

match extract_bc_from_header(&rec.id, re, position) {
Ok((len, identifier)) => {
let expected_len = *expected_len.get_or_insert(len);

if expected_len != len {
bail!(IndexGenerationErr::DifferentMatchCounts {
header: rec.id,
re: re.clone(),
pos: position,
count: len,
expected: expected_len
})
}

rec.id = identifier.to_string();

rec.write_index(wtr, position, file_len)?;
total_quality += rec.phred_quality_total();
total_len += rec.len();
info.matched_read_count += 1;
}
Err(e) => {
if !skip_invalid_ids {
bail!(e)
}
info.unmatched_read_count += 1;
// apply any filters
let should_keep = filter(&rec, &filter_opts);
if !should_keep {
info.filtered_reads += 1;
continue;
}

let bc = extract_bc_from_header(&rec.id, re, position);

// if this did not succeed...
if let Err(e) = bc {
if !skip_invalid_ids {
bail!(e)
}
};
info.unmatched_read_count += 1;
continue;
}

let (len, identifier) = bc?;

// check that the number of barcode groups is the same
let expected_len = *expected_len.get_or_insert(len);
if expected_len != len {
bail!(IndexGenerationErr::DifferentMatchCounts {
header: rec.id,
re: re.clone(),
pos: position,
count: len,
expected: expected_len
})
}

rec.id = identifier.to_string();

rec.write_index(wtr, position, file_len)?;
total_quality += rec.phred_quality_total();
total_len += rec.len();
info.matched_read_count += 1;
}

wtr.flush()?;
Expand Down Expand Up @@ -187,6 +199,7 @@ fn iter_lines_with_cluster_file<W: Write>(
clusters: &mut Reader<File>,
skip_invalid_ids: bool,
mut info: FastqFile,
filter_opts: FilterOpts,
) -> Result<FastqFile> {
// first, we will read the clusters file
info!("Reading identifiers from clusters file...");
Expand Down Expand Up @@ -236,6 +249,13 @@ fn iter_lines_with_cluster_file<W: Write>(
let file_len = sequence_rec.all().len() + 1;
let mut rec = Record::try_from(sequence_rec)?;

// apply any filters
let should_keep = filter(&rec, &filter_opts);
if !should_keep {
info.filtered_reads += 1;
continue;
}

let Some(identifier) = cluster_map.get(&rec.id) else {
if !skip_invalid_ids {
bail!(RowNotInClusters { header: rec.id })
Expand Down Expand Up @@ -335,6 +355,7 @@ pub fn construct_index(
barcode_regex: &str,
skip_unmatched: bool,
clusters: &Option<String>,
filter_opts: FilterOpts,
) -> Result<()> {
// time everything!
let now = std::time::Instant::now();
Expand Down Expand Up @@ -374,7 +395,14 @@ pub fn construct_index(
let re = Regex::new(barcode_regex)?;
let mut result = match clusters {
// no cluster file has been used
None => iter_lines_with_regex(reader, &mut wtr, &re, skip_unmatched, file_info),
None => iter_lines_with_regex(
reader,
&mut wtr,
&re,
skip_unmatched,
file_info,
filter_opts,
),

// cluster file is being used
Some(filepath) => {
Expand All @@ -389,6 +417,7 @@ pub fn construct_index(
&mut cluster_rdr,
skip_unmatched,
file_info,
filter_opts,
)
}
}?;
Expand All @@ -399,13 +428,16 @@ pub fn construct_index(
// report results
if skip_unmatched {
info!(
"Stats: {} matched reads, {} unmatched reads, {:.1}s runtime",
result.matched_read_count, result.unmatched_read_count, result.elapsed,
"Stats: {} matched reads, {} unmatched reads, {} filtered reads, {:.1}s runtime",
result.matched_read_count,
result.unmatched_read_count,
result.filtered_reads,
result.elapsed,
)
} else {
info!(
"Stats: {} reads, {:.1}s runtime",
result.matched_read_count, result.elapsed
"Stats: {} reads, {} filtered reads, {:.1}s runtime",
result.matched_read_count, result.filtered_reads, result.elapsed
)
}

Expand Down
17 changes: 16 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ mod call;
mod cli;
mod duplicates;
mod file;
mod filter;
mod group;
mod index;
mod io;
Expand Down Expand Up @@ -70,6 +71,8 @@ fn try_main() -> Result<()> {
barcode_regex,
clusters,
skip_unmatched,
len,
qual,
} => {
let barcode_regex = match barcode_regex {
Some(v) => {
Expand All @@ -83,7 +86,19 @@ fn try_main() -> Result<()> {
}
};

index::construct_index(file, index, &barcode_regex, *skip_unmatched, clusters)?;
let filter_opts = filter::FilterOpts {
len: len.clone(),
quality: qual.clone(),
};

index::construct_index(
file,
index,
&barcode_regex,
*skip_unmatched,
clusters,
filter_opts,
)?;
info!("Completed index generation to {index}");
}
Commands::Call {
Expand Down
3 changes: 1 addition & 2 deletions tests/correct/index.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#{"nailpolish_version":"0.1.0","file_path":"/Users/olliecheng/Developer/WEHI/consensus-call/nailpolish/tests/data/scmixology2_sample.fastq","index_date":"2025-01-01T21:46:26.373596+08:00","elapsed":0.021128792,"gb":0.02817634679377079,"matched_read_count":14143,"unmatched_read_count":0,"read_count":14143,"avg_qual":22967.658700417167,"avg_len":1030.5793678851728}
#{"nailpolish_version":"0.1.0","file_path":"/Users/olliecheng/Developer/WEHI/consensus-call/nailpolish/tests/data/scmixology2_sample.fastq","index_date":"2025-01-02T10:46:32.316021+08:00","elapsed":0.526168292,"gb":0.02817634679377079,"matched_read_count":14142,"unmatched_read_count":0,"read_count":14143,"avg_qual":21274.992292462168,"avg_len":1009.505232640362,"filtered_reads":1}
id pos avg_qual n_bases rec_len
TCTGGCTCATTCTCCG_GCAGCGAAGCCC 0 19.47 593 1264
GCACTAAGTTGAAGTA_ATCAATGTATTG 1264 9.52 1655 3388
Expand Down Expand Up @@ -10862,7 +10862,6 @@ GATTCGATCAAGCCTA_TCCTCATTTGTC 22992098 18.4 1405 2888
GATAGCTAGCAACAAT_TACCCTTTTCGT 22994986 23.55 2100 4278
TTGAGTGCAGGGCTTC_AAGAAGTATTTC 22999264 21.21 772 1622
GCCATGGTCGTCACCT_TTTACCGGAGTT 23000886 25.0 2720 5518
TTTAGTCCAGCGGATA_AATACACGGTGT 23006404 80.12 299061 598200
TTTAGTCCAGCGGATA_AATACACGGTGT 23604604 15.35 1026 2130
GGAGGATTCTTCTAAC_TGTTCTTGAAGC 23606734 16.56 1128 2334
GGAGGATTCTTCTAAC_TGTTCTTGAAGC 23609068 17.3 1178 2434
Expand Down

0 comments on commit 05c4054

Please sign in to comment.