Skip to content

Commit

Permalink
Refactor index to use Record and add helper methods to Record
Browse files Browse the repository at this point in the history
olliecheng committed Jan 1, 2025
1 parent fd51e58 commit 425739f
Showing 3 changed files with 115 additions and 40 deletions.
68 changes: 36 additions & 32 deletions src/index.rs
Original file line number Diff line number Diff line change
@@ -14,6 +14,7 @@ use thiserror::Error;

use crate::duplicates::RecordIdentifier;
use crate::file::FastqFile;
use crate::io::Record;
use tempfile::tempfile_in;

#[derive(Debug, Serialize, Deserialize)]
@@ -106,7 +107,7 @@ fn iter_lines_with_regex<W: Write>(
let mut expected_len: Option<usize> = None;

let mut fastq_reader = needletail::parser::FastqReader::new(reader);
let mut total_quality = 0f64;
let mut total_quality = 0u32;
let mut total_len = 0;

while let Some(rec) = fastq_reader.next() {
@@ -116,29 +117,30 @@ fn iter_lines_with_regex<W: Write>(
info!("Processed: {}", info.read_count)
}

let rec = rec.expect("Invalid record");
let id = std::str::from_utf8(rec.id()).context("Could not convert id to string")?;
let position = rec.position().byte() as usize;
let sequence_rec = rec.expect("Invalid record");
let position = sequence_rec.position().byte() as usize;
let file_len = sequence_rec.all().len() + 1;
let mut rec = Record::try_from(sequence_rec)?;

match extract_bc_from_header(id, re, position) {
match extract_bc_from_header(&rec.id, re, position) {
Ok((len, identifier)) => {
match expected_len {
None => expected_len = Some(len),
Some(expected) => {
if expected != len {
bail!(IndexGenerationErr::DifferentMatchCounts {
header: id.to_string(),
re: re.clone(),
pos: position,
count: len,
expected
})
}
}
let expected_len = *expected_len.get_or_insert(len);

if expected_len != len {
bail!(IndexGenerationErr::DifferentMatchCounts {
header: rec.id,
re: re.clone(),
pos: position,
count: len,
expected: expected_len
})
}

total_quality += write_read(wtr, &rec, identifier.to_string(), position)?;
total_len += rec.num_bases();
rec.id = identifier.to_string();

rec.write_index(wtr, position, file_len)?;
total_quality += rec.phred_quality_total();
total_len += rec.len();
info.matched_read_count += 1;
}
Err(e) => {
@@ -152,7 +154,7 @@ fn iter_lines_with_regex<W: Write>(

wtr.flush()?;

info.avg_qual = total_quality / (info.matched_read_count as f64);
info.avg_qual = (total_quality as f64) / (info.matched_read_count as f64);
info.avg_len = (total_len as f64) / (info.matched_read_count as f64);
info.gb = (fastq_reader.position().byte() as f64) / (1024u32.pow(3) as f64);

@@ -218,7 +220,7 @@ fn iter_lines_with_cluster_file<W: Write>(
let mut fastq_reader = needletail::parser::FastqReader::new(reader);

// we store the total quality and length so that we can take an average at the end
let mut total_quality = 0f64;
let mut total_quality = 0u32;
let mut total_len = 0;

while let Some(rec) = fastq_reader.next() {
@@ -229,29 +231,31 @@ fn iter_lines_with_cluster_file<W: Write>(
info!("Processed: {}", info.read_count);
}

let rec = rec.expect("Invalid record");
let id = std::str::from_utf8(rec.id()).context("Could not convert id to string")?;
let position = rec.position().byte() as usize;
let sequence_rec = rec.expect("Invalid record");
let position = sequence_rec.position().byte() as usize;
let file_len = sequence_rec.all().len() + 1;
let mut rec = Record::try_from(sequence_rec)?;

let Some(identifier) = cluster_map.get(id) else {
let Some(identifier) = cluster_map.get(&rec.id) else {
if !skip_invalid_ids {
bail!(RowNotInClusters {
header: id.to_string()
})
bail!(RowNotInClusters { header: rec.id })
}
info.unmatched_read_count += 1;
continue;
};
info.matched_read_count += 1;

total_quality += write_read(wtr, &rec, identifier.clone(), position)?;
total_len += rec.num_bases();
rec.id = identifier.clone();
rec.write_index(wtr, position, file_len)?;

total_quality += rec.phred_quality_total();
total_len += rec.len();
}

wtr.flush()?;

// compute summary statistics
info.avg_qual = total_quality / (info.matched_read_count as f64);
info.avg_qual = (total_quality as f64) / (info.matched_read_count as f64);
info.avg_len = (total_len as f64) / (info.matched_read_count as f64);
info.gb = (fastq_reader.position().byte() as f64) / (1024u32.pow(3) as f64);

77 changes: 74 additions & 3 deletions src/io.rs
Original file line number Diff line number Diff line change
@@ -4,8 +4,11 @@ use needletail::parser::SequenceRecord;
use needletail::{parser::FastqReader, FastxReader};
use std::fmt::Write as FmtWrite;
// needed for write! to be implemented on Strings
use serde::{Deserialize, Serialize};
use std::fs::File;
use std::io::{BufReader, Read, Seek, SeekFrom, Write};
use std::iter::Map;
use std::slice::Iter;

#[derive(PartialEq, Eq)]
pub enum ReadType {
@@ -25,6 +28,7 @@ pub struct Record {
impl TryFrom<SequenceRecord<'_>> for Record {
type Error = std::string::FromUtf8Error;

/// Attempt to create a Record from a SequenceRecord.
fn try_from(rec: SequenceRecord) -> Result<Self, Self::Error> {
Ok(Record {
id: String::from_utf8(rec.id().to_vec())?,
@@ -35,10 +39,40 @@ impl TryFrom<SequenceRecord<'_>> for Record {
}

impl Record {
/// Returns the PHRED quality scores of the record as a byte slice.
pub fn phred_quality(&self) -> Map<Iter<u8>, fn(&u8) -> u32> {
// we transform the quality to a PHRED score (ASCII ! to I)
// https://en.wikipedia.org/wiki/Phred_quality_score
self.qual
.as_bytes()
.into_iter()
.map(|&x| (x as u32) - 33u32)
}

/// Returns the average PHRED quality score of the record
pub fn phred_quality_avg(&self) -> f64 {
let qual = (self.phred_quality_total() as f64) / (self.len() as f64);
// round to 2dp
const ROUND_PRECISION: f64 = 100.0;
(qual * ROUND_PRECISION).round() / ROUND_PRECISION
}

/// Returns the sum of the PHRED quality scores of the record
pub fn phred_quality_total(&self) -> u32 {
self.phred_quality().sum()
}

/// Returns the sequence length in base count of the record
pub fn len(&self) -> usize {
self.seq.len()
}

/// Write the Record in a .fastq format
pub fn write_fastq(&self, writer: &mut impl Write) -> Result<(), std::io::Error> {
write!(writer, "@{}\n{}\n+\n{}", self.id, self.seq, self.qual)
}

/// Write the Record in a .fasta format
pub fn write_fasta(&self, writer: &mut impl Write) -> Result<(), std::io::Error> {
write!(writer, ">{}\n{}", self.id, self.seq)
}
@@ -52,6 +86,9 @@ impl Record {
/// * `group_idx` - The index of the read in the group.
/// * `group_size` - The size of the group.
/// * `avg_qual` - The average quality score of the group.
///
/// # Note
/// This function will modify the Record irreversibly by changing the Record's `id` field
pub fn add_metadata(
&mut self,
umi_group: usize,
@@ -80,6 +117,40 @@ impl Record {
}
}

#[derive(Debug, Serialize, Deserialize)]
struct IndexRecord {
id: String,
pos: usize,
avg_qual: f64,
n_bases: usize,
rec_len: usize,
}

impl Record {
/// Writes information about the Record to an external index writer, provided with
/// extra information.
///
/// # Arguments
///
/// * `wtr` - A mutable reference to a CSV writer.
/// * `pos` - The position of the record in the file.
/// * `file_len` - The bytes consumed by the record in the file (the _length_ on _file_)
pub fn write_index<W: Write>(
&self,
wtr: &mut csv::Writer<W>,
pos: usize,
file_len: usize,
) -> csv::Result<()> {
wtr.serialize(IndexRecord {
id: self.id.clone(),
pos,
avg_qual: self.phred_quality_avg(),
n_bases: self.len(),
rec_len: file_len,
})
}
}

pub struct UMIGroup {
/// The "Identifier" of this group, typically a "BC_UMI" string
pub id: RecordIdentifier,
@@ -114,7 +185,7 @@ pub struct UMIGroup {
/// let record = get_read_at_position(&mut file, 12345)?;
/// println!("Record ID: {}", record.id);
/// ```
pub fn get_read_at_position<R: Read + Seek + Send>(
pub fn get_record_from_position<R: Read + Seek + Send>(
// file: &mut File,
reader: &mut R,
// file_sequential: &mut dyn Read,
@@ -245,9 +316,9 @@ where
}

let read = if single {
get_read_at_position(reader_seq, pos)
get_record_from_position(reader_seq, pos)
} else {
get_read_at_position(reader_rnd, pos)
get_record_from_position(reader_rnd, pos)
};

match read {
10 changes: 5 additions & 5 deletions tests/correct/index.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#{"nailpolish_version":"0.1.0","file_path":"/Users/olliecheng/Developer/WEHI/consensus-call/nailpolish/tests/data/scmixology2_sample.fastq","index_date":"2024-12-11T14:06:55.279837+08:00","elapsed":0.276829667,"gb":0.02817634679377079,"matched_read_count":14143,"unmatched_read_count":0,"read_count":14143,"avg_qual":21.152054726719978,"avg_len":1030.5793678851728}
#{"nailpolish_version":"0.1.0","file_path":"/Users/olliecheng/Developer/WEHI/consensus-call/nailpolish/tests/data/scmixology2_sample.fastq","index_date":"2025-01-01T21:46:26.373596+08:00","elapsed":0.021128792,"gb":0.02817634679377079,"matched_read_count":14143,"unmatched_read_count":0,"read_count":14143,"avg_qual":22967.658700417167,"avg_len":1030.5793678851728}
id pos avg_qual n_bases rec_len
TCTGGCTCATTCTCCG_GCAGCGAAGCCC 0 19.47 593 1264
GCACTAAGTTGAAGTA_ATCAATGTATTG 1264 9.52 1655 3388
@@ -6704,7 +6704,7 @@ TGACGCGAGTCTACCA_CTCCCCCTTCCG 14207110 24.26 1506 3090
GATTCGATCAAGCCTA_AAAATCGAGCAG 14210200 24.36 512 1102
AAGTCGTCAGTAGGAC_TTGACCCGGTAG 14211302 21.78 890 1858
ATACTTCTCCCGAGTG_AGGTTCGCTCCT 14213160 25.64 400 878
CAGGTATCATGGGTTT_TTCCAAAGTTTT 14214038 21.17 1840 3758
CAGGTATCATGGGTTT_TTCCAAAGTTTT 14214038 21.18 1840 3758
CACGGGTAGGCCACTC_TAACTTTCCCGT 14217796 21.23 1112 2302
TCAGTTTGTAGCGCTC_GGTTAACTTTAT 14220098 17.68 677 1432
TGACGCGAGTCTACCA_AGTCTCAACTCC 14221530 22.58 779 1636
@@ -7466,7 +7466,7 @@ AGGGAGTCAGGTCAAG_GGAAGTTTTCGT 15815868 21.98 631 1340
TCCATGCGTTGCTCGG_TAAACTGGTGAG 15817208 22.35 1023 2124
TTACCATCATGTCAGT_TACTGTTTTGTG 15819332 22.25 590 1258
CACGGGTAGGCCACTC_ATGTTACCCACT 15820590 22.64 1180 2438
TTCCTTCAGGGTACGT_TTGCAAATGCGC 15823028 22.67 1080 2238
TTCCTTCAGGGTACGT_TTGCAAATGCGC 15823028 22.68 1080 2238
TGATTCTTCGTGTCAA_TTACCTTAAGGG 15825266 16.98 1274 2626
CGGCAGTTCTGGGCGT_TGTATATTATAA 15827892 19.7 1268 2614
AATCGACAGGGTGGGA_GGATTCTCAATA 15830506 18.33 414 906
@@ -12934,7 +12934,7 @@ AGACCCGAGTTGGACG_ACAACGGTTTTC 27841400 22.25 1341 2760
CTAAGTGAGAACAGGA_CTCCCAGCTGTC 27844160 19.32 735 1548
CTAAGTGAGAACAGGA_CTCCCAGCTGTC 27845708 21.04 689 1456
CTAAGTGAGAACAGGA_CTCCCAGCTGTC 27847164 22.91 742 1562
TTGAGTGCAGGGCTTC_TGCCACTATGTA 27848726 14.92 560 1198
TTGAGTGCAGGGCTTC_TGCCACTATGTA 27848726 14.93 560 1198
TTGAGTGCAGGGCTTC_TGCCACTATGTA 27849924 20.11 616 1310
TTGAGTGCAGGGCTTC_TGCCACTATGTA 27851234 14.61 575 1228
CAGTTCCCAGAGTCTT_TGCGGTTCCGTT 27852462 19.5 328 734
@@ -13476,7 +13476,7 @@ AGCGCCACAATCCAGT_TCAGTGCCCTTT 28919244 21.01 656 1390
AGCGCCACAATCCAGT_TCAGTGCCCTTT 28920634 20.56 666 1410
CTAAGTGAGAACAGGA_TCTACATAACCA 28922044 27.7 928 1934
CTAAGTGAGAACAGGA_TCTACATAACCA 28923978 16.09 918 1914
CTAAGTGAGAACAGGA_TCTACATAACCA 28925892 23.42 480 1038
CTAAGTGAGAACAGGA_TCTACATAACCA 28925892 23.43 480 1038
CAGCACGTCTCTAAGG_TTCATAGGCTTT 28926930 20.95 1944 3966
CAGCACGTCTCTAAGG_TTCATAGGCTTT 28930896 22.7 780 1638
CAGCACGTCTCTAAGG_TTCATAGGCTTT 28932534 20.17 773 1624

0 comments on commit 425739f

Please sign in to comment.