Skip to content

Commit

Permalink
check chromosome length
Browse files Browse the repository at this point in the history
  • Loading branch information
brentp committed Jul 21, 2023
1 parent 37fc7f8 commit e27d862
Show file tree
Hide file tree
Showing 8 changed files with 190 additions and 57 deletions.
5 changes: 3 additions & 2 deletions benches/random_intervals.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
use bedder::chrom_ordering::parse_genome;
use bedder::intersection::IntersectionIterator;
use bedder::interval::Interval;
use bedder::position::{Position, Positioned, PositionedIterator};
use bedder::string::String;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use hashbrown::HashMap;
use rand::Rng;
use std::io;

Expand Down Expand Up @@ -57,7 +57,8 @@ impl PositionedIterator for Intervals {
const MAX_POSITION: u64 = 10_000;

pub fn intersection_benchmark(c: &mut Criterion) {
let chrom_order = HashMap::from([(String::from("chr1"), 0), (String::from("chr2"), 1)]);
let genome_str = "chr1\nchr2\n";
let chrom_order = parse_genome(genome_str.as_bytes()).unwrap();

c.bench_function("simple intersection", |b| {
b.iter_with_large_drop(|| {
Expand Down
2 changes: 1 addition & 1 deletion examples/intersect_bed_count.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::path::PathBuf;
use bedder::sniff;
use clap::Parser;
extern crate bedder;
use crate::bedder::genome_file::parse_genome;
use crate::bedder::chrom_ordering::parse_genome;
use crate::bedder::intersection::IntersectionIterator;
use crate::bedder::position::Positioned;

Expand Down
17 changes: 15 additions & 2 deletions src/bedder_bed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ where
#[cfg(test)]
mod tests {
use super::*;
use crate::chrom_ordering::Chromosome;
use crate::intersection::IntersectionIterator;
use hashbrown::HashMap;
use std::io::Cursor;
Expand All @@ -136,8 +137,20 @@ mod tests {
let br = BedderBed::new(Cursor::new("chr1\t21\t30\nchr1\t22\t33"));

let chrom_order = HashMap::from([
(String::from("chr1"), 0usize),
(String::from("chr2"), 1usize),
(
String::from("chr1"),
Chromosome {
index: 0usize,
length: None,
},
),
(
String::from("chr2"),
Chromosome {
index: 1usize,
length: None,
},
),
]);

let it = IntersectionIterator::new(Box::new(ar), vec![Box::new(br)], &chrom_order)
Expand Down
91 changes: 91 additions & 0 deletions src/chrom_ordering.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
use crate::string::String;
use hashbrown::HashMap;
use std::io::{self, BufRead, Read};

#[derive(Debug, Ord, PartialEq, Eq, PartialOrd, Clone, Copy)]
pub struct Chromosome {
pub(crate) index: usize,
pub(crate) length: Option<usize>,
}
/// A genome is a map from chromosome name to index with an optional chromosome length.
pub fn parse_genome<R>(reader: R) -> io::Result<HashMap<String, Chromosome>>
where
R: Read,
{
let mut reader = io::BufReader::new(reader);
let mut genome = HashMap::default();
let mut line = std::string::String::new();
while reader.read_line(&mut line)? > 0 {
if line.trim().is_empty() || line.starts_with('#') {
line.clear();
continue;
}
let mut fields = line.split_whitespace();
match fields.next() {
Some(chrom) => {
let length = fields.next().map(|s| s.parse::<usize>());
let l = length.and_then(|c| match c {
Ok(l) => Some(l),
Err(_) => {
log::warn!(
"invalid length for chromosome {} with line: {}",
chrom,
line
);
None
}
});
genome.insert(
String::from(chrom),
Chromosome {
index: genome.len(),
length: l,
},
);
}
None => {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("invalid genome file line: {}", line),
))
}
}
//.expect("require at least one column in genome file");
line.clear();
}
Ok(genome)
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_parse_genome() {
let genome_str = "chr1\nchr2\t43\nchr3\n";
let genome = parse_genome(genome_str.as_bytes()).unwrap();
assert_eq!(genome.len(), 3);
assert_eq!(
genome.get("chr1"),
Some(&Chromosome {
index: 0,
length: None
})
);
assert_eq!(
genome.get("chr2"),
Some(&Chromosome {
index: 1,
length: Some(43)
})
);
assert_eq!(
genome.get("chr3"),
Some(&Chromosome {
index: 2,
length: None
})
);
}
}
40 changes: 0 additions & 40 deletions src/genome_file.rs

This file was deleted.

87 changes: 77 additions & 10 deletions src/intersection.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use crate::chrom_ordering::Chromosome;
use crate::string::String;
use hashbrown::HashMap;
use std::cmp::Ordering;
Expand All @@ -14,7 +15,7 @@ pub struct IntersectionIterator<'a> {
base_iterator: Box<dyn PositionedIterator>,
other_iterators: Vec<Box<dyn PositionedIterator>>,
min_heap: BinaryHeap<ReverseOrderPosition<'a, Position>>,
chromosome_order: &'a HashMap<String, usize>,
chromosome_order: &'a HashMap<String, Chromosome>,
// because multiple intervals from each stream can overlap a single base interval
// and each interval from others may overlap many base intervals, we must keep a cache (Q)
// we always add intervals in order with push_back and therefore remove with pop_front.
Expand Down Expand Up @@ -51,11 +52,12 @@ pub struct Intersections<P: Positioned> {

struct ReverseOrderPosition<'a, P: Positioned> {
position: P,
chromosome_order: &'a HashMap<String, usize>,
chromosome_order: &'a HashMap<String, Chromosome>,
id: usize, // file_index
}

impl<'a, P: Positioned> PartialEq for ReverseOrderPosition<'a, P> {
#[inline]
fn eq(&self, other: &Self) -> bool {
self.position.start() == other.position.start()
&& self.position.stop() == other.position.stop()
Expand All @@ -66,12 +68,14 @@ impl<'a, P: Positioned> PartialEq for ReverseOrderPosition<'a, P> {
impl<'a, P: Positioned> Eq for ReverseOrderPosition<'a, P> {}

impl<'a, P: Positioned> PartialOrd for ReverseOrderPosition<'a, P> {
#[inline]
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}

impl<'a, P: Positioned> Ord for ReverseOrderPosition<'a, P> {
#[inline]
fn cmp(&self, other: &Self) -> Ordering {
if self.position.chrom() != other.position.chrom() {
return self
Expand Down Expand Up @@ -101,10 +105,12 @@ impl<'a, P: Positioned> Ord for ReverseOrderPosition<'a, P> {
fn cmp(
a: &dyn Positioned,
b: &dyn Positioned,
chromosome_order: &HashMap<String, usize>,
chromosome_order: &HashMap<String, Chromosome>,
) -> Ordering {
if a.chrom() != b.chrom() {
return chromosome_order[a.chrom()].cmp(&chromosome_order[b.chrom()]);
return chromosome_order[a.chrom()]
.index
.cmp(&chromosome_order[b.chrom()].index);
}
// same chrom.
if a.stop() <= b.start() {
Expand Down Expand Up @@ -133,6 +139,17 @@ impl<'a> Iterator for IntersectionIterator<'a> {
Err(e) => return Some(Err(e)),
Ok(p) => Rc::new(p),
};
if let Some(chrom) = self.chromosome_order.get(base_interval.chrom()) {
if let Some(chrom_len) = chrom.length {
if base_interval.stop() > chrom_len as u64 {
let msg = format!(
"interval beyond end of chromosome: {}",
region_str(base_interval.as_ref())
);
return Some(Err(Error::new(ErrorKind::Other, msg)));
}
}
}

if self.out_of_order(base_interval.clone()) {
let p = self
Expand Down Expand Up @@ -193,7 +210,7 @@ impl<'a> IntersectionIterator<'a> {
pub fn new(
base_iterator: Box<dyn PositionedIterator>,
other_iterators: Vec<Box<dyn PositionedIterator>>,
chromosome_order: &'a HashMap<String, usize>,
chromosome_order: &'a HashMap<String, Chromosome>,
) -> io::Result<Self> {
let min_heap = BinaryHeap::new();
let called = vec![false; other_iterators.len()];
Expand Down Expand Up @@ -335,6 +352,7 @@ impl<'a> IntersectionIterator<'a> {
#[cfg(test)]
mod tests {
use super::*;
use crate::chrom_ordering::parse_genome;
use crate::interval::Interval;

struct Intervals {
Expand Down Expand Up @@ -374,7 +392,22 @@ mod tests {

#[test]
fn many_intervals() {
let chrom_order = HashMap::from([(String::from("chr1"), 0), (String::from("chr2"), 1)]);
let chrom_order = HashMap::from([
(
String::from("chr1"),
Chromosome {
index: 0,
length: None,
},
),
(
String::from("chr2"),
Chromosome {
index: 1,
length: None,
},
),
]);
let mut a_ivs = Intervals::new(String::from("A"), Vec::new());
let mut b_ivs = Intervals::new(String::from("B"), Vec::new());
let n_intervals = 100;
Expand Down Expand Up @@ -419,7 +452,8 @@ mod tests {

#[test]
fn bookend_and_chrom() {
let chrom_order = HashMap::from([(String::from("chr1"), 0), (String::from("chr2"), 1)]);
let genome_str = "chr1\nchr2\nchr3\n";
let chrom_order = parse_genome(genome_str.as_bytes()).unwrap();
let chrom = String::from("chr1");
let a_ivs = Intervals::new(
String::from("A"),
Expand Down Expand Up @@ -483,9 +517,40 @@ mod tests {
})
}

#[test]
fn interval_beyond_end_of_chrom() {
let genome_str = "chr1\t22\n";
let chrom_order = parse_genome(genome_str.as_bytes()).unwrap();
let a_ivs = Intervals::new(
String::from("A"),
vec![
Interval {
chrom: String::from("chr1"),
start: 10,
stop: 22,
..Default::default()
},
Interval {
chrom: String::from("chr1"),
start: 1,
stop: 23,
..Default::default()
},
],
);
let mut iter = IntersectionIterator::new(Box::new(a_ivs), vec![], &chrom_order)
.expect("error getting iterator");

let e = iter.nth(1).expect("error getting next");
assert!(e.is_err());
let e = e.err().unwrap();
assert!(e.to_string().contains("beyond end of chromosome"));
}

#[test]
fn ordering_error() {
let chrom_order = HashMap::from([(String::from("chr1"), 0), (String::from("chr2"), 1)]);
let genome_str = "chr1\nchr2\nchr3\n";
let chrom_order = parse_genome(genome_str.as_bytes()).unwrap();
let a_ivs = Intervals::new(
String::from("A"),
vec![
Expand Down Expand Up @@ -559,7 +624,8 @@ mod tests {

#[test]
fn multiple_sources() {
let chrom_order = HashMap::from([(String::from("chr1"), 0), (String::from("chr2"), 1)]);
let genome_str = "chr1\nchr2\nchr3\n";
let chrom_order = parse_genome(genome_str.as_bytes()).unwrap();
let a_ivs = Intervals::new(
String::from("A"),
vec![Interval {
Expand Down Expand Up @@ -612,7 +678,8 @@ mod tests {
#[test]
#[ignore]
fn zero_length() {
let chrom_order = HashMap::from([(String::from("chr1"), 0), (String::from("chr2"), 1)]);
let genome_str = "chr1\nchr2\nchr3\n";
let chrom_order = parse_genome(genome_str.as_bytes()).unwrap();
let a_ivs = Intervals::new(
String::from("A"),
vec![Interval {
Expand Down
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ pub mod string;

pub mod sniff;

pub mod genome_file;
pub mod chrom_ordering;

#[cfg(feature = "bed")]
/// Bed parser implementing the PositionedIterator trait.
Expand Down
Loading

0 comments on commit e27d862

Please sign in to comment.