Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
38 committed Sep 19, 2020
1 parent f1603e3 commit 994e1f6
Show file tree
Hide file tree
Showing 14 changed files with 141 additions and 56 deletions.
20 changes: 20 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
The MIT License (MIT)

Copyright (c) 2020 Hao Hou<[email protected]>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
4 changes: 4 additions & 0 deletions d4/src/d4file/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ impl D4FileBuilder {
self
}

pub fn dictionary(&self) -> &Dictionary {
&self.dict
}

pub fn create<PT: PTableWriter, ST: STableWriter>(&mut self) -> Result<D4FileWriter<PT, ST>> {
let mut file = OpenOptions::new()
.create(true)
Expand Down
72 changes: 42 additions & 30 deletions d4/src/dict.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ pub enum EncodeResult {
}

impl Dictionary {
pub fn pretty_print(&self) -> serde_json::Result<String> {
serde_json::to_string_pretty(self)
}
pub fn from_sample_bam<P: AsRef<Path>, F: Fn(&str, usize) -> bool>(
path: P,
filter: F,
Expand All @@ -51,61 +54,70 @@ impl Dictionary {
from += 100_000;
}
}
let mut histogram = vec![0; 65536];
let mut histogram = HashMap::new();
let mut range_count = HashMap::new();
let path = path.as_ref();
for values in parts

let part_results: Vec<_> = parts
.into_par_iter()
.map(|(chr, from, to)| {
let mut bam = BamFile::open(path).unwrap();
if let Some(reference) = reference {
bam.reference_path(reference);
}
let range = bam.range(chr, from, to).unwrap();
let mut histogram = vec![0usize; 65536];
let mut previous_value = None;
let mut histogram = HashMap::new();
let mut range_count = HashMap::new();
for (_, _, dep) in DepthIter::new(range) {
if dep < 65536 {
histogram[dep as usize] += 1;
match previous_value {
Some(d) if d == dep => continue,
Some(_) | None => {
previous_value = Some(dep);
*range_count.entry(dep).or_insert(0) += 1;
}
}
*histogram.entry(dep).or_insert(0) += 1;
}
histogram
(histogram, range_count)
})
.collect::<Vec<_>>()
{
for (idx, val) in values.into_iter().enumerate() {
histogram[idx] += val;
.collect();
for (part_hist, part_rc) in part_results {
for (k, v) in part_hist.into_iter() {
*histogram.entry(k).or_insert(0) += v;
}
for (k, v) in part_rc.into_iter() {
*range_count.entry(k).or_insert(0) += v;
}
}

let mut values: Vec<_> = (0..histogram.len()).collect();
values.sort_by_key(|idx| total_size - histogram[*idx]);

let total_sample: usize = histogram.iter().sum();
let mut histogram: Vec<_> = histogram.into_iter().collect();
histogram.sort_by_key(|(_, count)| total_size - count);
let total_intervals: usize = range_count.values().sum();

let best_bit_width = (0..=16)
.map(|b| {
let n_values = 1 << b;

let indexed_value: usize = values[..n_values].iter().map(|&x| histogram[x]).sum();

let ov_per_pos = (total_sample - indexed_value) as f64 / sample_size as f64;

let mut out_of_range_values = total_intervals;
for &(key, _) in histogram.iter().take(1 << b) {
out_of_range_values -= range_count[&key];
}
let p_size = total_size as f64 * b as f64 / 8.0;
let s_size = ov_per_pos * 8.0 * total_size as f64;

(b, (s_size + p_size).round() as usize)
let s_size =
(out_of_range_values as f64) * 4.0 / sample_size as f64 * total_size as f64;
(b, (p_size + s_size).round() as usize)
})
.min_by_key(|&(_, v)| v)
.min_by_key(|&(_, size)| size)
.unwrap();

let mut dict = vec![];
for i in 0..(1 << best_bit_width.0) {
dict.push(values[i as usize] as i32);
dict.push(histogram[i as usize].0 as i32);
}
if dict.len() > 1 {
let size = dict.len();
dict[..size - 1].sort();

let min = dict.iter().min().unwrap();
let max = dict.iter().max().unwrap();
if max - min + 1 == dict.len() as i32 && (dict.len() == 1 || *min != dict[0]) {
dict.sort();
}
println!("{:?}", dict);
Ok(Self::from_dict_list(dict)?)
}
pub fn from_dict_list(mapping: Vec<i32>) -> Result<Self> {
Expand Down
4 changes: 2 additions & 2 deletions d4/src/ptab/uncompressed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -249,8 +249,8 @@ impl PTablePartitionWriter for PartialPrimaryTable<Writer> {
}
fn can_encode(&self, value: i32) -> bool {
match self.dictionary.encode_value(value) {
EncodeResult::DictionaryIndex(_) => { true },
_ => { false },
EncodeResult::DictionaryIndex(_) => true,
_ => false,
}
}
fn bit_width(&self) -> usize {
Expand Down
2 changes: 1 addition & 1 deletion d4/src/stab/simple_kv/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ impl<R: Record> STableReader for SimpleKeyValueReader<R> {
_p: PhantomData,
})
}
// TODO : This function probably not working for the compressed mode

fn split(&mut self, partitions: &[(&str, u32, u32)]) -> Result<Vec<Self::Partition>> {
let root = self.s_table_root.clone();
let mut record_blocks = self.load_record_blocks()?;
Expand Down
20 changes: 15 additions & 5 deletions d4/src/stab/simple_kv/record.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,13 @@ pub trait Record: Sized + Copy + Send + 'static {
/// it returns None. If this is impossible, it create a new record for the new value
fn encode(this: Option<&mut Self>, pos: u32, value: i32) -> Option<Self>;

fn encode_range<E>(left: u32, right: u32, value: i32, mut ops: impl FnMut(Self)->Result<(), E>) -> Result<(), E>{
let mut state = None;
fn encode_range<E>(
left: u32,
right: u32,
value: i32,
mut ops: impl FnMut(Self) -> Result<(), E>,
) -> Result<(), E> {
let mut state = None;
for record in (left..right).filter_map(move |pos| {
let next_state = Self::encode(state.as_mut(), pos, value);
if next_state.is_some() {
Expand Down Expand Up @@ -114,16 +119,21 @@ impl Record for RangeRecord {
value,
})
}

#[inline(always)]
fn encode_range<E>(mut left: u32, right: u32, value: i32, mut ops: impl FnMut(Self)->Result<(), E>) -> Result<(), E>{
fn encode_range<E>(
mut left: u32,
right: u32,
value: i32,
mut ops: impl FnMut(Self) -> Result<(), E>,
) -> Result<(), E> {
while left < right {
let size = (right - left).min(65536);
left += size;
ops(Self {
left: left + 1,
size_enc: (size - 1) as u16,
value
value,
})?;
}
Ok(())
Expand Down
3 changes: 2 additions & 1 deletion d4/src/stab/simple_kv/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ impl<R: Record> STablePartitionWriter for SimpleKeyValuePartialWriter<R> {
fn encode_record(&mut self, left: u32, right: u32, value: i32) -> Result<()> {
self.flush()?;
R::encode_range(left, right, value, |record| {
self.compression.append_record(Some(&record), &mut self.stream)
self.compression
.append_record(Some(&record), &mut self.stream)
})
}

Expand Down
2 changes: 1 addition & 1 deletion d4/src/task/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ where
{
idx += 1;
}
if file_partition[idx].0.region().0 > region.0.as_ref() {
if idx < file_partition.len() && file_partition[idx].0.region().0 > region.0.as_ref() {
continue;
}
if idx >= file_partition.len() {
Expand Down
2 changes: 1 addition & 1 deletion d4binding/src/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@ pub fn d4_file_write_intervals(
sr.flush();
return ret;
}

-1
}

Expand Down
20 changes: 11 additions & 9 deletions d4binding/src/stream.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use d4::ptab::DecodeResult;
use d4::ptab::PTablePartitionReader;
use d4::ptab::{
UncompressedDecoder, UncompressedEncoder, UncompressedPartReader, UncompressedPartWriter,
UncompressedReader, UncompressedWriter, PTablePartitionWriter,
PTablePartitionWriter, UncompressedDecoder, UncompressedEncoder, UncompressedPartReader,
UncompressedPartWriter, UncompressedReader, UncompressedWriter,
};
use d4::stab::{
RangeRecord, RecordIterator, STablePartitionReader, STablePartitionWriter,
Expand Down Expand Up @@ -303,7 +303,7 @@ impl StreamWriter {
if self.current_part_id >= self.parts.len() {
return Ok(false);
}

let current_part = &mut self.parts[self.current_part_id];
let (chr, begin, end) = current_part.0.region();

Expand All @@ -314,26 +314,28 @@ impl StreamWriter {
if chr == self.current_chr && begin <= left && left < end {
break (current_part, end);
}

self.current_part_id += 1;
self.current_primary_encoder = None;
};

let actual_right = end.min(right);

let should_iterate = current_part.0.bit_width() != 0;

if self.current_primary_encoder.is_none() {
self.current_primary_encoder = Some(current_part.0.as_codec());
}
let stab = &mut current_part.1;

if should_iterate {
for pos in left..actual_right {
if !self.current_primary_encoder
.as_mut()
.unwrap()
.encode(pos as usize, value) {
if !self
.current_primary_encoder
.as_mut()
.unwrap()
.encode(pos as usize, value)
{
stab.encode(pos, value)?;
}
}
Expand Down
14 changes: 13 additions & 1 deletion d4utils/src/create/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,14 @@ fn main_impl<P: PTableWriter, S: STableWriter>(
let input_path: &Path = matches.value_of("input-file").unwrap().as_ref();
let ext = input_path.extension().unwrap();

let output_path = matches.value_of("output-file").unwrap();
let output_path = matches.value_of("output-file").map_or_else(
|| {
let mut ret = input_path.to_owned();
ret.set_extension("d4");
ret
},
|path| path.into(),
);
let mut d4_builder = d4::D4FileBuilder::new(output_path);

d4_builder.set_dictionary(make_dictionary(
Expand All @@ -113,6 +120,11 @@ fn main_impl<P: PTableWriter, S: STableWriter>(
}
}

if matches.values_of("dump-dict").is_some() {
println!("{}", d4_builder.dictionary().pretty_print()?);
std::process::exit(0);
}

if let Some(pattern) = matches
.value_of("filter")
.map(|pattern| regex::Regex::new(pattern).expect("Invalid filter regex"))
Expand Down
4 changes: 2 additions & 2 deletions d4utils/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let ret = match args.get(0).map(AsRef::as_ref) {
Some("create") => create::entry_point(args),
Some("framedump") => framedump::entry_point(args),
Some("show") => show::entry_point(args),
Some("show") | Some("view") => show::entry_point(args),
Some("stat") => stat::entry_point(args),
Some("plot") => plot::entry_point(args),
_ => {
Expand All @@ -23,7 +23,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
eprintln!("Possible subcommands are:");
eprintln!("\tcreate \tCreate a new D4 depth profile");
eprintln!("\tframedump\tDump The container data");
eprintln!("\tshow \tPrint the underlying depth profile");
eprintln!("\tview \tPrint the underlying depth profile");
eprintln!("\tstat \tRun statistics on the given file");
eprintln!("\tplot \tPlot the specified region");
Ok(())
Expand Down
6 changes: 3 additions & 3 deletions d4utils/src/show/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ use clap::{load_yaml, App};
use d4::ptab::{DecodeResult, PTablePartitionReader, UncompressedReader};
use d4::stab::{RangeRecord, STablePartitionReader, SimpleKeyValueReader};
use d4::D4FileReader;
use std::io::{Write, Result as IOResult};
use regex::Regex;
use std::io::{Result as IOResult, Write};
fn write_bed_record_fast<W: Write>(
mut writer: W,
chr: &str,
Expand Down Expand Up @@ -54,7 +54,7 @@ pub fn entry_point(args: Vec<String>) -> Result<(), Box<dyn std::error::Error>>
let end: u32 = captures
.name("TO")
.map_or(!0u32, |x| x.as_str().parse().unwrap_or(!0));
return (chr.to_string(), start, end);
return (chr.to_string(), start - 1, end - 1);
}
panic!("Unexpected region specifier")
})
Expand All @@ -64,7 +64,7 @@ pub fn entry_point(args: Vec<String>) -> Result<(), Box<dyn std::error::Error>>
.iter_mut()
.find(|((p, _), _)| p.region().0 == chr)
{
part.1.push((start, end));
part.1.push((start - 1, end - 1));
}
});
});
Expand Down
24 changes: 24 additions & 0 deletions d4utils/src/stat/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,27 @@ fn percentile_stat(matches: ArgMatches, percentile: f64) -> Result<(), Box<dyn s
Ok(())
}

fn hist_stat(matches: ArgMatches) -> Result<(), Box<dyn std::error::Error>> {
let histograms = run_task::<Histogram>(matches, 0..1000)?;
let mut hist_result = [0; 1001];
let (mut below, mut above) = (0, 0);
for (_, _, _, (b, hist, a)) in histograms {
below += b;
above += a;
for (id, val) in hist.iter().enumerate() {
hist_result[id + 1] += val;
}
}

println!("<0\t{}", below);
for (val, cnt) in hist_result[1..].into_iter().enumerate() {
println!("{}\t{}", val, cnt);
}
println!(">1000\t{}", above);

Ok(())
}

pub fn entry_point(args: Vec<String>) -> Result<(), Box<dyn std::error::Error>> {
let yaml = load_yaml!("cli.yml");
let matches = App::from_yaml(yaml).get_matches_from(&args);
Expand All @@ -89,6 +110,9 @@ pub fn entry_point(args: Vec<String>) -> Result<(), Box<dyn std::error::Error>>
Some("median") => {
percentile_stat(matches, 0.5)?;
}
Some("hist") => {
hist_stat(matches)?;
}
Some(whatever) if whatever.starts_with("percentile=") => {
let prefix_len = "percentile=".len();
let percentile: f64 = whatever[prefix_len..].parse()?;
Expand Down

0 comments on commit 994e1f6

Please sign in to comment.