Skip to content

Commit

Permalink
add concat functionality with tests
Browse files Browse the repository at this point in the history
  • Loading branch information
Johanna committed Aug 6, 2024
1 parent 4cf4faf commit a3b2f4d
Show file tree
Hide file tree
Showing 7 changed files with 190 additions and 30 deletions.
4 changes: 2 additions & 2 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ pub enum Commands {
#[arg(required = true, short)]
output: String,
},
/// Concat one sketch file (.skm and .skd pair) with new genomes
/// Concat one sketch file (.skm and .skd pair) with new genomes (seq_files or files_list)
Concat {
/// The first .skd (sketch data) file
#[arg(required = true)]
Expand Down Expand Up @@ -200,7 +200,7 @@ pub enum Commands {
#[arg(long, value_parser = valid_cpus, default_value_t = 1)]
threads: usize,

/// aaHash 'level'
/// aaHash 'level'
#[arg(long, value_enum, default_value_t = DEFAULT_LEVEL)]
level: AaLevel,
},
Expand Down
47 changes: 28 additions & 19 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ pub mod bloom_filter;
pub mod hashing;

pub mod utils;
use std::fs::{File, OpenOptions};
use std::io::copy;

/// Default k-mer size for (genome) sketching
pub const DEFAULT_KMER: usize = 17;
Expand Down Expand Up @@ -407,7 +409,6 @@ pub fn main() -> Result<(), Error> {
threads,
level,
} => {

//get input files
log::info!("Getting input files");
let input_files: Vec<(String, String, Option<String>)> =
Expand All @@ -419,25 +420,23 @@ pub fn main() -> Result<(), Error> {
.unwrap_or_else(|_| panic!("Could not read sketch metadata from {}.skm", db));
println!("{:?}", db_metadata);

println!("{:?}", db_metadata.kmer_lengths());
db_metadata.concat_competibility(&input_files);
if !db_metadata.concat_competibility(&input_files) {
panic!("Databases are not compatible for merging.")
}
log::info!("Passed concat check");

// read out sketching information needed to sketch the new files
let kmers = db_metadata.kmer_lengths();
// Build, merge
let rc = !*single_strand;
// Set expected sketchsize
let sketch_size = db_metadata.sketch_size;
// Set aa level
let seq_type = db_metadata.get_hash_type();

if *concat_fasta && matches!(*seq_type, HashType::DNA | HashType::PDB) {
panic!("--concat-fasta currently only supported with --seq-type aa");
}

log::info!(
"Running sketching: k:{:?}; sketch_size:{}; seq:{:?}; threads:{}",
kmers,
&kmers,
sketch_size * u64::BITS as u64,
seq_type,
threads,
Expand All @@ -448,7 +447,7 @@ pub fn main() -> Result<(), Error> {
} else {
seq_type.clone()
};
// sketch freshly incoming files
// sketch genomes and save them to concat output file
let mut db2_sketches = sketch_files(
output,
&input_files,
Expand All @@ -460,17 +459,27 @@ pub fn main() -> Result<(), Error> {
*min_count,
*min_qual,
);
let db2_metadata = MultiSketch::new(&mut db2_sketches, sketch_size, &kmers, seq_type);
db2_metadata
.save_metadata(output)
.expect("Error saving metadata");

// // save skd data from db1 and from freshly sketched input files
// log::info!("Merging and saving sketch data to {}.skd", output);
// utils::save_sketch_data(db_metadata, db2, output);
let mut db2_metadata =
MultiSketch::new(&mut db2_sketches, sketch_size, kmers, seq_type);

// // read in skm from db1
// // merge and update skm from db1 and the new just sketched sketch
// save skd data from db1 and from freshly sketched input files
log::info!("Merging and saving sketch data to {}.skd", output);
// let mut output_file = File::create(format!("{}.skd", output))?;
let mut output_file = OpenOptions::new()
.create(true)
.append(true)
.open(format!("{}.skd", output))?;
// stream sketch data directly to concat output file
let mut db_sketch = File::open(format!("{}.skd", db))?;
println!("{:?}", db_sketch);
println!("{:?}", output_file);
copy(&mut db_sketch, &mut output_file)?;

// merge and update skm from db1 and the new just sketched sketch
let concat_metadata = db2_metadata.merge_sketches(&db_metadata);
concat_metadata
.save_metadata(output)
.unwrap_or_else(|_| panic!("Couldn't save metadata to {}", output));
Ok(())
}

Expand Down
9 changes: 4 additions & 5 deletions src/multisketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,15 +158,14 @@ impl MultiSketch {
&& self.get_hash_type() == sketch2.get_hash_type()
}

pub fn concat_competibility(&self, name_vec: &[(String, String, Option<String>)]) {
pub fn concat_competibility(&self, name_vec: &[(String, String, Option<String>)]) -> bool {
for (id, _, _) in name_vec.iter() {
if self.name_map.contains_key(id) {
panic!(
"{} appears in both the database and the provided files. Cannot concat files.",
id
);
println!("{} is found in both database and input fasta", id);
return false;
}
}
true
}

pub fn merge_sketches(&mut self, sketch2: &Self) -> &mut Self {
Expand Down
148 changes: 148 additions & 0 deletions tests/concat.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
use predicates::prelude::*;
use snapbox::cmd::{cargo_bin, Command};
use std::path::Path;

pub mod common;
use crate::common::*;

#[path = "../src/io.rs"]
pub mod io;

use sketchlib::multisketch::MultiSketch;

#[cfg(test)]

mod tests {
use super::*;

#[test]
fn concat_competibility_test() {
let sandbox = TestSetup::setup();
let ref_db1 = sandbox.file_string("sketches1", TestDir::Input);
let ref_db2 = sandbox.file_string("sketches2", TestDir::Input);
let file_list_name = sandbox.file_string("fasta.txt", TestDir::Input);
let non_seq_files: Option<Vec<String>> = None;


let file_list: Option<String> = Some(file_list_name);
//get input files with file_list
log::info!("Getting input files for test");
let input_file_list = io::get_input_list(&file_list, &non_seq_files);
log::info!("Parsed {} samples in input list", input_file_list.len());

//check if any of the new files are already existant in the db
let db1_metadata: MultiSketch = MultiSketch::load(&ref_db1)
.unwrap_or_else(|_| panic!("Could not read sketch metadata from {}.skm", ref_db1));
println!("{:?}", db1_metadata);

//check if any of the new files are already existant in the db
let db2_metadata: MultiSketch = MultiSketch::load(&ref_db2)
.unwrap_or_else(|_| panic!("Could not read sketch metadata from {}.skm", ref_db2));
println!("{:?}", db2_metadata);

// Test case 1:
assert!(
!db1_metadata.concat_competibility(&input_file_list),
"Sketches should not be compatible"
);
}

#[test]
fn test_concat_sketches() {
let sandbox = TestSetup::setup();

Command::new(cargo_bin("sketchlib"))
.current_dir(sandbox.get_wd())
.arg("sketch")
.args(&["--k-vals", "17"])
.arg("--seq-files")
.arg(sandbox.file_string("14412_3#82.contigs_velvet.fa.gz", TestDir::Input))
.arg(sandbox.file_string("14412_3#84.contigs_velvet.fa.gz", TestDir::Input))
.arg("-v")
.args(&["-o", "part1"])
.assert()
.success();
log::info!("Part 1 Sketched");

Command::new(cargo_bin("sketchlib"))
.current_dir(sandbox.get_wd())
.arg("sketch")
.args(&["--k-vals", "17"])
.arg("--seq-files")
.arg(sandbox.file_string("R6.fa.gz", TestDir::Input))
.arg(sandbox.file_string("TIGR4.fa.gz", TestDir::Input))
.arg("-v")
.args(&["-o", "part2"])
.assert()
.success();
log::info!("Part 2 Sketched");

Command::new(cargo_bin("sketchlib"))
.current_dir(sandbox.get_wd())
.arg("sketch")
.args(&["--k-vals", "17"])
.arg("--seq-files")
.arg(sandbox.file_string("R6.fa.gz", TestDir::Input))
.arg(sandbox.file_string("TIGR4.fa.gz", TestDir::Input))
.arg(sandbox.file_string("14412_3#82.contigs_velvet.fa.gz", TestDir::Input))
.arg(sandbox.file_string("14412_3#84.contigs_velvet.fa.gz", TestDir::Input))
.arg("-v")
.args(&["-o", "concat_ref"])
.assert()
.success();
log::info!("concat_ref Sketched");

// Overlapping labels fails
Command::new(cargo_bin("sketchlib"))
.current_dir(sandbox.get_wd())
.arg("concat")
.arg("part1")
.arg("--seq-files")
.arg(sandbox.file_string("14412_3#82.contigs_velvet.fa.gz", TestDir::Input))
.arg(sandbox.file_string("14412_3#84.contigs_velvet.fa.gz", TestDir::Input))
.arg("-v")
.args(&["-o", "concat_test"])
.assert()
.failure();

Command::new(cargo_bin("sketchlib"))
.current_dir(sandbox.get_wd())
.arg("concat")
.arg("part1")
.arg("--seq-files")
// .arg(sandbox.file_string("fasta_part2.txt", TestDir::Input))
.arg(sandbox.file_string("R6.fa.gz", TestDir::Input))
.arg(sandbox.file_string("TIGR4.fa.gz", TestDir::Input))
.arg("-v")
.args(&["-o", "concat_test"])
.assert()
.success();
log::info!("concat_test Sketched");

// Check .skm the same
let concat_sketch: MultiSketch =
MultiSketch::load(&sandbox.file_string("concat_test", TestDir::Output))
.expect("Failed to load output merged sketch");
let expected_sketch =
MultiSketch::load(&sandbox.file_string("concat_ref", TestDir::Output))
.expect("Failed to load expected merged sketch");
println!("{}", concat_sketch);
println!("{}", expected_sketch);
assert_eq!(
concat_sketch, expected_sketch,
"Concat sketch metadata does not match"
);

// Check .skd the same
let predicate_file = predicate::path::eq_file(Path::new(
&sandbox.file_string("concat_test.skd", TestDir::Output),
));
assert_eq!(
true,
predicate_file.eval(Path::new(
&sandbox.file_string("concat_ref.skd", TestDir::Output)
)),
"Concat sketch data does not match"
);
}
}
8 changes: 4 additions & 4 deletions tests/test_files_in/fasta.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
14412_3#82.contigs_velvet.fa tests/test_files_in/14412_3#82.contigs_velvet.fa
14412_3#84.contigs_velvet.fa tests/test_files_in/14412_3#84.contigs_velvet.fa
R6.fa tests/test_files_in/R6.fa
TIGR4.fa tests/test_files_in/TIGR4.fa
14412_3#82.contigs_velvet.fa tests/test_files_in/14412_3#82.contigs_velvet.fa.gz
14412_3#84.contigs_velvet.fa tests/test_files_in/14412_3#84.contigs_velvet.fa.gz
R6.fa tests/test_files_in/R6.fa.gz
TIGR4.fa tests/test_files_in/TIGR4.fa.gz
2 changes: 2 additions & 0 deletions tests/test_files_in/fasta_part1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
14412_3#82.contigs_velvet.fa tests/test_files_in/14412_3#82.contigs_velvet.fa.gz
14412_3#84.contigs_velvet.fa tests/test_files_in/14412_3#84.contigs_velvet.fa.gz
2 changes: 2 additions & 0 deletions tests/test_files_in/fasta_part2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
R6.fa /Users/wachsmannj/Documents/work/development/sketchlib.rust/tests/test_files_in/R6.fa.gz
TIGR4.fa /Users/wachsmannj/Documents/work/development/sketchlib.rust/tests/test_files_in/TIGR4.fa.gz

0 comments on commit a3b2f4d

Please sign in to comment.