Skip to content

Commit

Permalink
add concat functionality (no tests yet)
Browse files Browse the repository at this point in the history
  • Loading branch information
Johanna committed Aug 5, 2024
1 parent 35e4258 commit 4cf4faf
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 2 deletions.
43 changes: 43 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,49 @@ pub enum Commands {
#[arg(required = true, short)]
output: String,
},
/// Concat one sketch file (.skm and .skd pair) with new genomes
// Subcommand arguments for extending an existing sketch database with newly
// sketched inputs. k-mer lengths, sketch size and hash type are not options
// here: the handler reads them from the loaded database metadata.
Concat {
/// The first .skd (sketch data) file
// NOTE(review): the handler passes this value to `MultiSketch::load` and
// reports failures as "<db>.skm", so it looks like the database prefix
// rather than a .skd path — confirm and reword the help text.
#[arg(required = true)]
db: String,

/// List of input FASTA files
// Shares the clap arg group "input" with `file_list`.
#[arg(long, group = "input", num_args = 1.., value_delimiter = ',')]
seq_files: Option<Vec<String>>,

/// File listing input files (tab separated name, sequences)
// Shares the clap arg group "input" with `seq_files`.
// NOTE(review): neither member of the group is marked required — confirm
// what `get_input_list` does when both are `None`.
#[arg(short, group = "input")]
file_list: Option<String>,

/// Output filename for the merged sketch
// Passed to both `sketch_files` and `save_metadata` by the handler.
#[arg(required = true, short)]
output: String,

/// Ignore reverse complement (all contigs are oriented along same strand)
// The handler negates this flag to obtain the `rc` argument of `sketch_files`.
#[arg(long, default_value_t = DEFAULT_STRAND)]
single_strand: bool,

/// Minimum k-mer count (with reads)
#[arg(long, default_value_t = DEFAULT_MINCOUNT)]
min_count: u16,

/// Minimum k-mer quality (with reads)
#[arg(long, default_value_t = DEFAULT_MINQUAL)]
min_qual: u8,

/// Treat every sequence in an input file as a new sample (aa only)
// TODO: for now, could be extended to dna, but probably no need
// The handler panics when this is set and the database hash type is DNA or PDB.
#[arg(long, default_value_t = false)]
concat_fasta: bool,

/// Number of CPU threads
// NOTE(review): in the visible Concat handler this value is only logged —
// confirm it is actually applied to the sketching step.
#[arg(long, value_parser = valid_cpus, default_value_t = 1)]
threads: usize,

/// aaHash 'level'
// Only consulted when the database hash type is AA: the handler then sketches
// with `HashType::AA(level)` in place of the level stored in the database.
#[arg(long, value_enum, default_value_t = DEFAULT_LEVEL)]
level: AaLevel,
},
/// Print information about a .skm file
Info {
/// Sketch metadata file (.skm) to describe
Expand Down
79 changes: 79 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,85 @@ pub fn main() -> Result<(), Error> {
utils::save_sketch_data(ref_db_name1, ref_db_name2, output)
}

Commands::Concat {
db,
seq_files,
file_list,
output,
single_strand,
min_count,
min_qual,
concat_fasta,
threads,
level,
} => {

//get input files
log::info!("Getting input files");
let input_files: Vec<(String, String, Option<String>)> =
get_input_list(file_list, seq_files);
log::info!("Parsed {} samples in input list", input_files.len());

//check whether any of the new files already exist in the db
let db_metadata: MultiSketch = MultiSketch::load(db)
.unwrap_or_else(|_| panic!("Could not read sketch metadata from {}.skm", db));
println!("{:?}", db_metadata);

println!("{:?}", db_metadata.kmer_lengths());
db_metadata.concat_competibility(&input_files);
log::info!("Passed concat check");

// read out sketching information needed to sketch the new files
let kmers = db_metadata.kmer_lengths();
// Build, merge
let rc = !*single_strand;
// Set expected sketchsize
let sketch_size = db_metadata.sketch_size;
// Set aa level
let seq_type = db_metadata.get_hash_type();

if *concat_fasta && matches!(*seq_type, HashType::DNA | HashType::PDB) {
panic!("--concat-fasta currently only supported with --seq-type aa");
}
log::info!(
"Running sketching: k:{:?}; sketch_size:{}; seq:{:?}; threads:{}",
kmers,
sketch_size * u64::BITS as u64,
seq_type,
threads,
);

let seq_type = if let HashType::AA(_) = seq_type {
HashType::AA(level.clone())
} else {
seq_type.clone()
};
// sketch freshly incoming files
let mut db2_sketches = sketch_files(
output,
&input_files,
*concat_fasta,
&kmers,

Check warning on line 456 in src/lib.rs

View workflow job for this annotation

GitHub Actions / clippy

this expression creates a reference which is immediately dereferenced by the compiler

warning: this expression creates a reference which is immediately dereferenced by the compiler --> src/lib.rs:456:17 | 456 | &kmers, | ^^^^^^ help: change this to: `kmers` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#needless_borrow = note: `#[warn(clippy::needless_borrow)]` on by default
sketch_size,
&seq_type,
rc,
*min_count,
*min_qual,
);
let db2_metadata = MultiSketch::new(&mut db2_sketches, sketch_size, &kmers, seq_type);

Check warning on line 463 in src/lib.rs

View workflow job for this annotation

GitHub Actions / clippy

this expression creates a reference which is immediately dereferenced by the compiler

warning: this expression creates a reference which is immediately dereferenced by the compiler --> src/lib.rs:463:81 | 463 | let db2_metadata = MultiSketch::new(&mut db2_sketches, sketch_size, &kmers, seq_type); | ^^^^^^ help: change this to: `kmers` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#needless_borrow
db2_metadata
.save_metadata(output)
.expect("Error saving metadata");

// // save skd data from db1 and from freshly sketched input files
// log::info!("Merging and saving sketch data to {}.skd", output);
// utils::save_sketch_data(db_metadata, db2, output);

// // read in skm from db1
// // merge and update skm from db1 and the new just sketched sketch
Ok(())
}

Commands::Info {
skm_file,
sample_info,
Expand Down
11 changes: 11 additions & 0 deletions src/multisketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,17 @@ impl MultiSketch {
&& self.get_hash_type() == sketch2.get_hash_type()
}

/// Checks that none of the incoming entries clash with this database.
///
/// Only the first element of each tuple in `name_vec` is inspected; it is
/// looked up among the keys of `name_map`. The remaining two elements are
/// ignored.
///
/// # Panics
///
/// Panics on the first entry (in slice order) whose first element is already
/// a key of `name_map`.
pub fn concat_competibility(&self, name_vec: &[(String, String, Option<String>)]) {
    // Locate the earliest clashing identifier, if there is one.
    let clash = name_vec
        .iter()
        .map(|(sample, _, _)| sample)
        .find(|sample| self.name_map.contains_key(*sample));

    if let Some(id) = clash {
        panic!(
            "{} appears in both the database and the provided files. Cannot concat files.",
            id
        );
    }
}

pub fn merge_sketches(&mut self, sketch2: &Self) -> &mut Self {
// First metadata
let offset = self.sketch_metadata.len();
Expand Down
4 changes: 2 additions & 2 deletions tests/merge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,8 @@ mod tests {

// Check .skm the same
let merged_sketch: MultiSketch =
MultiSketch::load(&sandbox.file_string("merged_test", TestDir::Output))
.expect("Failed to load output merged sketch");
MultiSketch::load(&sandbox.file_string("merged_test", TestDir::Output))
.expect("Failed to load output merged sketch");
let expected_sketch =
MultiSketch::load(&sandbox.file_string("merged_ref", TestDir::Output))
.expect("Failed to load expected merged sketch");
Expand Down

0 comments on commit 4cf4faf

Please sign in to comment.