Skip to content

Commit

Permalink
Merge pull request #27 from bacpop/johanna_dev
Browse files Browse the repository at this point in the history
Add delete function and more tests
  • Loading branch information
johnlees authored Oct 3, 2024
2 parents fc12703 + 35db878 commit 38a2a28
Show file tree
Hide file tree
Showing 20 changed files with 630 additions and 25 deletions.
5 changes: 5 additions & 0 deletions .github/configs/grcov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
branch: false
ignore-not-existing: true
llvm: true
output-type: lcov
output-file: ./lcov.info
45 changes: 45 additions & 0 deletions .github/workflows/codecov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: Rust codecov

on:
push:
branches: [ '*' ] # Run on all branches
pull_request:
branches: [ '*' ] # Run on PRs to all branches

env:
CARGO_TERM_COLOR: always

jobs:
build:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
with:
toolchain: nightly
override: true
components: llvm-tools-preview # Required for grcov

- name: Build
run: cargo build --verbose

- name: Run tests
run: cargo test --verbose --no-fail-fast
env:
CARGO_INCREMENTAL: '0'
RUSTFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests'
RUSTDOCFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests'

- name: Run grcov
run: |
cargo install grcov
grcov . -s . --binary-path ./target/debug/ -t lcov --branch --ignore-not-existing --ignore "/*" -o lcov.info
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }}
files: lcov.info
fail_ci_if_error: true
verbose: true
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
tests/files_out/*
tests/sketchlib_c/*

# venvs
3di_venv
Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# sketchlib.rust

<!-- badges: start -->
[![Cargo Build & Test](https://github.com/bacpop/sketchlib.rust/actions/workflows/ci.yml/badge.svg)](https://github.com/bacpop/sketchlib.rust/actions/workflows/ci.yml)
[![Clippy check](https://github.com/bacpop/sketchlib.rust/actions/workflows/clippy.yml/badge.svg)](https://github.com/bacpop/sketchlib.rust/actions/workflows/clippy.yml)
[![codecov](https://codecov.io/gh/bacpop/sketchlib.rust/graph/badge.svg?token=IBYPTT4J3F)](https://codecov.io/gh/bacpop/sketchlib.rust)
<!-- badges: end -->

## Description

This is a reimplementation of [pp-sketchlib](https://github.com/bacpop/pp-sketchlib)
Expand Down
7 changes: 7 additions & 0 deletions codecov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
coverage:
precision: 2
round: down
range: "70...100"

ignore:
- "tests/**/*"
16 changes: 16 additions & 0 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,22 @@ pub enum Commands {
#[arg(long, value_enum, default_value_t = DEFAULT_LEVEL)]
level: AaLevel,
},

/// Delete genome(s) from a database (input: one id per line)
Delete {
/// Sketching database
#[arg(required = true)]
db: String,

/// Input file with IDs to delete (one ID per line)
#[arg(required = true)]
samples: String,

/// output file name
#[arg(required = true)]
output_file: String,
},

/// Print information about a .skm file
Info {
/// Sketch metadata file (.skm) to describe
Expand Down
34 changes: 34 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ pub mod utils;
use std::fs::{File, OpenOptions};
use std::io::copy;

use std::io::BufRead;
use std::path::Path;

/// Default k-mer size for (genome) sketching
pub const DEFAULT_KMER: usize = 17;
/// Chunk size in parallel distance calculations
Expand Down Expand Up @@ -488,6 +491,37 @@ pub fn main() -> Result<(), Error> {
Ok(())
}

Commands::Delete {
db,
samples,
output_file,
} => {
let ref_db = utils::strip_sketch_extension(db);

log::info!("Reading input genomes");
let path = Path::new(samples);
let file = File::open(path)?;
let reader = std::io::BufReader::new(file);

// Read in genomes to
let ids: Vec<String> = reader.lines().map_while(Result::ok).collect();

log::info!("Reading input metadata");
let mut sketches: MultiSketch = MultiSketch::load(ref_db)
.unwrap_or_else(|_| panic!("Could not read sketch metadata from {}.skm", ref_db));

// write new .skm
sketches.remove_metadata(output_file, &ids)?;

// remove samples from .skd file
log::info!("Remove genomes and writing output");
sketches.remove_genomes(ref_db, output_file, &ids)?;

log::info!("Finished writing filtered sketch data to {}", output_file);

Ok(())
}

Commands::Info {
skm_file,
sample_info,
Expand Down
81 changes: 75 additions & 6 deletions src/multisketch.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
use core::panic;
use anyhow::bail;
use anyhow::Error;
use anyhow::{Result, anyhow};
// use thiserror::Error;
use core::panic;
use std::fmt;
use std::fs::File;
use std::io::{BufReader, BufWriter};
Expand All @@ -12,6 +15,7 @@ use crate::hashing::HashType;
use crate::sketch::{Sketch, BBITS};
use crate::sketch_datafile::SketchArrayFile;

use std::collections::HashSet;
#[derive(Serialize, Deserialize)]
pub struct MultiSketch {
pub sketch_size: u64,
Expand Down Expand Up @@ -147,11 +151,6 @@ impl MultiSketch {
s1_slice
}

pub fn remove_sketches(&self, ids: &[String]) {
// TODO: remove sketch bins which belong to the duplicate ids
todo!();
}

pub fn is_compatible_with(&self, sketch2: &Self) -> bool {
self.kmer_lengths() == sketch2.kmer_lengths()
&& self.sketch_size == sketch2.sketch_size
Expand Down Expand Up @@ -196,6 +195,76 @@ impl MultiSketch {

self
}
pub fn remove_metadata(
&mut self,
output_file_name: &str,
genome_ids_to_remove: &[String],
) -> anyhow::Result<()> {
let mut new_sketch_metadata: Vec<Sketch> =
Vec::with_capacity(self.sketch_metadata.len() - genome_ids_to_remove.len());
let mut removed_samples = Vec::new();

for sketch in &self.sketch_metadata {

if !genome_ids_to_remove.contains(&(*sketch.name()).to_string()) {
new_sketch_metadata.push(sketch.clone());

} else {
removed_samples.push(sketch.name());
}
}

let set1: HashSet<&str> = removed_samples.iter().map(AsRef::as_ref).collect();
let set2: HashSet<&str> = genome_ids_to_remove.iter().map(AsRef::as_ref).collect();
let missing: Vec<&&str> = set2.difference(&set1).collect();
if !missing.is_empty() {
bail!("The following samples have not been found in the database: {:?}", missing);
}

self.sketch_metadata = new_sketch_metadata;
self.save_metadata(output_file_name)?;
Ok(())
}

pub fn remove_genomes(
&mut self,
input_prefix: &str,
output_file: &str,
genome_ids_to_remove: &[String],
) -> anyhow::Result<()> {
let mut positions_to_remove = Vec::new();
let mut missing_ids = Vec::new();

for id in genome_ids_to_remove {
if let Some(&position) = self.name_map.get(id) {
positions_to_remove.push(position);
} else {
missing_ids.push(id);
}
}

if !missing_ids.is_empty() {
bail!("The following genome IDs were not found: {:?}", missing_ids);
}

// Create a list of indices to keep
let indices_to_keep: Vec<usize> = (0..self.sketch_metadata.len())
.filter(|&idx| !positions_to_remove.contains(&idx))
.collect();

let input_filename = format!("{}.skd", input_prefix);
let output_filename = format!("{}.skd", output_file);
if let Err(e) = SketchArrayFile::write_batch(
&input_filename,
&output_filename,
&indices_to_keep,
self.sample_stride,
) {
return Err(anyhow!("Error during batch write: {}", e));
}

Ok(())
}

// This function is called when sketches are merged, not when they are
// first sketched (this is handled by sketch::sketch_files())
Expand Down
16 changes: 14 additions & 2 deletions src/sketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ use crate::sketch_datafile::SketchArrayFile;
#[cfg(feature = "3di")]
use crate::structures::pdb_to_3di;

use std::path::Path;

/// Bin bits (lowest of 64-bits to keep)
pub const BBITS: u64 = 14;
/// Total width of all bins (used as sign % sign_mod)
Expand Down Expand Up @@ -59,7 +61,7 @@ impl Sketch {
let mut minhash_sum = 0.0;
let mut densified = false;
let num_bins: u64 = sketch_size * (u64::BITS as u64);
let bin_size: u64 = (SIGN_MOD + num_bins - 1) / num_bins;
let bin_size: u64 = SIGN_MOD.div_ceil(num_bins);
for k in kmer_lengths {
log::debug!("Running sketching at k={k}");
// Setup storage for each k
Expand Down Expand Up @@ -150,6 +152,16 @@ impl Sketch {
}
}

fn remove_extension(n: &str) -> String {
let stem = Path::new(n).file_stem().unwrap().to_str().unwrap();
stem.strip_suffix(".fa")
.or_else(|| stem.strip_suffix(".fasta"))
.or_else(|| stem.strip_suffix(".fa.gz"))
.or_else(|| stem.strip_suffix(".fasta.gz"))
.unwrap_or(stem)
.to_string()
}

#[inline(always)]
fn universal_hash(s: u64, t: u64) -> u64 {
let x = s
Expand Down Expand Up @@ -290,7 +302,7 @@ pub fn sketch_files(
let sample_name = if concat_fasta {
format!("{name}_{}", idx + 1)
} else {
name.to_string()
Sketch::remove_extension(name)
};
if hash_it.seq_len() == 0 {
panic!("{sample_name} has no valid sequence");
Expand Down
27 changes: 27 additions & 0 deletions src/sketch_datafile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,33 @@ impl SketchArrayFile {
flat_sketch_array
}

pub fn write_batch(
input_filename: &str,
output_filename: &str,
sample_indices: &[usize],
sample_stride: usize,
) -> Result<(), Box<dyn std::error::Error>> {
let mmap = Self::memmap_file(input_filename)
.unwrap_or_else(|_| panic!("Could not memory map {input_filename}"));

let output_file = File::create(output_filename)?;
let mut writer = BufWriter::new(output_file);

for &sample_idx in sample_indices {
let mut sample_data = Vec::with_capacity(sample_stride);
for bin_idx in 0..sample_stride {
let start_byte = (sample_idx * sample_stride + bin_idx) * mem::size_of::<u64>();
let bin_val =
u64::from_le_bytes(*array_ref![mmap, start_byte, mem::size_of::<u64>()]);
sample_data.push(bin_val);
}
Self::write_sketch_data(&mut writer, &sample_data)?;
}

writer.flush()?;
Ok(())
}

fn memmap_file(filename: &str) -> Result<Mmap, Box<dyn Error>> {
let file = File::open(filename)?;
let mmap = unsafe { Mmap::map(&file)? };
Expand Down
44 changes: 30 additions & 14 deletions tests/common/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,36 @@ pub struct TestSetup {
impl TestSetup {
pub fn setup() -> Self {
let wd = assert_fs::TempDir::new().unwrap();
wd.child(SYM_IN)
.symlink_to_dir(
Path::new(FILE_IN)
.canonicalize()
.expect("Could not link expected files"),
)
.unwrap();
wd.child(SYM_TEST)
.symlink_to_dir(
Path::new(FILE_TEST)
.canonicalize()
.expect("Could not link expected files"),
)
.unwrap();

let file_in_path = Path::new(FILE_IN);
let file_test_path = Path::new(FILE_TEST);

// Attempt to create symlink for SYM_IN
match wd
.child(SYM_IN)
.symlink_to_dir(file_in_path.canonicalize().unwrap_or_else(|e| {
panic!(
"Failed to canonicalize FILE_IN path: {:?}. Error: {:?}",
file_in_path, e
);
})) {
Ok(_) => println!("Successfully created symlink for {}", SYM_IN),
Err(e) => println!("Failed to create symlink for {}: {:?}", SYM_IN, e),
}

// Attempt to create symlink for SYM_TEST
match wd
.child(SYM_TEST)
.symlink_to_dir(file_test_path.canonicalize().unwrap_or_else(|e| {
panic!(
"Failed to canonicalize FILE_TEST path: {:?}. Error: {:?}",
file_test_path, e
);
})) {
Ok(_) => println!("Successfully created symlink for {}", SYM_TEST),
Err(e) => println!("Failed to create symlink for {}: {:?}", SYM_TEST, e),
}

Self { wd }
}

Expand Down
Loading

0 comments on commit 38a2a28

Please sign in to comment.