Skip to content

Commit

Permalink
Merge pull request #22 from bacpop/johanna_dev
Browse files Browse the repository at this point in the history
Add command to merge sketch databases
  • Loading branch information
johnlees authored Aug 5, 2024
2 parents 9a67cca + 40411d5 commit 35e4258
Show file tree
Hide file tree
Showing 25 changed files with 656 additions and 16 deletions.
23 changes: 23 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Builds and tests the crate on every push, once per toolchain in the matrix below.
name: Cargo Build & Test

on:
push:

env:
# Always emit coloured cargo output in the CI log.
CARGO_TERM_COLOR: always

jobs:
build_and_test:
name: Rust project - latest
runs-on: ubuntu-latest
strategy:
matrix:
# One job is started for each toolchain listed here.
toolchain:
- stable
- beta
- nightly
steps:
- uses: actions/checkout@v3
# Update the toolchain selected by the matrix and make it the default for the steps below.
- run: rustup update ${{ matrix.toolchain }} && rustup default ${{ matrix.toolchain }}
- run: cargo build --verbose
- run: cargo test --verbose
16 changes: 16 additions & 0 deletions .github/workflows/clippy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Runs Clippy on every push, using the nightly toolchain.
on: push
name: Clippy check
jobs:
clippy_check:
runs-on: ubuntu-latest
steps:
# NOTE(review): the other workflows in this repository pin actions/checkout@v3; consider aligning this pin.
- uses: actions/checkout@v1
- uses: actions-rs/toolchain@v1
with:
# Install nightly with the clippy component and set it as the override toolchain.
toolchain: nightly
components: clippy
override: true
- uses: actions-rs/clippy-check@v1
with:
# Token passed to the clippy-check action.
token: ${{ secrets.GITHUB_TOKEN }}
# Extra arguments passed to clippy: lint with every crate feature enabled.
args: --all-features
24 changes: 24 additions & 0 deletions .github/workflows/version.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Triggered by pull requests that target the master branch.
on:
pull_request:
branches:
- master

# This runs on PRs so error can be seen before merging
name: Version check

jobs:
all:
runs-on: ubuntu-latest

name: Version check

env:
# Exposes the workflow token to the steps as GITHUB_PAT.
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

steps:
- uses: actions/checkout@v3
with:
# Fetch the full history rather than a shallow clone; the version script
# inspects git tags, which presumably requires this - confirm before changing.
fetch-depth: 0

# Fails the job when the script exits non-zero.
- name: Check version format and availability
run: ./scripts/version_check.sh
60 changes: 60 additions & 0 deletions scripts/version_check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env bash
set -e
# Usage:
#   version_check.sh [VERSION]
#
# Reads version from Cargo.toml (or takes it from the first argument) and
# checks it against the existing git tags:
#   1. the version must be exactly of the form 'x.y.z' (integers only);
#   2. the tag 'v<VERSION>' must not exist yet;
#   3. the version must be strictly greater than the last tag reachable from
#      the default branch of 'origin'.
# Exits 0 when all checks pass and 1 on the first failed check.
#
# Credit to @richfitz for this from the dust package:
# https://github.com/mrc-ide/dust/blob/master/scripts/version_check

# Only the first line starting with 'version' is used, so that later
# 'version = ...' lines (e.g. in dependency tables) cannot leak into VERSION.
VERSION=${1:-$(grep -m 1 '^version' Cargo.toml | sed -e 's/.*= *//' -e 's/"//g')}
TAG="v${VERSION}"

echo "Proposed version number '$VERSION'"

# The pattern is anchored: the components are compared as integers further
# down, so anything other than plain 'x.y.z' must be rejected here.
if echo "$VERSION" | grep -Eq "^[0-9]+[.][0-9]+[.][0-9]+$"; then
    echo "[OK] Version number in correct format"
else
    echo "[ERROR] Invalid format version number '$VERSION' must be in format 'x.y.z'"
    exit 1
fi

EXIT_CODE=0

echo "Updating remote git data"
git fetch --quiet

# Name of the default branch as reported by the remote, then the most recent
# tag reachable from it.
BRANCH_DEFAULT=$(git remote show origin | awk '/HEAD branch/ {print $NF}')
LAST_TAG=$(git describe --tags --abbrev=0 "origin/${BRANCH_DEFAULT}")

echo "Last tag was $LAST_TAG"

if git rev-parse "$TAG" >/dev/null 2>&1; then
    echo "[ERROR] Tag $TAG already exists - update version number in Cargo.toml"
    exit 1
else
    echo "[OK] Version number not yet present as git tag"
fi

MAJOR=$(echo "$VERSION" | cut -d. -f1)
MINOR=$(echo "$VERSION" | cut -d. -f2)
PATCH=$(echo "$VERSION" | cut -d. -f3)

# Drop the leading 'v' of the tag to obtain the previous version number.
LAST_VERSION=${LAST_TAG#v}
LAST_MAJOR=$(echo "$LAST_VERSION" | cut -d. -f1)
LAST_MINOR=$(echo "$LAST_VERSION" | cut -d. -f2)
LAST_PATCH=$(echo "$LAST_VERSION" | cut -d. -f3)

# Ordered comparison of (MAJOR, MINOR, PATCH): a lower-order component is only
# considered when all higher-order components are equal. Comparing each
# component on its own would accept e.g. 1.5.0 as an increase over 2.0.0.
if (( MAJOR > LAST_MAJOR )); then
    echo "[OK] Increasing MAJOR version"
    exit $EXIT_CODE
elif (( MAJOR == LAST_MAJOR && MINOR > LAST_MINOR )); then
    echo "[OK] Increasing MINOR version"
    exit $EXIT_CODE
elif (( MAJOR == LAST_MAJOR && MINOR == LAST_MINOR && PATCH > LAST_PATCH )); then
    echo "[OK] Increasing PATCH version"
    exit $EXIT_CODE
else
    echo "[ERROR] Version number has not increased relative to $LAST_VERSION"
    exit 1
fi
18 changes: 14 additions & 4 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,10 +147,20 @@ pub enum Commands {
#[arg(long, value_parser = valid_cpus, default_value_t = 1)]
threads: usize,
},
// TODO add a merge mode
// TODO add a concat mode (add sketch to existing DB)
// TODO add a delete mode
// TODO add a reorder mode
/// Merge two sketch files (.skm and .skd pair)
Merge {
/// The first .skd (sketch data) file
#[arg(required = true)]
db1: String,

/// The second .skd (sketch data) file
#[arg(required = true)]
db2: String,

/// Output filename for the merged sketch
#[arg(required = true, short)]
output: String,
},
/// Print information about a .skm file
Info {
/// Sketch metadata file (.skm) to describe
Expand Down
49 changes: 41 additions & 8 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use std::time::Instant;
#[macro_use]
extern crate arrayref;
extern crate num_cpus;
use anyhow::Error;
use indicatif::{ParallelProgressIterator, ProgressStyle};
use rayon::prelude::*;

Expand Down Expand Up @@ -37,13 +38,15 @@ use crate::io::{get_input_list, parse_kmers, read_subset_names, set_ostream};
pub mod bloom_filter;
pub mod hashing;

pub mod utils;

/// Default k-mer size for (genome) sketching
pub const DEFAULT_KMER: usize = 17;
/// Chunk size in parallel distance calculations
pub const CHUNK_SIZE: usize = 1000;

#[doc(hidden)]
pub fn main() {
pub fn main() -> Result<(), Error> {
let args = cli_args();
if args.verbose {
simple_logger::init_with_level(log::Level::Info).unwrap();
Expand All @@ -54,7 +57,7 @@ pub fn main() {

let mut print_success = true;
let start = Instant::now();
match &args.command {
let result = match &args.command {
Commands::Sketch {
seq_files,
file_list,
Expand Down Expand Up @@ -115,6 +118,7 @@ pub fn main() {
sketch_vec
.save_metadata(output)
.expect("Error saving metadata");
Ok(())
}
Commands::Dist {
ref_db,
Expand All @@ -130,11 +134,8 @@ pub fn main() {

let mut output_file = set_ostream(output);

let ref_db_name = if ref_db.ends_with(".skm") || ref_db.ends_with(".skd") {
&ref_db[0..ref_db.len() - 4]
} else {
ref_db.as_str()
};
let ref_db_name = utils::strip_sketch_extension(ref_db);

let mut references = MultiSketch::load(ref_db_name)
.unwrap_or_else(|_| panic!("Could not read sketch metadata from {ref_db}.skm"));

Expand Down Expand Up @@ -363,7 +364,37 @@ pub fn main() {
write!(output_file, "{distances}").expect("Error writing output distances");
}
}
Ok(())
}
Commands::Merge { db1, db2, output } => {
let ref_db_name1 = utils::strip_sketch_extension(db1);
let ref_db_name2 = utils::strip_sketch_extension(db2);

log::info!("Reading input metadata");
let mut sketches1: MultiSketch = MultiSketch::load(ref_db_name1).unwrap_or_else(|_| {
panic!("Could not read sketch metadata from {}.skm", ref_db_name1)
});

let sketches2: MultiSketch = MultiSketch::load(ref_db_name2).unwrap_or_else(|_| {
panic!("Could not read sketch metadata from {}.skm", ref_db_name2)
});
// check compatibility
if !sketches1.is_compatible_with(&sketches2) {
panic!("Databases are not compatible for merging.")
}

log::info!("Merging metadata to {}.skm", output);
let merged_sketch = sketches1.merge_sketches(&sketches2);
// merge metadata
merged_sketch
.save_metadata(output)
.unwrap_or_else(|_| panic!("Couldn't save metadata to {}", output));

// merge actual sketch data
log::info!("Merging and saving sketch data to {}.skd", output);
utils::save_sketch_data(ref_db_name1, ref_db_name2, output)
}

Commands::Info {
skm_file,
sample_info,
Expand All @@ -384,8 +415,9 @@ pub fn main() {
println!("{sketches:?}");
}
print_success = false; // Turn the final message off
Ok(())
}
}
};
let end = Instant::now();

log::info!("Complete");
Expand All @@ -395,4 +427,5 @@ pub fn main() {
end.duration_since(start).as_secs()
);
}
result
}
3 changes: 1 addition & 2 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use anyhow::Error;

fn main() -> Result<(), Error> {
sketchlib::main();
Ok(())
sketchlib::main()
}
Loading

0 comments on commit 35e4258

Please sign in to comment.