Skip to content

Commit

Permalink
Merge pull request #34 from bacpop/release
Browse files Browse the repository at this point in the history
Prepare for first release
  • Loading branch information
johnlees authored Jan 27, 2025
2 parents 37a5879 + 7ba0943 commit 948d240
Show file tree
Hide file tree
Showing 14 changed files with 184 additions and 20 deletions.
27 changes: 13 additions & 14 deletions .github/workflows/codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,29 +17,28 @@ jobs:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
with:
toolchain: nightly
toolchain: stable
override: true
components: llvm-tools-preview # Required for grcov

- name: Build
run: cargo build --verbose

- name: Run tests
run: cargo test --verbose --no-fail-fast
- name: Install cargo-llvm-cov and run tests
run: cargo install cargo-llvm-cov && cargo llvm-cov --lcov --output-path=./lcov.info
env:
CARGO_INCREMENTAL: '0'
RUSTFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests'
RUSTDOCFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests'
RUSTFLAGS: '-Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cinstrument-coverage'
RUSTDOCFLAGS: '-Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cinstrument-coverage'

- name: Run grcov
run: |
cargo install grcov
grcov . -s . --binary-path ./target/debug/ -t lcov --branch --ignore-not-existing --ignore "/*" -o lcov.info
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
- name: Codecov
# You may pin to the exact commit or the version.
uses: codecov/[email protected]
with:
# Repository upload token - get it from codecov.io. Required only for private repositories
token: ${{ secrets.CODECOV_TOKEN }}
files: lcov.info
fail_ci_if_error: true
file: ./lcov.info
# Specify whether the Codecov output should be verbose
verbose: true
fail_ci_if_error: true

93 changes: 93 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Release workflow: triggered by pushing a tag matching v*.*.*.
# Three jobs are defined: build-binaries (per-OS tarballs), create-release
# (GitHub release with those tarballs attached) and push_crate (crate publish).
name: Make release

on:
push:
tags:
- "v*.*.*"

env:
CARGO_TERM_COLOR: always
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

jobs:

# Build and package the sketchlib binary once per matrix entry.
build-binaries:
runs-on: ${{ matrix.config.os }}

name: Release ${{ matrix.config.os }} (${{ matrix.config.toolchain }})

strategy:
# Do not cancel the other matrix builds when one of them fails.
fail-fast: false
matrix:
config:
- {os: macOS-latest, toolchain: 'stable'}
- {os: ubuntu-latest, toolchain: 'stable'}

steps:
- uses: actions/checkout@v3

- name: Install rust toolchain
uses: actions-rs/toolchain@v1
with:
toolchain: ${{ matrix.config.toolchain }}
override: true

# NB see https://github.com/actions-rs/cargo if we ever want to try cross
# e.g. for Mac M1/arm64
# The step below installs from the checked-out source, copies the installed
# binary into the workspace and tars it with LICENSE, NOTICE and README.md.
# The tarball name embeds the tag, OS and toolchain.
- name: Build and package binary
shell: bash
run: |
cargo install --path .
cp $HOME/.cargo/bin/sketchlib .
tar czvf sketchlib-${{ github.ref_name }}-${{ matrix.config.os }}-${{ matrix.config.toolchain }}.tar.gz sketchlib LICENSE NOTICE README.md
- name: Upload package
if: success()
uses: actions/upload-artifact@v4
with:
name: sketchlib-${{ github.ref_name }}-${{ matrix.config.os }}-${{ matrix.config.toolchain }}
path: sketchlib-${{ github.ref_name }}-${{ matrix.config.os }}-${{ matrix.config.toolchain }}.tar.gz

# Collect the tarballs from build-binaries and attach them to a GitHub release.
create-release:
runs-on: ubuntu-latest

# Wait for the build-binaries matrix jobs before running.
needs: build-binaries

steps:
- uses: actions/checkout@v2

# No artifact name is given, so all artifacts are fetched; the copy step
# below expects each one in its own build/<artifact-name>/ subdirectory.
- uses: actions/download-artifact@v4
with:
path: build

# The paths in this step are hard-coded to the two matrix entries of
# build-binaries (macOS-latest/stable and ubuntu-latest/stable); keep them in
# sync if the matrix changes.
- name: Organise files
shell: bash
run: |
cp build/sketchlib-${{ github.ref_name }}-macOS-latest-stable/sketchlib-${{ github.ref_name }}-macOS-latest-stable.tar.gz .
cp build/sketchlib-${{ github.ref_name }}-ubuntu-latest-stable/sketchlib-${{ github.ref_name }}-ubuntu-latest-stable.tar.gz .
- name: Create release
id: create_release
uses: softprops/action-gh-release@v1
with:
name: Release ${{ github.ref_name }}
draft: false
prerelease: false
fail_on_unmatched_files: true
generate_release_notes: true
files: |
sketchlib-*.tar.gz
push_crate:
# Publishes the crate using the CARGO_REGISTRY_TOKEN secret.
# NOTE(review): this job declares no `needs:`, so it appears to run in
# parallel with build-binaries rather than after it; confirm that publishing
# regardless of the binary build result is intended.
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
# Publish only for tag refs (the workflow trigger above is already tag-only).
- uses: katyo/publish-crates@v1
if: startsWith(github.ref, 'refs/tags/')
with:
registry-token: ${{ secrets.CARGO_REGISTRY_TOKEN }}
2 changes: 1 addition & 1 deletion .github/workflows/version.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
on:
pull_request:
branches:
- master
- main

# This runs on PRs so error can be seen before merging
name: Version check
Expand Down
8 changes: 5 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
[package]
name = "sketchlib"
version = "0.1.0"
version = "0.1.1"
authors = [
"John Lees <[email protected]>",
"Nicholas Croucher <[email protected]>"
"Nicholas Croucher <[email protected]>",
"Johanna von Wachsmann <[email protected]>",
"Victor Rodriguez Bouza <[email protected]>",
]
edition = "2021"
description = "Genome and amino-acid sketching"
Expand Down Expand Up @@ -65,4 +67,4 @@ assert_fs = "1.0.10"
pretty_assertions = "1.3.0"

[profile.release]
debug = true
lto = true
59 changes: 58 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,64 @@
## Description

This is a reimplementation of [pp-sketchlib](https://github.com/bacpop/pp-sketchlib)
in the rust language.
in the rust language. This version is optimised for larger sample numbers, particularly
allowing subsets of samples to be compared.

Sketch databases have two files: `.skm` which is the metadata (sample names, base counts etc)
and `.skd` which is the actual sketch data.

## Usage
With all options we typically recommend using `-v` to see all progress during the run.

### Sketching

Using input fasta/fastq files, create a sketch database. Run `sketchlib sketch -h` to see the help.

- List .fasta files on the command line, or use `-f` to provide a file (or files) listing the inputs.
In such a file, each line describes one sample: the name and a fasta file, or the name and two read files
(fastq). Inputs can be gzipped or uncompressed; this is detected automatically.
- To set the k-mer size in the sketch database you can either give a list of sizes with `--k-vals`
or a sequence `--k-seq` with start,stop,step. e.g. `--k-seq 17,29,4` would sketch at k=17, 21, 25 and 29.
- Set the sketch size with `-s`. Typically 1000 is enough for species level resolution, 10000 for within-species/strain
resolution and 100000-1000000 for SNP level resolution.
- To sketch amino acid sequences use `--seq-type aa --concat-fasta` if you have the typical case
of each fasta file being a multifasta with many aa sequences. Each one will then be its own sample.
- You can also sketch structures with .pdb input, see 'Enabling PDB->3Di' below. This is experimental.

### Distances

To compute internal all-vs-all core and accessory distances use:
```
sketchlib dist db_name
```
Note the database names can be the prefix, or the full path to the .skm file. The output
is in pairwise 'long' format, which lists the upper triangle of the distance matrix row-by-row.

To calculate distances between two different sample sets, each in their own sketch database, use:
```
sketchlib dist db1 db2
```
For example, if you want to query distances of a new sample against an existing database,
first sketch the new sample with e.g. `sketchlib sketch -o db2 new_sample.fasta`, then
run the above command.

Modifiers:
- Use `-k` to calculate Jaccard distance at the given k. Otherwise the default is to
calculate across multiple k and output core and accessory distances.
- Use `--ani` with `-k` to transform the Jaccard distance into average nucleotide identity.
- Use `--subset` to provide a list of sample names to include in the distance calculations;
only these samples will be loaded from the `.skd` file.
- Use `-o` to write the distances to a file. The default is to write to stdout, so you can also
use `>` to redirect to a file (progress messages are written to stderr).
- Use `--knn` to only keep this many nearest neighbour distances. For very large databases
it may be useful to keep only ~50 distances. This makes the memory use manageable. This sparse output
can be used with e.g. [mandrake](https://github.com/bacpop/mandrake).

### Other operations

- `merge` joins two existing sketch databases.
- `append` sketches new input samples, and adds them to an existing database.
- `delete` removes samples from a sketch database.

## Enabling PDB->3Di
conda doesn't work, so make sure it is deactivated
Expand Down
1 change: 1 addition & 0 deletions src/distances.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Functions and traits for calculating and storing distances
use std::cmp::Ordering;
use std::fmt;

Expand Down
1 change: 1 addition & 0 deletions src/hashing/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! [nthash](https://github.com/bcgsc/ntHash) and [aahash](https://github.com/bcgsc/ntHash) iterators
use clap::ValueEnum;
use serde::{Deserialize, Serialize};

Expand Down
1 change: 1 addition & 0 deletions src/io.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Functions to read input fasta/fastq files
use std::fs::File;
use std::io::{stdout, BufRead, BufReader, BufWriter, Write};
use std::path::Path;
Expand Down
1 change: 1 addition & 0 deletions src/jaccard.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Implementation of Jaccard, core and accessory distance calculations
use crate::multisketch::MultiSketch;
use crate::sketch::BBITS;

Expand Down
7 changes: 6 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
//! DOCS
//! Fast distance calculations between biological sequences (DNA, AA or structures
//! via the 3di alphabet). Distances are based on bindash approximations of the Jaccard
//! distance, with the PopPUNK method to calculate core and accessory distances. nthash/aahash
//! are used for hash functions to create the sketches
//!
//! This package is a work in progress, but is mature enough for research use. See README.md
//! for current CLI usage.
// #![warn(missing_docs)]

Expand Down
1 change: 1 addition & 0 deletions src/multisketch.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! The class to support .skm/.skd reading and writing, containing multiple [`Sketch`] objects
use anyhow::bail;
use anyhow::Error;
use anyhow::{Result, anyhow};
Expand Down
1 change: 1 addition & 0 deletions src/sketch.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Methods to create a single sample's sketch
use std::cmp::Ordering;
use std::fmt;
use std::sync::mpsc;
Expand Down
1 change: 1 addition & 0 deletions src/sketch_datafile.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! I/O support and memory mapping used for lower level read/write to .skd
use memmap2::Mmap;
use std::error::Error;
use std::fs::File;
Expand Down
1 change: 1 addition & 0 deletions src/structures.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
//! Support for .pdb files and the 3di alphabet
use anyhow::Error;
use crate::io::InputFastx;

Expand Down

0 comments on commit 948d240

Please sign in to comment.