Merge remote-tracking branch 'origin/main' into lvreynoso/diamond_sensitive
lvreynoso committed May 9, 2024
2 parents 4b64746 + 6d4a2be commit eff1869
Showing 61 changed files with 20,264 additions and 445 deletions.
43 changes: 43 additions & 0 deletions .github/workflows/index-generation-cargo-test.yml
@@ -0,0 +1,43 @@
name: Index Generation NCBI Compress cargo tests

on:
push:
paths:
- 'workflows/index-generation/ncbi-compress/**'

env:
LC_ALL: C.UTF-8
LANG: C.UTF-8
DEBIAN_FRONTEND: noninteractive

jobs:
cargo-test:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v2
- name: docker login ghcr.io
uses: docker/login-action@v1
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: docker build + push to ghcr.io
run: |
TAG=$(git describe --long --tags --always --dirty)
IMAGE_NAME=czid-index-generation-public
IMAGE_URI="ghcr.io/${GITHUB_REPOSITORY}/${IMAGE_NAME}"
CACHE_FROM=""; docker pull "$IMAGE_URI" && CACHE_FROM="--cache-from $IMAGE_URI"
./scripts/docker-build.sh "workflows/index-generation" --tag "${IMAGE_URI}:${TAG}" $CACHE_FROM \
|| ./scripts/docker-build.sh "workflows/index-generation" --tag "${IMAGE_URI}:${TAG}"
docker push "${IMAGE_URI}:${TAG}"
if [[ ${GITHUB_REF##*/} == "main" ]]; then
docker tag "${IMAGE_URI}:${TAG}" "${IMAGE_URI}:latest"
docker push "${IMAGE_URI}:latest"
fi
echo "IMAGE_URI=${IMAGE_URI}" >> $GITHUB_ENV
echo "TAG=${TAG}" >> $GITHUB_ENV
- name: run cargo tests
run: |
make cargo-test-index-generation
11 changes: 9 additions & 2 deletions Makefile
@@ -60,7 +60,11 @@ test-%: ## run miniwdl step tests eg. make test-short-read-mngs

integration-test-%: ## run miniwdl integration tests eg. make integration-test-short-read-mngs
pytest -v -n $$(( $$(nproc) - 1)) --durations=0 --tb=short --log-cli-level=11 workflows/$*/integration_test


.PHONY: cargo-test-index-generation
cargo-test-index-generation: ## run cargo test for index-generation
(cd workflows/index-generation/ncbi-compress && cargo test)

.PHONY: test
test: ## run miniwdl step tests for all workflows eg. make test
for i in $$(ls workflows); do $(MAKE) test-$$i; done
@@ -83,12 +87,15 @@ python-dependencies: .python_dependencies_installed # install python dependencie
run: build python-dependencies ## run a miniwdl workflow. eg. make run WORKFLOW=consensus-genome. args: WORKFLOW,EXTRA_INPUT,INPUT,TASK_CMD
if [ "$(WORKFLOW)" = "short-read-mngs" ]; then \
RUNFILE="local_driver.wdl"; \
elif [ $$(ls workflows/$(WORKFLOW)/*.wdl | wc -l) -eq 1 ]; then \
RUNFILE=$$(ls workflows/$(WORKFLOW)/*.wdl); \
RUNFILE=$$(basename $$RUNFILE); \
elif [ -f "workflows/$(WORKFLOW)/$(WORKFLOW).wdl" ]; then \
RUNFILE="$(WORKFLOW).wdl"; \
else \
RUNFILE="run.wdl"; \
fi; \
.venv/bin/miniwdl run workflows/$(WORKFLOW)/$$RUNFILE docker_image_id=czid-$(WORKFLOW) $(EXTRA_INPUTS) $(INPUT) $(TASK_CMD)
.venv/bin/miniwdl run workflows/$(WORKFLOW)/$$RUNFILE docker_image_id=czid-$(WORKFLOW) $(INPUT) $(EXTRA_INPUTS) $(TASK_CMD)

.PHONY: miniwdl-explore
miniwdl-explore: ## !ADVANCED! explore a previous miniwdl workflow run in the cli. eg. make miniwdl-explore OUTPATH='/mnt/path/to/output/'
4 changes: 3 additions & 1 deletion scripts/release.sh
@@ -1,6 +1,7 @@
#!/bin/bash

set -euo pipefail

set -euxo pipefail

if [[ $# != 3 ]]; then
echo "This script creates and pushes a git tag representing a new workflow release."
@@ -39,6 +40,7 @@ echo "$RELEASE_NOTES" >> $TAG_MSG
git log --pretty=format:%s ${OLD_TAG}..HEAD >> $TAG_MSG || true
if ! git config --get user.name ; then
git config user.name "CZ ID release action triggered by ${GITHUB_ACTOR:-$(whoami)}"
git config user.email "${GITHUB_ACTOR:-$(whoami)}@users.noreply.github.com"
fi
git config --get user.name
git tag --annotate --file $TAG_MSG "$TAG"
72 changes: 52 additions & 20 deletions workflows/index-generation/Dockerfile
@@ -1,20 +1,13 @@
# syntax=docker/dockerfile:1.4
ARG ARIA2_VERSION=1.36.0
ARG DEBIAN_FRONTEND=noninteractive

FROM ubuntu:22.04 AS builder
ARG ARIA2_VERSION
ARG DEBIAN_FRONTEND
# Install build dependencies
RUN apt-get update && \
apt-get install -y \
curl \
git \
## aria2
libssl-dev \
liblz4-tool \
software-properties-common \
pkg-config \
## diamond
g++ \
gdb \
@@ -27,11 +20,6 @@ RUN apt-get update && \
libz-dev \
zip

# install aria2
RUN curl -L https://github.com/aria2/aria2/releases/download/release-${ARIA2_VERSION}/aria2-${ARIA2_VERSION}.tar.gz -o aria2-${ARIA2_VERSION}.tar.gz && \
tar xzvf aria2-${ARIA2_VERSION}.tar.gz
RUN cd /aria2-${ARIA2_VERSION} && ./configure --without-gnutls --with-openssl && make

# install minimap2
RUN git clone --single-branch --branch distributed-mapping https://github.com/mlin/minimap2.git \
&& cd minimap2 && make
@@ -43,33 +31,77 @@ RUN curl -L https://github.com/shenwei356/seqkit/releases/download/v2.0.0/seqkit
RUN git clone --single-branch --branch scatter-gather https://github.com/morsecodist/diamond \
&& mkdir diamond/build && cd diamond/build && cmake -DCMAKE_BUILD_TYPE=Release .. && make -j6

# install aws cli
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
&& unzip awscliv2.zip && ./aws/install -i /usr/local/aws-cli -b /usr/local/bin

FROM ubuntu:22.04 AS ncbi_compress_builder

RUN apt-get update && \
apt-get install -y \
build-essential \
wget \
## rocksdb
libgflags-dev \
libsnappy-dev \
zlib1g-dev \
libbz2-dev \
liblz4-dev \
libzstd-dev \
clang

# based on https://github.com/rust-lang/docker-rust/blob/dcb74d779e8a74263dc8b91d58d8ce7f3c0c805b/1.70.0/buster/Dockerfile
ENV RUSTUP_HOME=/usr/local/rustup \
CARGO_HOME=/usr/local/cargo \
PATH=/usr/local/cargo/bin:$PATH \
RUST_VERSION=1.70.0

RUN set -eux; \
dpkgArch="$(dpkg --print-architecture)"; \
case "${dpkgArch##*-}" in \
amd64) rustArch='x86_64-unknown-linux-gnu'; rustupSha256='0b2f6c8f85a3d02fde2efc0ced4657869d73fccfce59defb4e8d29233116e6db' ;; \
armhf) rustArch='armv7-unknown-linux-gnueabihf'; rustupSha256='f21c44b01678c645d8fbba1e55e4180a01ac5af2d38bcbd14aa665e0d96ed69a' ;; \
arm64) rustArch='aarch64-unknown-linux-gnu'; rustupSha256='673e336c81c65e6b16dcdede33f4cc9ed0f08bde1dbe7a935f113605292dc800' ;; \
i386) rustArch='i686-unknown-linux-gnu'; rustupSha256='e7b0f47557c1afcd86939b118cbcf7fb95a5d1d917bdd355157b63ca00fc4333' ;; \
*) echo >&2 "unsupported architecture: ${dpkgArch}"; exit 1 ;; \
esac; \
url="https://static.rust-lang.org/rustup/archive/1.26.0/${rustArch}/rustup-init"; \
wget "$url"; \
echo "${rustupSha256} *rustup-init" | sha256sum -c -; \
chmod +x rustup-init; \
./rustup-init -y --no-modify-path --profile minimal --default-toolchain $RUST_VERSION --default-host ${rustArch}; \
rm rustup-init; \
chmod -R a+w $RUSTUP_HOME $CARGO_HOME; \
rustup --version; \
cargo --version; \
rustc --version;

COPY ncbi-compress /ncbi-compress
RUN cd /ncbi-compress && cargo build --release

FROM ubuntu:22.04
ARG ARIA2_VERSION
ARG DEBIAN_FRONTEND
# Install dependencies

RUN apt-get update && \
apt-get install -y \
wget \
pigz \
python3-pip \
mysql-client \
kraken2 \
curl \
jq

RUN wget https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.15.0+-x64-linux.tar.gz && \
tar -xzvf ncbi-blast-2.15.0+-x64-linux.tar.gz && \
cp ncbi-blast-2.15.0+/bin/* /usr/local/bin/

COPY --from=builder /diamond/build/diamond /usr/local/bin
COPY --from=builder /minimap2/minimap2 /usr/local/bin/
COPY --from=builder /aria2-${ARIA2_VERSION}/src/aria2c /usr/local/bin/
COPY --from=builder /seqkit /usr/local/bin/
COPY --from=builder /usr/local/aws-cli /usr/local/aws-cli
COPY --from=builder /usr/local/bin/aws /usr/local/bin
COPY --from=ncbi_compress_builder /ncbi-compress/target/release/ncbi-compress /usr/local/bin

COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

COPY *.py /usr/local/bin/
COPY ncbi_download /usr/local/bin/

WORKDIR /workspace
96 changes: 96 additions & 0 deletions workflows/index-generation/README.md
@@ -0,0 +1,96 @@
# Index Generation

### Directory overview:

#### **index-generation.wdl**:
workflow code to create the following assets:
* NT and NR indexes from NCBI with redundant sequences removed - we use these to build additional indexes (notably the minimap and diamond indexes).
Subsequently, we compile a "short DB" comprising solely the accessions that were identified during non-host alignment. This database is then used
to blast (blastx for NR, blastn for NT) assembled contigs during the mNGS postprocessing phase (see the first sketch after this list).

  * By using blastx we translate nucleotide sequences into protein sequences in each of the six reading frames and search them against the protein database (NR in this case). This also yields alignment statistics, which we display in the mNGS sample report.
  * By using blastn we can find regions of similarity between nucleotide sequences (in this case our contigs and NT). This also yields alignment statistics, which we display in the mNGS sample report.

* nt_loc.marisa and nr_loc.marisa:
  * index to quickly access an accession and its sequence from either NT or NR.
  * This is used in the consensus genome and phylotree workflows to pull reference accessions (when a CG run is kicked off from the mNGS report)
  * Generating the JSON file for read alignment visualizations
  * In the postprocess step of mNGS (where we generate taxon_counts), this file is used to download accessions based on the hit summary from diamond and minimap to create the "short DB" mentioned above (see the second sketch after this list).
* nt_info.marisa and nr_info.marisa:
  * index to quickly go from accession ID to accession name, sequence length, and offset information for either NT or NR
  * mainly used to generate JSON files for the coverage viz consumed by the web app.
* Minimap indexes:
  * chunked minimap index - used for non-host alignment to generate hits to NT
* Diamond indexes:
  * chunked diamond index - used for non-host alignment to generate hits to NR
* accession2taxid.marisa:
  * index to quickly go from accession ID to taxon ID
  * used to determine the taxon assignment for each read from the hits generated by minimap and diamond.
* taxid-lineages.marisa:
  * index used to go from taxon ID to lineage taxon IDs (the taxids for species, genus, and family)
  * used for determining the optimal taxon assignment for each read from the alignment results (calling hits): for example, if a read aligns to multiple distinct references, we need to assess at which level of the taxonomic hierarchy the alignments reach consensus (see the third sketch after this list).
  * We also use this file for generating taxon counts in the postprocess step of mNGS
* deuterostome_taxids.txt - used to filter out eukaryotic sequences, which helps narrow taxon_counts down to microbial DNA (bacteria, viruses, fungi, and parasites)
* taxon_ignore_list.txt - taxa that we would like to ignore (synthetic constructs, plasmids, vectors, etc.) in taxon_counts from non-host alignment
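
To make the blast step concrete, here is a minimal sketch of blasting contigs against the short DBs. The file and database names are hypothetical; the real invocations live in the mNGS postprocessing workflow, not in this repo's README:

```python
# Hypothetical sketch: blast assembled contigs against the "short DB" built
# from non-host alignment hits. Database and file names are made up.
import subprocess

def blast_contigs(contigs, nt_short_db, nr_short_db):
    # blastn: nucleotide contigs vs the nucleotide short DB (from NT)
    subprocess.run(["blastn", "-query", contigs, "-db", nt_short_db,
                    "-outfmt", "6", "-out", "blastn_hits.tsv"], check=True)
    # blastx: contigs translated in all six frames vs the protein short DB (from NR)
    subprocess.run(["blastx", "-query", contigs, "-db", nr_short_db,
                    "-outfmt", "6", "-out", "blastx_hits.tsv"], check=True)

blast_contigs("contigs.fasta", "nt_short", "nr_short")
```

A minimal sketch of how nt_loc.marisa might be queried to pull a sequence out of the raw NT file. The record layout (`"<QI"`: byte offset, then sequence length) and the file names are assumptions for illustration; the actual format is defined by generate_ncbi_db_index.py:

```python
# Hypothetical sketch: fetch a reference sequence via nt_loc.marisa.
# Assumes each record stores (byte offset, sequence length) packed as "<QI".
import marisa_trie

loc = marisa_trie.RecordTrie("<QI")
loc.load("nt_loc.marisa")  # hypothetical local copy of the asset

def fetch_sequence(accession, fasta_path="nt"):
    hits = loc.get(accession)
    if not hits:
        raise KeyError(f"{accession} not found in the NT location index")
    offset, length = hits[0]
    with open(fasta_path) as fasta:
        fasta.seek(offset)  # jump straight to the record; no scan of NT
        return fasta.read(length)

print(fetch_sequence("NC_045512.2")[:80])
```

And a hedged sketch of the hit-calling idea: map each alignment's accession to a taxid, expand the taxid to its (species, genus, family) lineage, and walk up the hierarchy until the alignments agree. The record formats, string taxid keys, and file names are assumptions, not the workflow's actual schema:

```python
# Hypothetical sketch: choose the lowest taxonomic rank at which all of a
# read's alignments reach consensus. "<L" / "<lll" formats are assumed.
import marisa_trie

acc2taxid = marisa_trie.RecordTrie("<L")
acc2taxid.load("accession2taxid.marisa")
lineages = marisa_trie.RecordTrie("<lll")  # assumed (species, genus, family)
lineages.load("taxid-lineages.marisa")

def call_hit(accessions):
    # accession -> taxid -> (species, genus, family) for every reference hit
    lins = [lineages[str(acc2taxid[acc][0][0])][0] for acc in accessions]
    # walk up species -> genus -> family until every alignment agrees
    for rank_index, rank in enumerate(("species", "genus", "family")):
        ids = {lineage[rank_index] for lineage in lins}
        if len(ids) == 1:
            return rank, ids.pop()
    return None, None  # no consensus even at the family level

print(call_hit(["ACC_A.1", "ACC_B.1"]))  # made-up accessions
```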


#### **ncbi-compress**:
compression code written in Rust to remove redundant sequences from NT and NR

#### **helpful_for_analysis**
Jupyter notebooks used for common analysis steps, including:
* querying NCBI for an accession and lineage (used to investigate reads in the "all taxa with neither family nor genus classification" report for mNGS)
* querying marisa trie files - a notebook to easily query all of the marisa trie files generated by the index generation workflow above.
* comparing non-host alignment times between two projects - used to benchmark how long non-host alignment took on old and new indexes.
* generating a taxon lineage changelog for a sample report - get a readout of which reads in a sample report have a taxon / lineage change between runs on the new and old indexes. Used mainly for comp bio validation purposes.
* checking sequence retention for different index compression runs - handy for running multiple compression runs and summarizing which sequences were dropped; helpful for early analysis and benchmarking of the compression workflow.

#### **ncbitax2lin.py**
used to generate taxid-lineages.marisa

#### **generate_accession2taxid.py**
used to generate accession2taxid.marisa
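
For illustration, a minimal sketch of the kind of index this script produces - an accession-to-taxid RecordTrie built from a simplified two-column TSV. The input layout and the `"<L"` record format are assumptions; the real NCBI accession2taxid dumps have more columns and far more rows:

```python
# Hypothetical sketch: build an accession -> taxid RecordTrie from a
# simplified two-column TSV (accession, taxid) with a header row.
import marisa_trie

def build_accession2taxid(tsv_path, out_path):
    def records():
        with open(tsv_path) as tsv:
            next(tsv)  # skip the header row
            for line in tsv:
                accession, taxid = line.rstrip("\n").split("\t")[:2]
                yield accession, (int(taxid),)
    # RecordTrie packs each value with the struct format "<L" (one uint32)
    marisa_trie.RecordTrie("<L", records()).save(out_path)

build_accession2taxid("accession2taxid.tsv", "accession2taxid.marisa")
```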

#### **generate_ncbi_db_index.py**
used to generate nt_loc.marisa and nr_loc.marisa

#### **generate_lineage_csvs.py**
used to generate the versioned-taxid-lineages.csv file that populates the taxon_lineage database table; also generates changelogs for deleted, changed, and new taxa (a sketch of the changelog diff follows below).
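
A hedged sketch of that changelog diff, assuming hypothetical column names ("taxid", "lineage") and file names rather than the script's actual schema:

```python
# Hypothetical sketch: diff two versioned lineage CSVs into a changelog.
import pandas as pd

old = pd.read_csv("taxid-lineages-old.csv").set_index("taxid")
new = pd.read_csv("taxid-lineages-new.csv").set_index("taxid")

new_taxa = new.index.difference(old.index)      # taxa added in the new index
deleted_taxa = old.index.difference(new.index)  # taxa dropped from the new index
shared = new.index.intersection(old.index)
changed_mask = (new.loc[shared, "lineage"] != old.loc[shared, "lineage"]).to_numpy()
changed_taxa = shared[changed_mask]             # taxa whose lineage changed

print(f"{len(new_taxa)} new, {len(deleted_taxa)} deleted, {len(changed_taxa)} changed")
```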

### Updating The Webapp to use the new Index Generation Assets:
* Follow the wiki page [here](https://github.com/chanzuckerberg/czid-web-private/wiki/%5BDEV%5D-How-to-update-the-webapp-to-use-new-index-assets-and-taxon_lineage-table-update) to update the webapp to use new assets and switch over all projects to start pinning the new index version.

### Debugging Notes for EC2:
* usually you need to launch an EC2 instance to test this workflow out at scale: `aegea launch --instance-type i3.16xlarge --no-provision-user --storage /=128GB --iam-role idseq-on-call <instance-name>`
* install rust: `curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh`
* install python:
```
curl -O https://repo.anaconda.com/archive/Anaconda3-2023.09-0-Linux-x86_64.sh
chmod u+x Anaconda3-2023.09-0-Linux-x86_64.sh
./Anaconda3-2023.09-0-Linux-x86_64.sh
```

* if running miniwdl:
  * make sure the version is 1.5.1 (`pip3 install miniwdl==1.5.1`)
  * downgrade importlib-metadata: `pip3 install importlib-metadata==4.13.0`

* if building the images on an EC2 machine:
  * follow [these instructions](https://docs.docker.com/engine/install/ubuntu/) to update docker before building

* change docker build directory to be on /mnt:
```
sudo systemctl stop docker
sudo mv /var/lib/docker /mnt/docker
sudo ln -s /mnt/docker /var/lib/docker
sudo systemctl start docker
```

* add the current user to the docker group:
* `sudo usermod -aG docker $USER`
* logout and login to reflect new group status

* build index-generation docker image: `make rebuild WORKFLOW=index-generation`
* run and exec into container: `docker run -v /mnt:/mnt --rm -it index-generation:latest bash`
* to run a certain task in index-generation.wdl with miniwdl:
* `miniwdl run index-generation.wdl --task GenerateLocDB --input generate_loc_db_input.json`
