Merge remote-tracking branch 'origin/main' into lvreynoso/diamond_sensitive
lvreynoso committed May 9, 2024
2 parents 4b64746 + 6d4a2be commit eff1869
Showing 61 changed files with 20,264 additions and 445 deletions.
43 changes: 43 additions & 0 deletions .github/workflows/index-generation-cargo-test.yml
@@ -0,0 +1,43 @@
name: Index Generation NCBI Compress cargo tests

on:
push:
paths:
- 'workflows/index-generation/ncbi-compress/**'

env:
LC_ALL: C.UTF-8
LANG: C.UTF-8
DEBIAN_FRONTEND: noninteractive

jobs:
cargo-test:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v2
- name: docker login ghcr.io
uses: docker/login-action@v1
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: docker build + push to ghcr.io
run: |
TAG=$(git describe --long --tags --always --dirty)
IMAGE_NAME=czid-index-generation-public
IMAGE_URI="ghcr.io/${GITHUB_REPOSITORY}/${IMAGE_NAME}"
CACHE_FROM=""; docker pull "$IMAGE_URI" && CACHE_FROM="--cache-from $IMAGE_URI"
./scripts/docker-build.sh "workflows/index-generation" --tag "${IMAGE_URI}:${TAG}" $CACHE_FROM \
|| ./scripts/docker-build.sh "workflows/index-generation" --tag "${IMAGE_URI}:${TAG}"
docker push "${IMAGE_URI}:${TAG}"
if [[ ${GITHUB_REF##*/} == "main" ]]; then
docker tag "${IMAGE_URI}:${TAG}" "${IMAGE_URI}:latest"
docker push "${IMAGE_URI}:latest"
fi
echo "IMAGE_URI=${IMAGE_URI}" >> $GITHUB_ENV
echo "TAG=${TAG}" >> $GITHUB_ENV
- name: run cargo tests
run: |
make cargo-test-index-generation
11 changes: 9 additions & 2 deletions Makefile
@@ -60,7 +60,11 @@ test-%: ## run miniwdl step tests eg. make test-short-read-mngs

integration-test-%: ## run miniwdl integration tests eg. make integration-test-short-read-mngs
pytest -v -n $$(( $$(nproc) - 1)) --durations=0 --tb=short --log-cli-level=11 workflows/$*/integration_test


.PHONY: cargo-test-index-generation
cargo-test-index-generation: ## run cargo test for index-generation
(cd workflows/index-generation/ncbi-compress && cargo test)

.PHONY: test
test: ## run miniwdl step tests for all workflows eg. make test
for i in $$(ls workflows); do $(MAKE) test-$$i; done
@@ -83,12 +87,15 @@ python-dependencies: .python_dependencies_installed # install python dependencie
run: build python-dependencies ## run a miniwdl workflow. eg. make run WORKFLOW=consensus-genome. args: WORKFLOW,EXTRA_INPUT,INPUT,TASK_CMD
if [ "$(WORKFLOW)" = "short-read-mngs" ]; then \
RUNFILE="local_driver.wdl"; \
elif [ $$(ls workflows/$(WORKFLOW)/*.wdl | wc -l) -eq 1 ]; then \
RUNFILE=$$(ls workflows/$(WORKFLOW)/*.wdl); \
RUNFILE=$$(basename $$RUNFILE); \
elif [ -f "workflows/$(WORKFLOW)/$(WORKFLOW).wdl" ]; then \
RUNFILE="$(WORKFLOW).wdl"; \
else \
RUNFILE="run.wdl"; \
fi; \
.venv/bin/miniwdl run workflows/$(WORKFLOW)/$$RUNFILE docker_image_id=czid-$(WORKFLOW) $(EXTRA_INPUTS) $(INPUT) $(TASK_CMD)
.venv/bin/miniwdl run workflows/$(WORKFLOW)/$$RUNFILE docker_image_id=czid-$(WORKFLOW) $(INPUT) $(EXTRA_INPUTS) $(TASK_CMD)

.PHONY: miniwdl-explore
miniwdl-explore: ## !ADVANCED! explore a previous miniwdl workflow run in the cli. eg. make miniwdl-explore OUTPATH='/mnt/path/to/output/'
4 changes: 3 additions & 1 deletion scripts/release.sh
@@ -1,6 +1,7 @@
#!/bin/bash

set -euo pipefail

set -euxo pipefail

if [[ $# != 3 ]]; then
echo "This script creates and pushes a git tag representing a new workflow release."
@@ -39,6 +40,7 @@ echo "$RELEASE_NOTES" >> $TAG_MSG
git log --pretty=format:%s ${OLD_TAG}..HEAD >> $TAG_MSG || true
if ! git config --get user.name ; then
git config user.name "CZ ID release action triggered by ${GITHUB_ACTOR:-$(whoami)}"
git config user.email "${GITHUB_ACTOR:-$(whoami)}@users.noreply.github.com"
fi
git config --get user.name
git tag --annotate --file $TAG_MSG "$TAG"
72 changes: 52 additions & 20 deletions workflows/index-generation/Dockerfile
@@ -1,20 +1,13 @@
# syntax=docker/dockerfile:1.4
ARG ARIA2_VERSION=1.36.0
ARG DEBIAN_FRONTEND=noninteractive

FROM ubuntu:22.04 AS builder
ARG ARIA2_VERSION
ARG DEBIAN_FRONTEND
# Install build dependencies
RUN apt-get update && \
apt-get install -y \
curl \
git \
## aria2
libssl-dev \
liblz4-tool \
software-properties-common \
pkg-config \
## diamond
g++ \
gdb \
@@ -27,11 +20,6 @@ RUN apt-get update && \
libz-dev \
zip

# install aria2
RUN curl -L https://github.com/aria2/aria2/releases/download/release-${ARIA2_VERSION}/aria2-${ARIA2_VERSION}.tar.gz -o aria2-${ARIA2_VERSION}.tar.gz && \
tar xzvf aria2-${ARIA2_VERSION}.tar.gz
RUN cd /aria2-${ARIA2_VERSION} && ./configure --without-gnutls --with-openssl && make

# install minimap2
RUN git clone --single-branch --branch distributed-mapping https://github.com/mlin/minimap2.git \
&& cd minimap2 && make
@@ -43,33 +31,77 @@ RUN curl -L https://github.com/shenwei356/seqkit/releases/download/v2.0.0/seqkit
RUN git clone --single-branch --branch scatter-gather https://github.com/morsecodist/diamond \
&& mkdir diamond/build && cd diamond/build && cmake -DCMAKE_BUILD_TYPE=Release .. && make -j6

# install aws cli
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
&& unzip awscliv2.zip && ./aws/install -i /usr/local/aws-cli -b /usr/local/bin

FROM ubuntu:22.04 AS ncbi_compress_builder

RUN apt-get update && \
apt-get install -y \
build-essential \
wget \
## rocksdb
libgflags-dev \
libsnappy-dev \
zlib1g-dev \
libbz2-dev \
liblz4-dev \
libzstd-dev \
clang

# based on https://github.com/rust-lang/docker-rust/blob/dcb74d779e8a74263dc8b91d58d8ce7f3c0c805b/1.70.0/buster/Dockerfile
ENV RUSTUP_HOME=/usr/local/rustup \
CARGO_HOME=/usr/local/cargo \
PATH=/usr/local/cargo/bin:$PATH \
RUST_VERSION=1.70.0

RUN set -eux; \
dpkgArch="$(dpkg --print-architecture)"; \
case "${dpkgArch##*-}" in \
amd64) rustArch='x86_64-unknown-linux-gnu'; rustupSha256='0b2f6c8f85a3d02fde2efc0ced4657869d73fccfce59defb4e8d29233116e6db' ;; \
armhf) rustArch='armv7-unknown-linux-gnueabihf'; rustupSha256='f21c44b01678c645d8fbba1e55e4180a01ac5af2d38bcbd14aa665e0d96ed69a' ;; \
arm64) rustArch='aarch64-unknown-linux-gnu'; rustupSha256='673e336c81c65e6b16dcdede33f4cc9ed0f08bde1dbe7a935f113605292dc800' ;; \
i386) rustArch='i686-unknown-linux-gnu'; rustupSha256='e7b0f47557c1afcd86939b118cbcf7fb95a5d1d917bdd355157b63ca00fc4333' ;; \
*) echo >&2 "unsupported architecture: ${dpkgArch}"; exit 1 ;; \
esac; \
url="https://static.rust-lang.org/rustup/archive/1.26.0/${rustArch}/rustup-init"; \
wget "$url"; \
echo "${rustupSha256} *rustup-init" | sha256sum -c -; \
chmod +x rustup-init; \
./rustup-init -y --no-modify-path --profile minimal --default-toolchain $RUST_VERSION --default-host ${rustArch}; \
rm rustup-init; \
chmod -R a+w $RUSTUP_HOME $CARGO_HOME; \
rustup --version; \
cargo --version; \
rustc --version;

COPY ncbi-compress /ncbi-compress
RUN cd /ncbi-compress && cargo build --release

FROM ubuntu:22.04
ARG ARIA2_VERSION
ARG DEBIAN_FRONTEND
# Install dependencies

RUN apt-get update && \
apt-get install -y \
wget \
pigz \
python3-pip \
mysql-client \
kraken2 \
curl \
jq

RUN wget https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/ncbi-blast-2.15.0+-x64-linux.tar.gz && \
tar -xzvf ncbi-blast-2.15.0+-x64-linux.tar.gz && \
cp ncbi-blast-2.15.0+/bin/* /usr/local/bin/

COPY --from=builder /diamond/build/diamond /usr/local/bin
COPY --from=builder /minimap2/minimap2 /usr/local/bin/
COPY --from=builder /aria2-${ARIA2_VERSION}/src/aria2c /usr/local/bin/
COPY --from=builder /seqkit /usr/local/bin/
COPY --from=builder /usr/local/aws-cli /usr/local/aws-cli
COPY --from=builder /usr/local/bin/aws /usr/local/bin
COPY --from=ncbi_compress_builder /ncbi-compress/target/release/ncbi-compress /usr/local/bin

COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

COPY *.py /usr/local/bin/
COPY ncbi_download /usr/local/bin/

WORKDIR /workspace
96 changes: 96 additions & 0 deletions workflows/index-generation/README.md
@@ -0,0 +1,96 @@
# Index Generation

### Directory overview:

#### **index-generation.wdl**:
workflow code to create the following assets:
* NT and NR indexes from NCBI with redundant sequences removed - we use these to build additional indexes (notably the minimap and diamond indexes).
Subsequently, we compile a "short DB" comprising solely the accessions that were identified during non-host alignment. This database is then used
to blast (blastx for NR, blastn for NT) assembled contigs during the mNGS postprocessing phase (see the first sketch after this list).

  * By using blastx we translate nucleotide sequences into protein sequences in each of the six reading frames and search them against the protein database (NR in this case). This also yields alignment statistics, which we display in the mNGS sample report.
  * By using blastn we can find regions of similarity between nucleotide sequences (in this case our contigs and NT). This also yields alignment statistics, which we display in the mNGS sample report.

* nt_loc.marisa and nr_loc.marisa:
  * index to quickly access an accession and its sequence from either NT or NR.
  * This is used in the consensus genome and phylotree workflows to pull reference accessions (when a CG run is kicked off from the mNGS report)
  * Generating the JSON file for read alignment visualizations
  * In the postprocess step of mNGS (where we generate taxon_counts), this file is used to download accessions based on the hit summary from diamond and minimap to create the "short DB" mentioned above (see the second sketch after this list).
* nt_info.marisa and nr_info.marisa:
  * index to quickly go from accession ID to accession name, sequence length, and offset information for either NT or NR
  * mainly used to generate JSON files for the coverage viz consumed by the web app.
* Minimap indexes:
  * chunked minimap index - used for non-host alignment to generate hits to NT
* Diamond indexes:
  * chunked diamond index - used for non-host alignment to generate hits to NR
* accession2taxid.marisa:
  * index to quickly go from accession ID to taxon ID
  * used to determine the taxon assignment for each read from the hits generated by minimap and diamond.
* taxid-lineages.marisa:
  * index used to go from taxon ID to lineage taxon IDs (the taxids for species, genus, and family)
  * used for determining the optimal taxon assignment for each read from the alignment results (calling hits): for example, if a read aligns to multiple distinct references, we need to assess at which level of the taxonomic hierarchy the alignments reach consensus (see the third sketch after this list).
  * We also use this file for generating taxon counts in the postprocess step of mNGS
* deuterostome_taxids.txt - used to filter out eukaryotic sequences, which helps narrow taxon_counts down to microbial DNA (bacteria, viruses, fungi, and parasites)
* taxon_ignore_list.txt - taxa that we would like to ignore (synthetic constructs, plasmids, vectors, etc.) in taxon_counts from non-host alignment
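
To make the blast step concrete, here is a minimal sketch of blasting contigs against the short DBs. The file and database names are hypothetical; the real invocations live in the mNGS postprocessing workflow, not in this repo's README:

```python
# Hypothetical sketch: blast assembled contigs against the "short DB" built
# from non-host alignment hits. Database and file names are made up.
import subprocess

def blast_contigs(contigs, nt_short_db, nr_short_db):
    # blastn: nucleotide contigs vs the nucleotide short DB (from NT)
    subprocess.run(["blastn", "-query", contigs, "-db", nt_short_db,
                    "-outfmt", "6", "-out", "blastn_hits.tsv"], check=True)
    # blastx: contigs translated in all six frames vs the protein short DB (from NR)
    subprocess.run(["blastx", "-query", contigs, "-db", nr_short_db,
                    "-outfmt", "6", "-out", "blastx_hits.tsv"], check=True)

blast_contigs("contigs.fasta", "nt_short", "nr_short")
```

A minimal sketch of how nt_loc.marisa might be queried to pull a sequence out of the raw NT file. The record layout (`"<QI"`: byte offset, then sequence length) and the file names are assumptions for illustration; the actual format is defined by generate_ncbi_db_index.py:

```python
# Hypothetical sketch: fetch a reference sequence via nt_loc.marisa.
# Assumes each record stores (byte offset, sequence length) packed as "<QI".
import marisa_trie

loc = marisa_trie.RecordTrie("<QI")
loc.load("nt_loc.marisa")  # hypothetical local copy of the asset

def fetch_sequence(accession, fasta_path="nt"):
    hits = loc.get(accession)
    if not hits:
        raise KeyError(f"{accession} not found in the NT location index")
    offset, length = hits[0]
    with open(fasta_path) as fasta:
        fasta.seek(offset)  # jump straight to the record; no scan of NT
        return fasta.read(length)

print(fetch_sequence("NC_045512.2")[:80])
```

And a hedged sketch of the hit-calling idea: map each alignment's accession to a taxid, expand the taxid to its (species, genus, family) lineage, and walk up the hierarchy until the alignments agree. The record formats, string taxid keys, and file names are assumptions, not the workflow's actual schema:

```python
# Hypothetical sketch: choose the lowest taxonomic rank at which all of a
# read's alignments reach consensus. "<L" / "<lll" formats are assumed.
import marisa_trie

acc2taxid = marisa_trie.RecordTrie("<L")
acc2taxid.load("accession2taxid.marisa")
lineages = marisa_trie.RecordTrie("<lll")  # assumed (species, genus, family)
lineages.load("taxid-lineages.marisa")

def call_hit(accessions):
    # accession -> taxid -> (species, genus, family) for every reference hit
    lins = [lineages[str(acc2taxid[acc][0][0])][0] for acc in accessions]
    # walk up species -> genus -> family until every alignment agrees
    for rank_index, rank in enumerate(("species", "genus", "family")):
        ids = {lineage[rank_index] for lineage in lins}
        if len(ids) == 1:
            return rank, ids.pop()
    return None, None  # no consensus even at the family level

print(call_hit(["ACC_A.1", "ACC_B.1"]))  # made-up accessions
```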


#### **ncbi-compress**:
compression code written in Rust to remove redundant sequences from NT and NR

#### **helpful_for_analysis**
Jupyter notebooks used for common analysis steps, including:
* querying NCBI for an accession and lineage (used to investigate reads in the "all taxa with neither family nor genus classification" report for mNGS)
* querying marisa trie files - a notebook to easily query all of the marisa trie files generated by the index generation workflow above.
* comparing non-host alignment times between two projects - used to benchmark how long non-host alignment took on old and new indexes.
* generating a taxon lineage changelog for a sample report - get a readout of which reads in a sample report have a taxon / lineage change between runs on the new and old indexes. Used mainly for comp bio validation purposes.
* checking sequence retention for different index compression runs - handy for running multiple compression runs and summarizing which sequences were dropped; helpful for early analysis and benchmarking of the compression workflow.

#### **ncbitax2lin.py**
used to generate taxid-lineages.marisa

#### **generate_accession2taxid.py**
used to generate accession2taxid.marisa
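
For illustration, a minimal sketch of the kind of index this script produces - an accession-to-taxid RecordTrie built from a simplified two-column TSV. The input layout and the `"<L"` record format are assumptions; the real NCBI accession2taxid dumps have more columns and far more rows:

```python
# Hypothetical sketch: build an accession -> taxid RecordTrie from a
# simplified two-column TSV (accession, taxid) with a header row.
import marisa_trie

def build_accession2taxid(tsv_path, out_path):
    def records():
        with open(tsv_path) as tsv:
            next(tsv)  # skip the header row
            for line in tsv:
                accession, taxid = line.rstrip("\n").split("\t")[:2]
                yield accession, (int(taxid),)
    # RecordTrie packs each value with the struct format "<L" (one uint32)
    marisa_trie.RecordTrie("<L", records()).save(out_path)

build_accession2taxid("accession2taxid.tsv", "accession2taxid.marisa")
```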

#### **generate_ncbi_db_index.py**
used to generate nt_loc.marisa and nr_loc.marisa

#### **generate_lineage_csvs.py**
used to generate the versioned-taxid-lineages.csv file that populates the taxon_lineage database table; also generates changelogs for deleted, changed, and new taxa (a sketch of the changelog diff follows below).
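
A hedged sketch of that changelog diff, assuming hypothetical column names ("taxid", "lineage") and file names rather than the script's actual schema:

```python
# Hypothetical sketch: diff two versioned lineage CSVs into a changelog.
import pandas as pd

old = pd.read_csv("taxid-lineages-old.csv").set_index("taxid")
new = pd.read_csv("taxid-lineages-new.csv").set_index("taxid")

new_taxa = new.index.difference(old.index)      # taxa added in the new index
deleted_taxa = old.index.difference(new.index)  # taxa dropped from the new index
shared = new.index.intersection(old.index)
changed_mask = (new.loc[shared, "lineage"] != old.loc[shared, "lineage"]).to_numpy()
changed_taxa = shared[changed_mask]             # taxa whose lineage changed

print(f"{len(new_taxa)} new, {len(deleted_taxa)} deleted, {len(changed_taxa)} changed")
```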

### Updating The Webapp to use the new Index Generation Assets:
* Follow the wiki page [here](https://github.com/chanzuckerberg/czid-web-private/wiki/%5BDEV%5D-How-to-update-the-webapp-to-use-new-index-assets-and-taxon_lineage-table-update) to update the webapp to use new assets and switch over all projects to start pinning the new index version.

### Debugging Notes for EC2:
* usually you need to launch an EC2 instance to test this workflow out at scale: `aegea launch --instance-type i3.16xlarge --no-provision-user --storage /=128GB --iam-role idseq-on-call <instance-name>`
* install rust: `curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh`
* install python:
```
curl -O https://repo.anaconda.com/archive/Anaconda3-2023.09-0-Linux-x86_64.sh
chmod u+x Anaconda3-2023.09-0-Linux-x86_64.sh
./Anaconda3-2023.09-0-Linux-x86_64.sh
```

* if running miniwdl:
  * make sure the version is 1.5.1 (`pip3 install miniwdl==1.5.1`)
  * downgrade importlib-metadata: `pip3 install importlib-metadata==4.13.0`

* if building the images on an EC2 machine:
  * follow [these instructions](https://docs.docker.com/engine/install/ubuntu/) to update docker before building

* change docker build directory to be on /mnt:
```
sudo systemctl stop docker
sudo mv /var/lib/docker /mnt/docker
sudo ln -s /mnt/docker /var/lib/docker
sudo systemctl start docker
```

* add the current user to the docker group:
* `sudo usermod -aG docker $USER`
* logout and login to reflect new group status

* build index-generation docker image: `make rebuild WORKFLOW=index-generation`
* run and exec into container: `docker run -v /mnt:/mnt --rm -it index-generation:latest bash`
* to run a certain task in index-generation.wdl with miniwdl:
* `miniwdl run index-generation.wdl --task GenerateLocDB --input generate_loc_db_input.json`
