Commit

Merge branch 'dev' into single-read-raw-clean
simonleandergrimm committed Nov 29, 2024
2 parents 3d10bb0 + b75ddc6 commit 7899979
Showing 72 changed files with 675 additions and 162 deletions.
46 changes: 46 additions & 0 deletions .github/workflows/end-to-end.yml
@@ -0,0 +1,46 @@
name: End-to-end MGS workflow test

on: [pull_request]

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up JDK 11
        uses: actions/setup-java@v4
        with:
          java-version: '11'
          distribution: 'adopt'

      - name: Setup Nextflow latest-edge
        uses: nf-core/setup-nextflow@v1
        with:
          version: "latest-edge"

      - name: Install nf-test
        run: |
          wget -qO- https://get.nf-test.com | bash
          sudo mv nf-test /usr/local/bin/

      - name: Run index workflow
        run: nf-test test --tag index --verbose

      - name: Clean docker for more space
        run: |
          docker kill $(docker ps -q) 2>/dev/null || true
          docker rm $(docker ps -a -q) 2>/dev/null || true
          docker rmi $(docker images -q) -f 2>/dev/null || true
          docker system prune -af --volumes

      - name: Clean up nf-test dir
        run: sudo rm -rf .nf-test

      - name: Run run workflow
        run: nf-test test --tag run --verbose

      - name: Run run_validation workflow
        run: nf-test test --tag validation --verbose
5 changes: 4 additions & 1 deletion .gitignore
@@ -6,4 +6,7 @@ test/work
test/output
test/.nextflow*
*.Rhistory
pipeline_report.txt
pipeline_report.txt

.nf-test/
.nf-test.log
24 changes: 19 additions & 5 deletions CHANGELOG.md
@@ -1,14 +1,28 @@
# v2.5.2 (in progress)
- Relaxed FASTP quality filtering (`--cut_mean_quality` and `--average_qual` reduced from 25 to 20).
- Relaxed BBDUK viral filtering (switched from 3 21-mers to 1 24-mer).
# v2.5.2
- Changes to default read filtering:
  - Relaxed FASTP quality filtering (`--cut_mean_quality` and `--average_qual` reduced from 25 to 20).
  - Relaxed BBDUK viral filtering (switched from 3 21-mers to 1 24-mer).
- Overhauled BLAST validation functionality:
  - BLAST now runs on forward and reverse reads independently
  - BLAST output filtering no longer assumes specific filename suffixes
  - Paired BLAST output includes more information
  - RUN_VALIDATION can now directly take in FASTA files instead of a virus read DB
  - Fixed issues with publishing BLAST output under new Nextflow version
- Removed redundant subsetting statement from TAXONOMY workflow.
- Added --group_across_illumina_lanes option to generate_samplesheet
- Implemented nf-test for end-to-end testing of pipeline functionality
  - Implemented test suite in `tests/main.nf.test`
  - Reconfigured INDEX workflow to enable generation of miniature index directories for testing
  - Added GitHub Actions workflow in `.github/workflows/end-to-end.yml`
  - Pull requests will now fail if any of INDEX, RUN, or RUN_VALIDATION crashes when run on test data.
- Generated first version of new, curated test dataset for testing RUN workflow. Samplesheet and config file are available in `test-data`. The previous test dataset in `test` has been removed.
- Implemented S3 auto-cleanup:
  - Added tags to published files to facilitate S3 auto-cleanup
  - Added S3 lifecycle configuration file to `ref`, along with a script in `bin` to add it to an S3 bucket
- Minor changes
  - Added logic to check whether the `grouping` variable in `nextflow.config` matches the input samplesheet; if it doesn't, the pipeline throws an error.
  - Externalized resource specifications to `resources.config`, removing hardcoded CPU/memory values
  - Renamed `index-params.json` to `params-index.json` to avoid a clash with GitHub Actions
  - Removed redundant subsetting statement from TAXONOMY workflow.
  - Added --group_across_illumina_lanes option to generate_samplesheet

# v2.5.1
- Enabled extraction of BBDuk-subset putatively-host-viral raw reads for downstream chimera detection.
46 changes: 37 additions & 9 deletions README.md
@@ -179,6 +179,7 @@ To run this workflow with full functionality, you need access to the following d
2. **Docker:** To install Docker Engine for command-line use, follow the installation instructions available [here](https://docs.docker.com/engine/install/) (or [here](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/install-docker.html) for installation on an AWS EC2 instance).
3. **AWS CLI:** If not already installed, install the AWS CLI by following the instructions available [here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html).
4. **Git:** To install the Git version control tool, follow the installation instructions available [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git).
5. **nf-test**: To install nf-test, follow the installation instructions available [here](https://www.nf-test.com/docs/getting-started/).

#### 2. Configure AWS & Docker

@@ -245,25 +246,33 @@ Wait for the workflow to run to completion; this is likely to take several hours

### Testing & validation

To confirm that the pipeline works in your hands, we provide a small test dataset (`test/raw`) to run through the run workflow. This can be used to test any of the pipeline profiles described above.
To confirm that the pipeline works in your hands, we provide a small test dataset (`s3://nao-testing/gold-standard-test/raw/`) to run through the run workflow. This can be used to test any of the pipeline profiles described above.

If your EC2 instance has the resources to handle it, the simplest way to start using the pipeline is to run the test data through it locally on that instance (i.e. without using S3). To do this:

1. Navigate to the `test` directory.
2. Edit `nextflow.config` to set `params.ref_dir` to the index directory you chose or created above (specifically `PATH_TO_REF_DIR/output`).
3. Still within the `test` directory, run `nextflow run -profile ec2_local .. -resume`.
4. Wait for the workflow to finish. Inspect the `output` directory to view the processed output files.
1. Create a new directory outside the repo directory and copy the run workflow config file into it as `nextflow.config`:

```
mkdir launch
cd launch
cp REPO_DIR/configs/run.config nextflow.config
```

2. Edit `nextflow.config` to set `params.ref_dir` to the index directory you chose or created above (specifically `PATH_TO_REF_DIR/output`).
3. Set the samplesheet path in `nextflow.config` to the test dataset samplesheet, `${projectDir}/test-data/samplesheet.csv`.
4. Within this directory, run `nextflow run -profile ec2_local .. -resume`. Wait for the workflow to finish.
5. Inspect the `output` directory to view the processed output files.

If this is successful, the next level of complexity is to run the workflow with a working directory on S3. To do this:

1. Within the `test` directory, edit `nextflow.config` to set `params.base_dir` to the S3 directory of your choice.
2. Still within that directory, run `nextflow run -profile ec2_s3 .. -resume`.
1. Edit `nextflow.config` to set `params.base_dir` to the S3 directory of your choice.
2. Still within that directory, run `nextflow run -profile ec2_s3 .. -resume`.
3. Wait for the workflow to finish, and inspect the output on S3.

Finally, you can run the test dataset through the pipeline on AWS Batch. To do this, configure Batch as described [here](https://data.securebio.org/wills-public-notebook/notebooks/2024-06-11_batch.html) (steps 1-3), then:

1. Within the `test` directory, edit `nextflow.config` to set `params.base_dir` to a different S3 directory of your choice and `process.queue` to the name of your Batch job queue.
2. Still within that directory, run `nextflow run -profile batch .. -resume` (or simply `nextflow run .. -resume`).
1. Edit `nextflow.config` to set `params.base_dir` to a different S3 directory of your choice and `process.queue` to the name of your Batch job queue.
2. Still within that directory, run `nextflow run -profile batch .. -resume` (or simply `nextflow run .. -resume`).
3. Wait for the workflow to finish, and inspect the output on S3.

### Running on new data
@@ -304,6 +313,25 @@ If running on Batch, a good process for starting the pipeline on a new dataset i
5. Run `nextflow run PATH_TO_REPO_DIR -resume`.
6. Navigate to `{params.base_dir}/output` to view and download output files.
## Run tests using `nf-test` before making pull requests
During development, we now request that users run the pipeline with `nf-test` locally before making pull requests (a test will run automatically on the PR, but it's often useful to run it locally first). To do this, you need a sufficiently large EC2 instance; we recommend an `m5.xlarge` with at least 32 GB of EBS storage, as this closely reflects the VMs used by GitHub Actions. Once you have an instance, run `nf-test test tests/main.nf.test`, which will run all workflows of the pipeline and check that they run to completion. To run a specific workflow, use one of the following commands:
```
nf-test test --tag index      # Runs the index workflow
nf-test test --tag run        # Runs the run workflow
nf-test test --tag validation # Runs the validation workflow
```
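For output matching the CI logs, the `--verbose` flag used in `.github/workflows/end-to-end.yml` can be added to any of these, e.g.:
```
nf-test test --tag index --verbose
```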
Importantly, make sure to periodically delete Docker images to free up space on your instance. You can do this by running the following commands, although note that they will delete all Docker images:
```
docker kill $(docker ps -q) 2>/dev/null || true
docker rm $(docker ps -a -q) 2>/dev/null || true
docker rmi $(docker images -q) -f 2>/dev/null || true
docker system prune -af --volumes
```
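If you want to check how much space Docker is currently using before pruning, the following standard commands (not specific to this pipeline) report disk usage:
```
docker system df   # Space used by images, containers, and volumes
df -h /            # Free space on the instance's root volume
```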
# Troubleshooting
When attempting to run a released version of the pipeline, the most common sources of errors are AWS permission issues. Before debugging a persistent error in-depth, make sure that you have all the permissions specified in Step 0 of [our Batch workflow guide](https://data.securebio.org/wills-public-notebook/notebooks/2024-06-11_batch.html). Next, make sure Nextflow has access to your AWS credentials, such as by running `eval "$(aws configure export-credentials --format env)"`.
89 changes: 89 additions & 0 deletions bin/apply-lifecycle-rules.py
@@ -0,0 +1,89 @@
#!/usr/bin/env python3

import argparse
import json
import boto3
import sys
from botocore.exceptions import ClientError

def load_lifecycle_config(config_path):
    try:
        with open(config_path, 'r') as f:
            return json.load(f)
    except json.JSONDecodeError:
        print(f"Error: {config_path} contains invalid JSON")
        sys.exit(1)
    except FileNotFoundError:
        print(f"Error: Could not find file {config_path}")
        sys.exit(1)

def print_lifecycle_rules(rules):
    if not rules:
        print("No lifecycle rules configured")
        return

    for rule in rules:
        print(f"- {rule['ID']}")
        print(f"  Status: {rule['Status']}")
        if 'Expiration' in rule:
            print(f"  Expiration: {rule['Expiration'].get('Days', 'N/A')} days")
        print()

def get_current_rules(s3, bucket_name):
    try:
        response = s3.get_bucket_lifecycle_configuration(Bucket=bucket_name)
        return response.get('Rules', [])
    except ClientError as e:
        if e.response['Error']['Code'] == 'NoSuchLifecycleConfiguration':
            return []
        raise

def apply_lifecycle_rules(bucket_name, lifecycle_config):
    s3 = boto3.client('s3')

    try:
        # First verify the bucket exists and we have access
        s3.head_bucket(Bucket=bucket_name)

        # Show current configuration
        print(f"\nCurrent lifecycle rules for bucket {bucket_name}:")
        current_rules = get_current_rules(s3, bucket_name)
        print_lifecycle_rules(current_rules)

        # Apply the new configuration
        s3.put_bucket_lifecycle_configuration(
            Bucket=bucket_name,
            LifecycleConfiguration=lifecycle_config
        )
        print(f"\nSuccessfully applied new lifecycle rules to bucket: {bucket_name}")

        # Show the updated configuration
        print("\nUpdated lifecycle rules:")
        new_rules = get_current_rules(s3, bucket_name)
        print_lifecycle_rules(new_rules)

    except ClientError as e:
        error_code = e.response.get('Error', {}).get('Code', 'Unknown')
        if error_code == '404':
            print(f"Error: Bucket {bucket_name} does not exist")
        elif error_code == '403':
            print(f"Error: Permission denied for bucket {bucket_name}")
        else:
            print(f"Error applying lifecycle rules: {str(e)}")
        sys.exit(1)

def main():
    parser = argparse.ArgumentParser(description='Apply S3 lifecycle rules to a bucket')
    parser.add_argument('config_file', help='Path to lifecycle configuration JSON file')
    parser.add_argument('bucket_name', help='Name of the S3 bucket')

    args = parser.parse_args()

    # Load the configuration
    lifecycle_config = load_lifecycle_config(args.config_file)

    # Apply the rules
    apply_lifecycle_rules(args.bucket_name, lifecycle_config)

if __name__ == '__main__':
    main()
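As a usage sketch (the lifecycle JSON filename under `ref/` is assumed here, not taken from this commit; the bucket name is a placeholder), the script takes the configuration file and bucket name as positional arguments and needs `boto3` plus AWS credentials with lifecycle permissions on the bucket:
```
pip install boto3
python bin/apply-lifecycle-rules.py ref/s3-lifecycle.json my-example-bucket
```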
18 changes: 14 additions & 4 deletions configs/containers.config
@@ -1,12 +1,22 @@
// Specify Docker containers for workflow processes
process {
    withLabel: base {
        container = "eclipse/alpine_jdk8:latest"
        // NB: As of 2024-07-01, no more specific tag available
    withLabel: curl {
        container = "community.wave.seqera.io/library/curl:8.10.1--43150f2d543ef413"
    }
    withLabel: unzip {
        container = "community.wave.seqera.io/library/unzip:6.0--0e729f0c20458893"
    }
    withLabel: coreutils {
        container = "community.wave.seqera.io/library/coreutils:9.5--ae99c88a9b28c264"
    }
    withLabel: coreutils_gzip_gawk {
        container = "community.wave.seqera.io/library/coreutils_gawk_gzip:c49bfad0a858f99a"
    }
    withLabel: MultiQC {
        // NB: As of 2024-07-01, newer versions currently cause errors
        container = "multiqc/multiqc:v1.21"
        // container = "multiqc/multiqc:v1.21"
        // container = "staphb/multiqc:1.22.2"
        container = "thatdnaguy/multiqc:v1.21_01"
    }
    withLabel: FASTQC {
        container = "staphb/fastqc:0.12.1"
55 changes: 55 additions & 0 deletions configs/index-for-run-test.config
@@ -0,0 +1,55 @@
/***********************************************************************
| CONFIGURATION FILE FOR NAO VIRAL MGS WORKFLOW - REFERENCES & INDEXES |
***********************************************************************/

params {
    mode = "index"

    // Directories
    base_dir = "s3://nao-testing/index-test" // Parent for working and output directories (can be S3)

    // URLs for downloading reference genomes etc
    taxonomy_url = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump_archive/taxdmp_2024-06-01.zip"
    virus_host_db_url = "https://www.genome.jp/ftp/db/virushostdb/virushostdb.tsv"

    // 21st chromosome
    human_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NC_000021.9&rettype=fasta"

    // Look up genome assembly ncbi
    genome_urls = [
        cow_ch28: "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NC_037355.1&rettype=fasta",
        ecoli: "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NC_002695.2&rettype=fasta"
    ]

    ssu_url = "https://www.arb-silva.de/fileadmin/silva_databases/release_138.1/Exports/SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz"
    lsu_url = "https://www.arb-silva.de/fileadmin/silva_databases/release_138.1/Exports/SILVA_138.1_LSURef_NR99_tax_silva.fasta.gz"

    // Other reference files
    host_taxon_db = "${projectDir}/ref/host-taxa.tsv"
    contaminants = "${projectDir}/ref/contaminants.fasta.gz"
    genome_patterns_exclude = "${projectDir}/ref/hv_patterns_exclude.txt"

    // Kraken viral DB
    kraken_db = "https://genome-idx.s3.amazonaws.com/kraken/k2_viral_20240904.tar.gz"
    // Smallest possible BLAST DB
    blast_db_name = "nt_others"

    // Pull information from GenBank or RefSeq
    ncbi_viral_params = "--section refseq --assembly-level complete"

    // Other input values
    virus_taxid = "10239"
    viral_taxids_exclude = "2731619 2732413 2732411" // Exclude Caudoviricetes, Malgrandaviricetes, Faserviricetes
    host_taxa_screen = "vertebrate human" // Host taxa to screen for when building reference virus DB

    // Initializing run params to avoid warnings
    kraken_memory = ""
    classify_dedup_subset = ""
}

includeConfig "${projectDir}/configs/logging.config"
includeConfig "${projectDir}/configs/containers.config"
includeConfig "${projectDir}/configs/resources.config"
includeConfig "${projectDir}/configs/profiles.config"
includeConfig "${projectDir}/configs/output.config"
process.queue = "harmon-queue" // AWS Batch job queue
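One way to build this miniature test index, following the launch-directory convention from the README above, might be the sketch below (`REPO_DIR` is a placeholder; `base_dir` and `process.queue` in the copied config point at NAO-internal resources and would need to be edited to match your own setup):
```
mkdir index-test-launch && cd index-test-launch
cp REPO_DIR/configs/index-for-run-test.config nextflow.config
# Edit base_dir and process.queue in nextflow.config, then launch:
nextflow run REPO_DIR -resume
```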
16 changes: 11 additions & 5 deletions configs/index.config
@@ -12,11 +12,15 @@ params {
    taxonomy_url = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump_archive/taxdmp_2024-06-01.zip"
    virus_host_db_url = "https://www.genome.jp/ftp/db/virushostdb/virushostdb.tsv"
    human_url = "https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/analysis_set/chm13v2.0.fa.gz"
    cow_url = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/263/795/GCF_002263795.3_ARS-UCD2.0/GCF_002263795.3_ARS-UCD2.0_genomic.fna.gz"
    pig_url = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/003/025/GCF_000003025.6_Sscrofa11.1/GCF_000003025.6_Sscrofa11.1_genomic.fna.gz"
    carp_url = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/951/615/GCF_000951615.1_common_carp_genome/GCF_000951615.1_common_carp_genome_genomic.fna.gz"
    mouse_url = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.27_GRCm39/GCF_000001635.27_GRCm39_genomic.fna.gz"
    ecoli_url = "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/030/389/925/GCF_030389925.1_ASM3038992v1/GCF_030389925.1_ASM3038992v1_genomic.fna.gz"

    genome_urls = [
        cow: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/263/795/GCF_002263795.3_ARS-UCD2.0/GCF_002263795.3_ARS-UCD2.0_genomic.fna.gz",
        pig: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/003/025/GCF_000003025.6_Sscrofa11.1/GCF_000003025.6_Sscrofa11.1_genomic.fna.gz",
        carp: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/951/615/GCF_000951615.1_common_carp_genome/GCF_000951615.1_common_carp_genome_genomic.fna.gz",
        mouse: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.27_GRCm39/GCF_000001635.27_GRCm39_genomic.fna.gz",
        ecoli: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/030/389/925/GCF_030389925.1_ASM3038992v1/GCF_030389925.1_ASM3038992v1_genomic.fna.gz"
    ]

    ssu_url = "https://www.arb-silva.de/fileadmin/silva_databases/release_138.1/Exports/SILVA_138.1_SSURef_NR99_tax_silva.fasta.gz"
    lsu_url = "https://www.arb-silva.de/fileadmin/silva_databases/release_138.1/Exports/SILVA_138.1_LSURef_NR99_tax_silva.fasta.gz"

@@ -25,6 +29,8 @@ params {
    contaminants = "${projectDir}/ref/contaminants.fasta.gz"
    genome_patterns_exclude = "${projectDir}/ref/hv_patterns_exclude.txt"
    kraken_db = "s3://genome-idx/kraken/k2_standard_20240605.tar.gz" // Path to tarball containing Kraken reference DB
    blast_db_name = "core_nt"
    ncbi_viral_params = "--section genbank"

    // Other input values
    virus_taxid = "10239"