Merge branch 'dev' into single-read-profile

naobservatory · Dec 21, 2024 · ae1fe8a · ae1fe8a
2 parents 6b1da71 + ab616c0
commit ae1fe8a
Show file tree

Hide file tree

Showing 17 changed files with 82 additions and 4 deletions.
diff --git a/.github/workflows/end-to-end.yml b/.github/workflows/end-to-end.yml
@@ -56,6 +56,33 @@ jobs:
       - name: Run run workflow
         run: nf-test test --tag run --verbose
 
+  test-run-output:  # CI job: runs the nf-test cases tagged `run_output`
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up JDK 11
+        uses: actions/setup-java@v4
+        with:
+          java-version: '11'
+          distribution: 'adopt'  # NOTE(review): setup-java recommends 'temurin' over the legacy 'adopt' alias; confirm before switching
+
+      - name: Setup Nextflow latest (stable)
+        uses: nf-core/setup-nextflow@v1
+        with:
+          version: "latest"
+
+      - name: Install nf-test
+        run: |
+          wget -qO- https://get.nf-test.com | bash
+          sudo mv nf-test /usr/local/bin/
+
+      - name: Run run workflow output test  # distinct from the `run` job's "Run run workflow" step
+        run: nf-test test --tag run_output --verbose
+
+
   test-validation:
     runs-on: ubuntu-latest
     timeout-minutes: 5

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,15 @@
 # v2.5.3 (in progress)
 - Added new LOAD_SAMPLESHEET subworkflow to centralize samplesheet processing
+- Testing infrastructure
+  - Split up the tests in `End-to-end MGS workflow test` so that they can be run in parallel on GitHub Actions.
+  - Implemented an end-to-end test that checks if the RUN workflow produces the correct output. The correct output for the test has been saved in `test-data/gold-standard-results` so that the user can diff the output of their test with the correct output to check where their pipeline might be failing.
 - Began development of single-end read processing (still in progress)
     - Restructured RAW, CLEAN, QC, TAXONOMY, and PROFILE workflows to handle both single-end and paired-end reads
     - Added new FASTP_SINGLE, TRUNCATE_CONCAT_SINGLE, BBDUK_SINGLE, CONCAT_GROUP_SINGLE, SUBSET_READS_SINGLE and SUBSET_READS_SINGLE_TARGET processes to handle single-end reads
     - Created separate end-to-end test workflow for single-end processing (which will be removed once single-end processing is fully integrated)
     - Modified samplesheet handling to support both single-end and paired-end data
     - Updated generate_samplesheet.sh to handle single-end data with --single_end flag
+    - 
     - Added read_type.config to handle single-end vs paired-end settings (set automatically based on samplesheet format)
     - Created run_dev_se.config and run_dev_se.nf for single-end development testing (which will be removed once single-end processing is fully integrated)
     - Added single-end samplesheet to test-data

diff --git a/README.md b/README.md
@@ -318,11 +318,14 @@ If running on Batch, a good process for starting the pipeline on a new dataset i
 During the development process, we now request that users run the pipeline using `nf-test` locally before making pull requests (a test will be run automatically on the PR, but it's often useful to run it locally first). To do this, you need an EC2 instance that is large enough. We recommend an `m5.xlarge` with at least `32GB` of EBS storage, as this machine closely reflects the VMs on GitHub Actions. Once you have an instance, run `nf-test test tests/main.nf.test`, which will run all workflows of the pipeline and check that they run to completion. If you want to run a specific workflow, use the following commands:
 
 ```
-nf-test run --tag index  # Runs the index workflow
-nf-test run --tag run     # Runs the run workflow
-nf-test run --tag validation # Runs the validation workflow
+nf-test test --tag index  # Runs the index workflow
+nf-test test --tag run     # Runs the run workflow
+nf-test test --tag validation # Runs the validation workflow
+nf-test test --tag run_output # Runs the run workflow with the test that verifies that the output files are correct
 ```
 
+The expected results for the run workflow can be found in the `test-data/gold-standard-results` directory. Should the `run_output` test fail, you can diff the output files of that test against the files in this directory to find the differences.
+
 Importantly, make sure to periodically delete docker images to free up space on your instance. You can do this by running the following command, although note that this will delete all docker images:
 
 ```

diff --git a/test-data/gold-standard-results/blast_hits_paired.tsv.gz b/test-data/gold-standard-results/blast_hits_paired.tsv.gz
diff --git a/test-data/gold-standard-results/bracken_reports_merged.tsv.gz b/test-data/gold-standard-results/bracken_reports_merged.tsv.gz
diff --git a/test-data/gold-standard-results/kraken_reports_merged.tsv.gz b/test-data/gold-standard-results/kraken_reports_merged.tsv.gz
diff --git a/test-data/gold-standard-results/qc_adapter_stats.tsv.gz b/test-data/gold-standard-results/qc_adapter_stats.tsv.gz
diff --git a/test-data/gold-standard-results/qc_basic_stats.tsv.gz b/test-data/gold-standard-results/qc_basic_stats.tsv.gz
diff --git a/test-data/gold-standard-results/qc_quality_base_stats.tsv.gz b/test-data/gold-standard-results/qc_quality_base_stats.tsv.gz
diff --git a/test-data/gold-standard-results/qc_quality_sequence_stats.tsv.gz b/test-data/gold-standard-results/qc_quality_sequence_stats.tsv.gz
diff --git a/test-data/gold-standard-results/virus_clade_counts.tsv.gz b/test-data/gold-standard-results/virus_clade_counts.tsv.gz
diff --git a/test-data/gold-standard-results/virus_hits_1.fasta.gz b/test-data/gold-standard-results/virus_hits_1.fasta.gz
diff --git a/test-data/gold-standard-results/virus_hits_2.fasta.gz b/test-data/gold-standard-results/virus_hits_2.fasta.gz
diff --git a/test-data/gold-standard-results/virus_hits_db.tsv.gz b/test-data/gold-standard-results/virus_hits_db.tsv.gz
diff --git a/tests/main.nf.test b/tests/main.nf.test
@@ -36,4 +36,26 @@ nextflow_pipeline {
         }
     }
 
+    test("Test run workflow output") {  // checks the run workflow's result files against a stored snapshot
+        config "tests/run.config"
+        tag "run_output"  // selects this test via `nf-test test --tag run_output`
+
+        then {
+            assert workflow.success
+            assert snapshot(  // compared with the "run_output" entry in tests/main.nf.test.snap; keep the argument order in sync with that entry
+                path("${launchDir}/output/results/bracken_reports_merged.tsv.gz"),
+                path("${launchDir}/output/results/kraken_reports_merged.tsv.gz"),
+                path("${launchDir}/output/results/qc_adapter_stats.tsv.gz"),
+                path("${launchDir}/output/results/qc_basic_stats.tsv.gz"),
+                path("${launchDir}/output/results/qc_quality_base_stats.tsv.gz"),
+                path("${launchDir}/output/results/qc_quality_sequence_stats.tsv.gz"),
+                path("${launchDir}/output/results/virus_clade_counts.tsv.gz"),
+                path("${launchDir}/output/results/virus_hits_db.tsv.gz"),
+                path("${launchDir}/output/results/blast_hits_paired.tsv.gz"),
+                path("${launchDir}/output/results/virus_hits_1.fasta.gz"),
+                path("${launchDir}/output/results/virus_hits_2.fasta.gz"),
+            ).match("run_output")  // name of the snapshot entry to match
+        }
+
+    }
 }
diff --git a/tests/main.nf.test.snap b/tests/main.nf.test.snap
@@ -0,0 +1,22 @@
+{
+    "run_output": {
+        "content": [
+            "bracken_reports_merged.tsv.gz:md5,d116fc06ea955409463686df223ec413",
+            "kraken_reports_merged.tsv.gz:md5,5d04e8270bfddc7f435bcc0a76bafbcb",
+            "qc_adapter_stats.tsv.gz:md5,a6441f841e6e1743067f6eb1bb68a3ef",
+            "qc_basic_stats.tsv.gz:md5,fd1a20ad7bb06eecaaa9cc51b81efa8d",
+            "qc_quality_base_stats.tsv.gz:md5,1e75fe721479280f83d7e2f8c9fa975b",
+            "qc_quality_sequence_stats.tsv.gz:md5,0d835c1d9417df961b4aadd7ac42137d",
+            "virus_clade_counts.tsv.gz:md5,3fcf57def952c95bf43cad8cfb268926",
+            "virus_hits_db.tsv.gz:md5,2232a0cfa4a2812621bb852c276ef9a1",
+            "blast_hits_paired.tsv.gz:md5,f2d110487d36ca58b1232f6d02aa69af",
+            "virus_hits_1.fasta.gz:md5,6024105b15a879de8616e54cb2dc3bb0",
+            "virus_hits_2.fasta.gz:md5,a08a14e81a6277d56ec16d04f3be7b8e"
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "24.10.1"
+        },
+        "timestamp": "2024-12-20T21:16:59.561007756"
+    }
+}
diff --git a/tests/run_validation.config b/tests/run_validation.config
@@ -10,7 +10,7 @@ params {
     ref_dir = "s3://nao-testing/index-test/output" // Reference/index directory (generated by index workflow)
 
     // Files
-    viral_tsv_collapsed = "s3://nao-testing/gold-standard-test/results/hv/hv_hits_putative_collapsed.tsv.gz"
+    viral_tsv_collapsed = "${projectDir}/test-data/gold-standard-results/virus_hits_db.tsv.gz"
     viral_fasta_1 = ""
     viral_fasta_2 = ""