Merge pull request #80 from naobservatory/single-read-raw-clean

Adding single-read functionality to RAW and CLEAN

willbradshaw authored Dec 20, 2024
2 parents fbeab45 + be30318 commit 3a4c2f1
Showing 25 changed files with 451 additions and 70 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/end-to-end-se.yml
@@ -0,0 +1,32 @@
name: End-to-end MGS workflow test for single-end run

on: [pull_request]

jobs:
test-run-dev-se:
runs-on: ubuntu-latest
timeout-minutes: 10

steps:
- name: Checkout
uses: actions/checkout@v4


- name: Set up JDK 11
uses: actions/setup-java@v4
with:
java-version: '11'
distribution: 'adopt'

- name: Setup Nextflow latest (stable)
uses: nf-core/setup-nextflow@v1
with:
version: "latest"

- name: Install nf-test
run: |
wget -qO- https://get.nf-test.com | bash
sudo mv nf-test /usr/local/bin/
- name: Run run_dev_se workflow
run: nf-test test --tag run_dev_se --verbose
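For local debugging, the new CI job above can be reproduced from a repository checkout. A minimal sketch, assuming Java and Nextflow are already installed, using the same commands as the workflow steps:

# Install nf-test (same commands as the CI step above)
wget -qO- https://get.nf-test.com | bash
sudo mv nf-test /usr/local/bin/

# Run only the single-end end-to-end test, mirroring the final CI step
nf-test test --tag run_dev_se --verbose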
2 changes: 1 addition & 1 deletion .github/workflows/end-to-end.yml
@@ -81,4 +81,4 @@ jobs:
sudo mv nf-test /usr/local/bin/
- name: Run run_validation workflow
run: nf-test test --tag validation --verbose
run: nf-test test --tag validation --verbose
2 changes: 1 addition & 1 deletion .gitignore
@@ -9,4 +9,4 @@ test/.nextflow*
pipeline_report.txt

.nf-test/
.nf-test.log
.nf-test.log
12 changes: 12 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,15 @@
# v2.5.3 (in progress)
- Added new LOAD_SAMPLESHEET subworkflow to centralize samplesheet processing
- Began development of single-end read processing (still in progress)
- Restructured RAW, CLEAN, and QC workflows to handle both single-end and paired-end reads
- Added new FASTP_SINGLE and TRUNCATE_CONCAT_SINGLE processes to handle single-end reads
- Created separate end-to-end test workflow for single-end processing (which will be removed once single-end processing is fully integrated)
- Modified samplesheet handling to support both single-end and paired-end data (header formats for both are sketched after this list)
- Updated generate_samplesheet.sh to handle single-end data with --single_end flag
- Added read_type.config to handle single-end vs paired-end settings (set automatically based on samplesheet format)
- Created run_dev_se.config and run_dev_se.nf for single-end development testing (which will be removed once single-end processing is fully integrated)
- Added single-end samplesheet to test-data
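The two samplesheet layouts referenced in this list differ only in their header row. The headers below are the ones written by generate_samplesheet.sh (see its diff further down); the sample rows and paths are hypothetical:

# Paired-end samplesheet: header is sample,fastq_1,fastq_2
cat > samplesheet_pe.csv <<'EOF'
sample,fastq_1,fastq_2
sample1,data/sample1_R1_001.fastq.gz,data/sample1_R2_001.fastq.gz
EOF

# Single-end samplesheet: header is sample,fastq
cat > samplesheet_se.csv <<'EOF'
sample,fastq
sample1,data/sample1.fastq.gz
EOF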

# v2.5.2
- Changes to default read filtering:
- Relaxed FASTP quality filtering (`--cut_mean_quality` and `--average_qual` reduced from 25 to 20).
91 changes: 74 additions & 17 deletions bin/generate_samplesheet.sh
@@ -1,5 +1,6 @@
#!/bin/bash


set -u
set -e

@@ -10,10 +11,28 @@ dir_path=""
forward_suffix=""
reverse_suffix=""
s3=0
single_end=0
output_path="samplesheet.csv" # Default output path
group_file="" # Optional parameter for the group file
group_across_illumina_lanes=false

# Function to print usage
print_usage() {
echo "Usage:"
echo "For paired-end reads:"
echo " $0 --dir_path <path> --forward_suffix <suffix> --reverse_suffix <suffix> [--s3] [--output_path <path>]"
echo "For single-end reads:"
echo " $0 --dir_path <path> --single_end [--s3] [--output_path <path>]"
echo
echo "Options:"
echo " --dir_path Directory containing FASTQ files"
echo " --forward_suffix Suffix for forward reads (required for paired-end only)"
echo " --reverse_suffix Suffix for reverse reads (required for paired-end only)"
echo " --single_end Flag for single-end data"
echo " --s3 Flag for S3 bucket access"
echo " --output_path Output path for samplesheet (default: samplesheet.csv)"
}

# Parse command-line arguments
while [[ $# -gt 0 ]]; do
case $1 in
@@ -33,10 +52,18 @@ while [[ $# -gt 0 ]]; do
s3=1
shift
;;
--single_end)
single_end=1
shift
;;
--output_path)
output_path="$2"
shift 2
;;
--help)
print_usage
exit 0
;;
--group_file) # Optional group file
group_file="$2"
shift 2
@@ -47,20 +74,22 @@ while [[ $# -gt 0 ]]; do
;;
*)
echo "Unknown option: $1"
print_usage
exit 1
;;
esac
done

# Check if all required parameters are provided
if [[ -z "$dir_path" || -z "$forward_suffix" || -z "$reverse_suffix" ]]; then
echo "Error: dir_path, forward_suffix, and reverse_suffix are required."
if [[ -z "$dir_path" || -z "$single_end" ]]; then
echo "Error: dir_path and single_end are required."
echo -e "\nUsage: $0 [options]"
echo -e "\nRequired arguments:"
echo -e " --dir_path <path> Directory containing FASTQ files"
echo -e " --forward_suffix <suffix> Suffix identifying forward reads, supports regex (e.g., '_R1_001' or '_1')"
echo -e " --reverse_suffix <suffix> Suffix identifying reverse reads, supports regex (e.g., '_R2_001' or '_2')"
echo -e " --single_end Flag for single-end data"
echo -e "\nOptional arguments:"
echo -e " --forward_suffix <suffix> When single_end is 0, suffix identifying forward reads, supports regex (e.g., '_R1_001' or '_1')"
echo -e " --reverse_suffix <suffix> When single_end is 0, suffix identifying reverse reads, supports regex (e.g., '_R2_001' or '_2')"
echo -e " --s3 Use if files are stored in S3 bucket"
echo -e " --output_path <path> Output path for samplesheet [default: samplesheet.csv]"
echo -e " --group_file <path> Path to group file for sample grouping [header column must have the names 'sample,group' in that order; additional columns may be included, however they will be ignored by the script]"
@@ -74,15 +103,28 @@ if $group_across_illumina_lanes && [[ -n "$group_file" ]]; then
exit 1
fi

if [ $single_end -eq 0 ]; then
# Paired-end validation
if [[ -z "$forward_suffix" || -z "$reverse_suffix" ]]; then
echo "Error: forward_suffix and reverse_suffix are required for paired-end reads."
print_usage
exit 1
fi
fi

# Display the parameters
echo "Parameters:"
echo "dir_path: $dir_path"
echo "forward_suffix: $forward_suffix"
echo "reverse_suffix: $reverse_suffix"
echo "single_end: $single_end"
echo "s3: $s3"
echo "output_path: $output_path"
echo "group_file: $group_file"
echo "group_across_illumina_lanes: $group_across_illumina_lanes"
if [ $single_end -eq 0 ]; then
echo "forward_suffix: $forward_suffix"
echo "reverse_suffix: $reverse_suffix"
fi



#### EXAMPLES ####
@@ -109,30 +151,45 @@ echo "group_across_illumina_lanes: $group_across_illumina_lanes"
# Create a temporary file for the initial samplesheet
temp_samplesheet=$(mktemp)

echo "sample,fastq_1,fastq_2" > "$temp_samplesheet"
# Create header based on single_end flag
if [ $single_end -eq 0 ]; then
echo "sample,fastq_1,fastq_2" > "$temp_samplesheet"
else
echo "sample,fastq" > "$temp_samplesheet"
fi
echo "group_file: $group_file"


# Ensure dir_path ends with a '/'
if [[ "$dir_path" != */ ]]; then
dir_path="${dir_path}/"
fi

listing=0

# Get file listing based on s3 flag
if [ $s3 -eq 1 ]; then
listing=$(aws s3 ls ${dir_path} | awk '{print $4}')
else
listing=$(ls ${dir_path} | awk '{print $1}')
fi

echo "$listing" | grep "${forward_suffix}\.fastq\.gz$" | while read -r forward_read; do
sample=$(echo "$forward_read" | sed -E "s/${forward_suffix}\.fastq\.gz$//")
reverse_read=$(echo "$listing" | grep "${sample}${reverse_suffix}\.fastq\.gz$")
# If sample + reverse_suffix exists in s3_listing, then add to samplesheet
if [ -n "$reverse_read" ]; then
echo "$sample,${dir_path}${forward_read},${dir_path}${reverse_read}" >> "$temp_samplesheet"
fi
done
# Process files based on single_end flag
if [ $single_end -eq 0 ]; then
# Paired-end processing
echo "$listing" | grep "${forward_suffix}\.fastq\.gz$" | while read -r forward_read; do
sample=$(echo "$forward_read" | sed -E "s/${forward_suffix}\.fastq\.gz$//")
reverse_read=$(echo "$listing" | grep "${sample}${reverse_suffix}\.fastq\.gz$")
# If sample + reverse_suffix exists in s3_listing, then add to samplesheet
if [ -n "$reverse_read" ]; then
echo "$sample,${dir_path}${forward_read},${dir_path}${reverse_read}" >> "$temp_samplesheet"
fi
done
else
# Single-end processing - just process all fastq.gz files
echo "$listing" | grep "\.fastq\.gz$" | while read -r read_file; do
sample=$(echo "$read_file" | sed -E "s/\.fastq\.gz$//")
echo "$sample,${dir_path}${read_file}" >> "$temp_samplesheet"
done
fi

# Check if group file is provided
if [[ -n "$group_file" ]]; then
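Taken together, example invocations for both modes (directory paths and suffixes here are hypothetical; the flags are the ones defined in the script above):

# Paired-end: forward/reverse suffixes identify read pairs
bin/generate_samplesheet.sh \
  --dir_path s3://example-bucket/raw-reads/ \
  --forward_suffix _R1_001 \
  --reverse_suffix _R2_001 \
  --s3 \
  --output_path samplesheet.csv

# Single-end: every *.fastq.gz file in the directory becomes one row
bin/generate_samplesheet.sh \
  --dir_path raw-reads/ \
  --single_end \
  --output_path samplesheet.csv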
6 changes: 6 additions & 0 deletions configs/read_type.config
@@ -0,0 +1,6 @@
// Universal flags for read type (single-end vs paired-end)

params {
// Whether the underlying data is paired-end or single-end
single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
}
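The check simply asks whether the samplesheet header names a fastq_2 column. An equivalent shell sketch of the same logic (illustration only, not part of the pipeline):

# Mirrors read_type.config: single-end iff the header lacks 'fastq_2'
if head -n 1 samplesheet.csv | grep -q 'fastq_2'; then
  echo "paired-end samplesheet (single_end = false)"
else
  echo "single-end samplesheet (single_end = true)"
fi

Note that the Groovy version reads the sheet with new File, so it assumes the samplesheet path is locally accessible at configuration time.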
3 changes: 2 additions & 1 deletion configs/run.config
@@ -7,7 +7,7 @@ params {

// Directories
base_dir = "s3://nao-mgs-wb/test-batch" // Parent for working and output directories (can be S3)
ref_dir = "s3://nao-mgs-wb/index-20241113/output" // Reference/index directory (generated by index workflow)
ref_dir = "s3://nao-mgs-wb/index/20241209/output" // Reference/index directory (generated by index workflow)

// Files
sample_sheet = "${launchDir}/samplesheet.csv" // Path to library TSV
@@ -31,4 +31,5 @@ includeConfig "${projectDir}/configs/containers.config"
includeConfig "${projectDir}/configs/resources.config"
includeConfig "${projectDir}/configs/profiles.config"
includeConfig "${projectDir}/configs/output.config"
includeConfig "${projectDir}/configs/read_type.config"
process.queue = "will-batch-queue" // AWS Batch job queue
34 changes: 34 additions & 0 deletions configs/run_dev_se.config
@@ -0,0 +1,34 @@
/************************************************
| CONFIGURATION FILE FOR NAO VIRAL MGS WORKFLOW |
************************************************/

params {
mode = "run_dev_se"

// Directories
base_dir = "s3://nao-mgs-simon/test_single_read" // Parent for working and output directories (can be S3)
ref_dir = "s3://nao-mgs-wb/index/20241209/output" // Reference/index directory (generated by index workflow)

// Files
sample_sheet = "${launchDir}/samplesheet.csv" // Path to library TSV
adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming

// Numerical
grouping = false // Whether to group samples by 'group' column in samplesheet
n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling
bt2_score_threshold = 20 // Normalized score threshold for HV calling (typically 15 or 20)
blast_hv_fraction = 0 // Fraction of putative HV reads to BLAST vs nt (0 = don't run BLAST)
kraken_memory = "128 GB" // Memory needed to safely load Kraken DB
quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64)
fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2)
host_taxon = "vertebrate"
}

includeConfig "${projectDir}/configs/logging.config"
includeConfig "${projectDir}/configs/containers.config"
includeConfig "${projectDir}/configs/resources.config"
includeConfig "${projectDir}/configs/profiles.config"
includeConfig "${projectDir}/configs/output.config"
includeConfig "${projectDir}/configs/read_type.config"
process.queue = "simon-batch-queue" // AWS Batch job queue
3 changes: 2 additions & 1 deletion configs/run_validation.config
@@ -7,7 +7,7 @@ params {

// Directories
base_dir = "s3://nao-mgs-wb/test-remote" // Parent for working and output directories (can be S3)
ref_dir = "s3://nao-mgs-wb/index-20241113/output" // Reference/index directory (generated by index workflow)
ref_dir = "s3://nao-mgs-wb/index/20241209/output" // Reference/index directory (generated by index workflow)

// Files
viral_tsv_collapsed = "${base_dir}/results/virus_hits_db.tsv.gz"
@@ -24,4 +24,5 @@ includeConfig "${projectDir}/configs/containers.config"
includeConfig "${projectDir}/configs/resources.config"
includeConfig "${projectDir}/configs/profiles.config"
includeConfig "${projectDir}/configs/output.config"
includeConfig "${projectDir}/configs/read_type.config"
process.queue = "will-batch-queue" // AWS Batch job queue
3 changes: 3 additions & 0 deletions main.nf
@@ -1,6 +1,7 @@
include { RUN } from "./workflows/run"
include { RUN_VALIDATION } from "./workflows/run_validation"
include { INDEX } from "./workflows/index"
include { RUN_DEV_SE } from "./workflows/run_dev_se"

workflow {
if (params.mode == "index") {
@@ -9,6 +10,8 @@ workflow {
RUN()
} else if (params.mode == "run_validation") {
RUN_VALIDATION()
} else if (params.mode == "run_dev_se") {
RUN_DEV_SE()
}
}

38 changes: 37 additions & 1 deletion modules/local/fastp/main.nf
@@ -1,4 +1,4 @@
process FASTP {
process FASTP_PAIRED {
label "max"
label "fastp"
input:
@@ -32,6 +32,41 @@ process FASTP {
'''
}

process FASTP_SINGLE {
label "max"
label "fastp"
input:
// reads is a one-element list containing the single-end read file
tuple val(sample), path(reads)
path(adapters)
output:
tuple val(sample), path("${sample}_fastp.fastq.gz"), emit: reads
tuple val(sample), path("${sample}_fastp_failed.fastq.gz"), emit: failed
tuple val(sample), path("${sample}_fastp.{json,html}"), emit: log
shell:
/* Cleaning not done in CUTADAPT or TRIMMOMATIC:
* Higher quality threshold for sliding window trimming;
* Removing poly-X tails;
* Automatic adapter detection;
* Filter low complexity reads.
*/
'''
# Define paths and subcommands
of=!{sample}_fastp_failed.fastq.gz
oj=!{sample}_fastp.json
oh=!{sample}_fastp.html
ad=!{adapters}
o=!{sample}_fastp.fastq.gz
io="--in1 !{reads[0]} --out1 ${o} --failed_out ${of} --html ${oh} --json ${oj} --adapter_fasta ${ad}"
par="--cut_front --cut_tail --correction --detect_adapter_for_pe --trim_poly_x --cut_mean_quality 20 --average_qual 20 --qualified_quality_phred 20 --verbose --dont_eval_duplication --thread !{task.cpus} --low_complexity_filter"
# Execute
fastp ${io} ${par}
'''
}



// Run FASTP for adapter trimming but don't trim for quality
process FASTP_NOTRIM {
label "max"
@@ -66,3 +101,4 @@ process FASTP_NOTRIM {
fastp ${io} ${par}
'''
}
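For reference, the single-end fastp command that the FASTP_SINGLE shell block assembles, written out for a hypothetical sample named sample1 running on four threads:

fastp \
  --in1 sample1.fastq.gz \
  --out1 sample1_fastp.fastq.gz \
  --failed_out sample1_fastp_failed.fastq.gz \
  --html sample1_fastp.html \
  --json sample1_fastp.json \
  --adapter_fasta adapters.fasta \
  --cut_front --cut_tail --trim_poly_x \
  --cut_mean_quality 20 --average_qual 20 --qualified_quality_phred 20 \
  --verbose --dont_eval_duplication --thread 4 --low_complexity_filter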

5 changes: 3 additions & 2 deletions modules/local/summarizeMultiqcPair/main.nf
@@ -4,10 +4,11 @@ process SUMMARIZE_MULTIQC_PAIR {
label "single"
input:
tuple val(stage), val(sample), path(multiqc_data)
val(single_end)
output:
tuple path("${stage}_${sample}_qc_basic_stats.tsv.gz"), path("${stage}_${sample}_qc_adapter_stats.tsv.gz"), path("${stage}_${sample}_qc_quality_base_stats.tsv.gz"), path("${stage}_${sample}_qc_quality_sequence_stats.tsv.gz")
shell:
'''
summarize-multiqc-pair.R -i !{multiqc_data} -s !{stage} -S !{sample} -o ${PWD}
summarize-multiqc-pair.R -i !{multiqc_data} -s !{stage} -S !{sample} -r !{single_end} -o ${PWD}
'''
}
}
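The added single_end value reaches the R script as a literal true/false flag; a hypothetical expanded invocation (stage and sample names invented for illustration):

summarize-multiqc-pair.R -i multiqc_data -s cleaned -S sample1 -r false -o "$PWD"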