Merge pull request #80 from naobservatory/single-read-raw-clean

Adding single-read functionality to RAW and CLEAN

willbradshaw authored Dec 20, 2024
2 parents fbeab45 + be30318 commit 3a4c2f1
Showing 25 changed files with 451 additions and 70 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/end-to-end-se.yml
@@ -0,0 +1,32 @@
name: End-to-end MGS workflow test for single-end run

on: [pull_request]

jobs:
test-run-dev-se:
runs-on: ubuntu-latest
timeout-minutes: 10

steps:
- name: Checkout
uses: actions/checkout@v4


- name: Set up JDK 11
uses: actions/setup-java@v4
with:
java-version: '11'
distribution: 'adopt'

- name: Setup Nextflow latest (stable)
uses: nf-core/setup-nextflow@v1
with:
version: "latest"

- name: Install nf-test
run: |
wget -qO- https://get.nf-test.com | bash
sudo mv nf-test /usr/local/bin/
- name: Run run_dev_se workflow
run: nf-test test --tag run_dev_se --verbose
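For local debugging, the new CI job above can be reproduced from a repository checkout. A minimal sketch, assuming Java and Nextflow are already installed, using the same commands as the workflow steps:

# Install nf-test (same commands as the CI step above)
wget -qO- https://get.nf-test.com | bash
sudo mv nf-test /usr/local/bin/

# Run only the single-end end-to-end test, mirroring the final CI step
nf-test test --tag run_dev_se --verbose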
2 changes: 1 addition & 1 deletion .github/workflows/end-to-end.yml
@@ -81,4 +81,4 @@ jobs:
sudo mv nf-test /usr/local/bin/
- name: Run run_validation workflow
run: nf-test test --tag validation --verbose
run: nf-test test --tag validation --verbose
2 changes: 1 addition & 1 deletion .gitignore
@@ -9,4 +9,4 @@ test/.nextflow*
pipeline_report.txt

.nf-test/
.nf-test.log
.nf-test.log
12 changes: 12 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,15 @@
# v2.5.3 (in progress)
- Added new LOAD_SAMPLESHEET subworkflow to centralize samplesheet processing
- Began development of single-end read processing (still in progress)
- Restructured RAW, CLEAN, and QC workflows to handle both single-end and paired-end reads
- Added new FASTP_SINGLE and TRUNCATE_CONCAT_SINGLE processes to handle single-end reads
- Created separate end-to-end test workflow for single-end processing (which will be removed once single-end processing is fully integrated)
- Modified samplesheet handling to support both single-end and paired-end data (header formats for both are sketched after this list)
- Updated generate_samplesheet.sh to handle single-end data with --single_end flag
- Added read_type.config to handle single-end vs paired-end settings (set automatically based on samplesheet format)
- Created run_dev_se.config and run_dev_se.nf for single-end development testing (which will be removed once single-end processing is fully integrated)
- Added single-end samplesheet to test-data
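The two samplesheet layouts referenced in this list differ only in their header row. The headers below are the ones written by generate_samplesheet.sh (see its diff further down); the sample rows and paths are hypothetical:

# Paired-end samplesheet: header is sample,fastq_1,fastq_2
cat > samplesheet_pe.csv <<'EOF'
sample,fastq_1,fastq_2
sample1,data/sample1_R1_001.fastq.gz,data/sample1_R2_001.fastq.gz
EOF

# Single-end samplesheet: header is sample,fastq
cat > samplesheet_se.csv <<'EOF'
sample,fastq
sample1,data/sample1.fastq.gz
EOF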

# v2.5.2
- Changes to default read filtering:
- Relaxed FASTP quality filtering (`--cut_mean_quality` and `--average_qual` reduced from 25 to 20).
91 changes: 74 additions & 17 deletions bin/generate_samplesheet.sh
@@ -1,5 +1,6 @@
#!/bin/bash


set -u
set -e

@@ -10,10 +11,28 @@ dir_path=""
forward_suffix=""
reverse_suffix=""
s3=0
single_end=0
output_path="samplesheet.csv" # Default output path
group_file="" # Optional parameter for the group file
group_across_illumina_lanes=false

# Function to print usage
print_usage() {
echo "Usage:"
echo "For paired-end reads:"
echo " $0 --dir_path <path> --forward_suffix <suffix> --reverse_suffix <suffix> [--s3] [--output_path <path>]"
echo "For single-end reads:"
echo " $0 --dir_path <path> --single_end [--s3] [--output_path <path>]"
echo
echo "Options:"
echo " --dir_path Directory containing FASTQ files"
echo " --forward_suffix Suffix for forward reads (required for paired-end only)"
echo " --reverse_suffix Suffix for reverse reads (required for paired-end only)"
echo " --single_end Flag for single-end data"
echo " --s3 Flag for S3 bucket access"
echo " --output_path Output path for samplesheet (default: samplesheet.csv)"
}

# Parse command-line arguments
while [[ $# -gt 0 ]]; do
case $1 in
@@ -33,10 +52,18 @@ while [[ $# -gt 0 ]]; do
s3=1
shift
;;
--single_end)
single_end=1
shift
;;
--output_path)
output_path="$2"
shift 2
;;
--help)
print_usage
exit 0
;;
--group_file) # Optional group file
group_file="$2"
shift 2
@@ -47,20 +74,22 @@ while [[ $# -gt 0 ]]; do
;;
*)
echo "Unknown option: $1"
print_usage
exit 1
;;
esac
done

# Check if all required parameters are provided
if [[ -z "$dir_path" || -z "$forward_suffix" || -z "$reverse_suffix" ]]; then
echo "Error: dir_path, forward_suffix, and reverse_suffix are required."
if [[ -z "$dir_path" || -z "$single_end" ]]; then
echo "Error: dir_path and single_end are required."
echo -e "\nUsage: $0 [options]"
echo -e "\nRequired arguments:"
echo -e " --dir_path <path> Directory containing FASTQ files"
echo -e " --forward_suffix <suffix> Suffix identifying forward reads, supports regex (e.g., '_R1_001' or '_1')"
echo -e " --reverse_suffix <suffix> Suffix identifying reverse reads, supports regex (e.g., '_R2_001' or '_2')"
echo -e " --single_end Flag for single-end data"
echo -e "\nOptional arguments:"
echo -e " --forward_suffix <suffix> When single_end is 0, suffix identifying forward reads, supports regex (e.g., '_R1_001' or '_1')"
echo -e " --reverse_suffix <suffix> When single_end is 0, suffix identifying reverse reads, supports regex (e.g., '_R2_001' or '_2')"
echo -e " --s3 Use if files are stored in S3 bucket"
echo -e " --output_path <path> Output path for samplesheet [default: samplesheet.csv]"
echo -e " --group_file <path> Path to group file for sample grouping [header column must have the names 'sample,group' in that order; additional columns may be included, however they will be ignored by the script]"
@@ -74,15 +103,28 @@ if $group_across_illumina_lanes && [[ -n "$group_file" ]]; then
exit 1
fi

if [ $single_end -eq 0 ]; then
# Paired-end validation
if [[ -z "$forward_suffix" || -z "$reverse_suffix" ]]; then
echo "Error: forward_suffix and reverse_suffix are required for paired-end reads."
print_usage
exit 1
fi
fi

# Display the parameters
echo "Parameters:"
echo "dir_path: $dir_path"
echo "forward_suffix: $forward_suffix"
echo "reverse_suffix: $reverse_suffix"
echo "single_end: $single_end"
echo "s3: $s3"
echo "output_path: $output_path"
echo "group_file: $group_file"
echo "group_across_illumina_lanes: $group_across_illumina_lanes"
if [ $single_end -eq 0 ]; then
echo "forward_suffix: $forward_suffix"
echo "reverse_suffix: $reverse_suffix"
fi



#### EXAMPLES ####
@@ -109,30 +151,45 @@ echo "group_across_illumina_lanes: $group_across_illumina_lanes"
# Create a temporary file for the initial samplesheet
temp_samplesheet=$(mktemp)

echo "sample,fastq_1,fastq_2" > "$temp_samplesheet"
# Create header based on single_end flag
if [ $single_end -eq 0 ]; then
echo "sample,fastq_1,fastq_2" > "$temp_samplesheet"
else
echo "sample,fastq" > "$temp_samplesheet"
fi
echo "group_file: $group_file"


# Ensure dir_path ends with a '/'
if [[ "$dir_path" != */ ]]; then
dir_path="${dir_path}/"
fi

listing=0

# Get file listing based on s3 flag
if [ $s3 -eq 1 ]; then
listing=$(aws s3 ls ${dir_path} | awk '{print $4}')
else
listing=$(ls ${dir_path} | awk '{print $1}')
fi

echo "$listing" | grep "${forward_suffix}\.fastq\.gz$" | while read -r forward_read; do
sample=$(echo "$forward_read" | sed -E "s/${forward_suffix}\.fastq\.gz$//")
reverse_read=$(echo "$listing" | grep "${sample}${reverse_suffix}\.fastq\.gz$")
# If sample + reverse_suffix exists in s3_listing, then add to samplesheet
if [ -n "$reverse_read" ]; then
echo "$sample,${dir_path}${forward_read},${dir_path}${reverse_read}" >> "$temp_samplesheet"
fi
done
# Process files based on single_end flag
if [ $single_end -eq 0 ]; then
# Paired-end processing
echo "$listing" | grep "${forward_suffix}\.fastq\.gz$" | while read -r forward_read; do
sample=$(echo "$forward_read" | sed -E "s/${forward_suffix}\.fastq\.gz$//")
reverse_read=$(echo "$listing" | grep "${sample}${reverse_suffix}\.fastq\.gz$")
# If sample + reverse_suffix exists in s3_listing, then add to samplesheet
if [ -n "$reverse_read" ]; then
echo "$sample,${dir_path}${forward_read},${dir_path}${reverse_read}" >> "$temp_samplesheet"
fi
done
else
# Single-end processing - just process all fastq.gz files
echo "$listing" | grep "\.fastq\.gz$" | while read -r read_file; do
sample=$(echo "$read_file" | sed -E "s/\.fastq\.gz$//")
echo "$sample,${dir_path}${read_file}" >> "$temp_samplesheet"
done
fi

# Check if group file is provided
if [[ -n "$group_file" ]]; then
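Taken together, example invocations for both modes (directory paths and suffixes here are hypothetical; the flags are the ones defined in the script above):

# Paired-end: forward/reverse suffixes identify read pairs
bin/generate_samplesheet.sh \
  --dir_path s3://example-bucket/raw-reads/ \
  --forward_suffix _R1_001 \
  --reverse_suffix _R2_001 \
  --s3 \
  --output_path samplesheet.csv

# Single-end: every *.fastq.gz file in the directory becomes one row
bin/generate_samplesheet.sh \
  --dir_path raw-reads/ \
  --single_end \
  --output_path samplesheet.csv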
6 changes: 6 additions & 0 deletions configs/read_type.config
@@ -0,0 +1,6 @@
// Universal flags for read type (single-end vs paired-end)

params {
// Whether the underlying data is paired-end or single-end
single_end = new File(params.sample_sheet).text.readLines()[0].contains('fastq_2') ? false : true
}
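The check simply asks whether the samplesheet header names a fastq_2 column. An equivalent shell sketch of the same logic (illustration only, not part of the pipeline):

# Mirrors read_type.config: single-end iff the header lacks 'fastq_2'
if head -n 1 samplesheet.csv | grep -q 'fastq_2'; then
  echo "paired-end samplesheet (single_end = false)"
else
  echo "single-end samplesheet (single_end = true)"
fi

Note that the Groovy version reads the sheet with new File, so it assumes the samplesheet path is locally accessible at configuration time.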
3 changes: 2 additions & 1 deletion configs/run.config
@@ -7,7 +7,7 @@ params {

// Directories
base_dir = "s3://nao-mgs-wb/test-batch" // Parent for working and output directories (can be S3)
ref_dir = "s3://nao-mgs-wb/index-20241113/output" // Reference/index directory (generated by index workflow)
ref_dir = "s3://nao-mgs-wb/index/20241209/output" // Reference/index directory (generated by index workflow)

// Files
sample_sheet = "${launchDir}/samplesheet.csv" // Path to library TSV
@@ -31,4 +31,5 @@ includeConfig "${projectDir}/configs/containers.config"
includeConfig "${projectDir}/configs/resources.config"
includeConfig "${projectDir}/configs/profiles.config"
includeConfig "${projectDir}/configs/output.config"
includeConfig "${projectDir}/configs/read_type.config"
process.queue = "will-batch-queue" // AWS Batch job queue
34 changes: 34 additions & 0 deletions configs/run_dev_se.config
@@ -0,0 +1,34 @@
/************************************************
| CONFIGURATION FILE FOR NAO VIRAL MGS WORKFLOW |
************************************************/

params {
mode = "run_dev_se"

// Directories
base_dir = "s3://nao-mgs-simon/test_single_read" // Parent for working and output directories (can be S3)
ref_dir = "s3://nao-mgs-wb/index/20241209/output" // Reference/index directory (generated by index workflow)

// Files
sample_sheet = "${launchDir}/samplesheet.csv" // Path to library TSV
adapters = "${projectDir}/ref/adapters.fasta" // Path to adapter file for adapter trimming

// Numerical
grouping = false // Whether to group samples by 'group' column in samplesheet
n_reads_trunc = 0 // Number of reads per sample to run through pipeline (0 = all reads)
n_reads_profile = 1000000 // Number of reads per sample to run through taxonomic profiling
bt2_score_threshold = 20 // Normalized score threshold for HV calling (typically 15 or 20)
blast_hv_fraction = 0 // Fraction of putative HV reads to BLAST vs nt (0 = don't run BLAST)
kraken_memory = "128 GB" // Memory needed to safely load Kraken DB
quality_encoding = "phred33" // FASTQ quality encoding (probably phred33, maybe phred64)
fuzzy_match_alignment_duplicates = 0 // Fuzzy matching the start coordinate of reads for identification of duplicates through alignment (0 = exact matching; options are 0, 1, or 2)
host_taxon = "vertebrate"
}

includeConfig "${projectDir}/configs/logging.config"
includeConfig "${projectDir}/configs/containers.config"
includeConfig "${projectDir}/configs/resources.config"
includeConfig "${projectDir}/configs/profiles.config"
includeConfig "${projectDir}/configs/output.config"
includeConfig "${projectDir}/configs/read_type.config"
process.queue = "simon-batch-queue" // AWS Batch job queue
3 changes: 2 additions & 1 deletion configs/run_validation.config
@@ -7,7 +7,7 @@ params {

// Directories
base_dir = "s3://nao-mgs-wb/test-remote" // Parent for working and output directories (can be S3)
ref_dir = "s3://nao-mgs-wb/index-20241113/output" // Reference/index directory (generated by index workflow)
ref_dir = "s3://nao-mgs-wb/index/20241209/output" // Reference/index directory (generated by index workflow)

// Files
viral_tsv_collapsed = "${base_dir}/results/virus_hits_db.tsv.gz"
@@ -24,4 +24,5 @@ includeConfig "${projectDir}/configs/containers.config"
includeConfig "${projectDir}/configs/resources.config"
includeConfig "${projectDir}/configs/profiles.config"
includeConfig "${projectDir}/configs/output.config"
includeConfig "${projectDir}/configs/read_type.config"
process.queue = "will-batch-queue" // AWS Batch job queue
3 changes: 3 additions & 0 deletions main.nf
@@ -1,6 +1,7 @@
include { RUN } from "./workflows/run"
include { RUN_VALIDATION } from "./workflows/run_validation"
include { INDEX } from "./workflows/index"
include { RUN_DEV_SE } from "./workflows/run_dev_se"

workflow {
if (params.mode == "index") {
@@ -9,6 +10,8 @@ workflow {
RUN()
} else if (params.mode == "run_validation") {
RUN_VALIDATION()
} else if (params.mode == "run_dev_se") {
RUN_DEV_SE()
}
}

38 changes: 37 additions & 1 deletion modules/local/fastp/main.nf
@@ -1,4 +1,4 @@
process FASTP {
process FASTP_PAIRED {
label "max"
label "fastp"
input:
@@ -32,6 +32,41 @@ process FASTP {
'''
}

process FASTP_SINGLE {
label "max"
label "fastp"
input:
// reads is a one-element list containing the single-end read file
tuple val(sample), path(reads)
path(adapters)
output:
tuple val(sample), path("${sample}_fastp.fastq.gz"), emit: reads
tuple val(sample), path("${sample}_fastp_failed.fastq.gz"), emit: failed
tuple val(sample), path("${sample}_fastp.{json,html}"), emit: log
shell:
/* Cleaning not done in CUTADAPT or TRIMMOMATIC:
* Higher quality threshold for sliding window trimming;
* Removing poly-X tails;
* Automatic adapter detection;
* Filter low complexity reads.
*/
'''
# Define paths and subcommands
of=!{sample}_fastp_failed.fastq.gz
oj=!{sample}_fastp.json
oh=!{sample}_fastp.html
ad=!{adapters}
o=!{sample}_fastp.fastq.gz
io="--in1 !{reads[0]} --out1 ${o} --failed_out ${of} --html ${oh} --json ${oj} --adapter_fasta ${ad}"
par="--cut_front --cut_tail --correction --detect_adapter_for_pe --trim_poly_x --cut_mean_quality 20 --average_qual 20 --qualified_quality_phred 20 --verbose --dont_eval_duplication --thread !{task.cpus} --low_complexity_filter"
# Execute
fastp ${io} ${par}
'''
}



// Run FASTP for adapter trimming but don't trim for quality
process FASTP_NOTRIM {
label "max"
@@ -66,3 +101,4 @@ process FASTP_NOTRIM {
fastp ${io} ${par}
'''
}
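For reference, the single-end fastp command that the FASTP_SINGLE shell block assembles, written out for a hypothetical sample named sample1 running on four threads:

fastp \
  --in1 sample1.fastq.gz \
  --out1 sample1_fastp.fastq.gz \
  --failed_out sample1_fastp_failed.fastq.gz \
  --html sample1_fastp.html \
  --json sample1_fastp.json \
  --adapter_fasta adapters.fasta \
  --cut_front --cut_tail --trim_poly_x \
  --cut_mean_quality 20 --average_qual 20 --qualified_quality_phred 20 \
  --verbose --dont_eval_duplication --thread 4 --low_complexity_filter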

5 changes: 3 additions & 2 deletions modules/local/summarizeMultiqcPair/main.nf
@@ -4,10 +4,11 @@ process SUMMARIZE_MULTIQC_PAIR {
label "single"
input:
tuple val(stage), val(sample), path(multiqc_data)
val(single_end)
output:
tuple path("${stage}_${sample}_qc_basic_stats.tsv.gz"), path("${stage}_${sample}_qc_adapter_stats.tsv.gz"), path("${stage}_${sample}_qc_quality_base_stats.tsv.gz"), path("${stage}_${sample}_qc_quality_sequence_stats.tsv.gz")
shell:
'''
summarize-multiqc-pair.R -i !{multiqc_data} -s !{stage} -S !{sample} -o ${PWD}
summarize-multiqc-pair.R -i !{multiqc_data} -s !{stage} -S !{sample} -r !{single_end} -o ${PWD}
'''
}
}
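The added single_end value reaches the R script as a literal true/false flag; a hypothetical expanded invocation (stage and sample names invented for illustration):

summarize-multiqc-pair.R -i multiqc_data -s cleaned -S sample1 -r false -o "$PWD"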