From 33d1ca88a9fea572d3cf9119adc4e07289bc6eae Mon Sep 17 00:00:00 2001 From: Marcele Laux Date: Tue, 21 Feb 2023 11:16:04 -0300 Subject: [PATCH 01/13] Delete merge_pear_auto.sh --- 1_merge/merge_pear_auto.sh | 72 -------------------------------------- 1 file changed, 72 deletions(-) delete mode 100644 1_merge/merge_pear_auto.sh diff --git a/1_merge/merge_pear_auto.sh b/1_merge/merge_pear_auto.sh deleted file mode 100644 index cbca7d3..0000000 --- a/1_merge/merge_pear_auto.sh +++ /dev/null @@ -1,72 +0,0 @@ -#/bin/bash -module --quiet purge -module load StdEnv -module load PEAR/0.9.11-GCCcore-9.3.0 - -INPUT_F='' -INPUT_R='' -THREADS='1' -PVALUE='1' -OVERLAP='1' -OUTPUT='' -UNCALLED='0' -QUAL='20' - - -usage () { - echo "##################################################" - echo " " - echo "Merging of paired end fastq files from Illumina sequencing using Pear." - echo " " - echo "Usage: ${0} [-f R1.fq] [-r R2.fq] [-o output] [-p 0.001] [-s 20] [-t 4]" - echo "-f R1.fastq original file" - echo "-r R2.fastq original file" - echo "-o output name for the assembled fastq file" - echo "-p p-value: statistical test for true assembly. Lower p-value means less possibility of overlapping by chance. Options are: 0.0001, 0.001, 0.01, 0.05 and 1.0" - echo "-s minimum overlap size." - echo "-t threads" - echo "-h print this help" - echo " " - echo "##################################################" - 1>&2; exit 1; - -} - -while getopts "f:r:o:p:s:t:h" option; do - case $option in - f) INPUT_F="${OPTARG}" - ;; - r) INPUT_R="${OPTARG}" - ;; - o) OUTPUT="${OPTARG}" - ;; - p) PVALUE="${OPTARG}" - ;; - s) OVERLAP="${OPTARG}" - ;; - t) THREADS="${OPTARG}" - ;; - h | *) usage - exit 0 - ;; - \?) echo "Invalid option: -$OPTARG" - exit 1 - ;; - esac -done - -if [ -z "$INPUT_F" ] || [ -z "$INPUT_R" ] || [ -z "$OUTPUT" ] ; then - echo 'Missing argument' >&2 - exit 1 -fi - - pear -j ${THREADS} \ - -p ${PVALUE} \ - -v ${OVERLAP} \ - -q ${QUAL} \ - -u ${UNCALLED} \ - -f ${INPUT_F} \ - -r ${INPUT_R} \ - -o ${OUTPUT} - - From 3ec7277f6c6d1e387fbc7205df7947c89ce32a5e Mon Sep 17 00:00:00 2001 From: Marcele Laux Date: Tue, 21 Feb 2023 11:17:28 -0300 Subject: [PATCH 02/13] Updated executable type ./1_merge_pear.sh -h --- 1_merge/1_merge_pear.sh | 73 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 1_merge/1_merge_pear.sh diff --git a/1_merge/1_merge_pear.sh b/1_merge/1_merge_pear.sh new file mode 100644 index 0000000..ff88f4c --- /dev/null +++ b/1_merge/1_merge_pear.sh @@ -0,0 +1,73 @@ +#/bin/bash + +INPUT_F='' +INPUT_R='' +THREADS='1' +PVALUE='1' +OVERLAP='1' +OUTPUT='' +UNCALLED='0' +QUAL='20' + + +usage () { + echo "##################################################" + echo " " + echo "Merging of paired end fastq files from Illumina sequencing using Pear." + echo " " + echo "Usage: ${0} [-f R1.fq] [-r R2.fq] [-o output] [-p 0.001] [-s 20] [-t 4]" + echo "-f R1.fastq original file" + echo "-r R2.fastq original file" + echo "-o output name for the assembled fastq file" + echo "-p p-value: statistical test for true assembly. Lower p-value means less possibility of overlapping by chance. Options are: 0.0001, 0.001, 0.01, 0.05 and 1.0" + echo "-s minimum overlap size." + echo "-t threads" + echo "-h print this help" + echo " " + echo "##################################################" + 2>/dev/null; exit 1; + +} + +while getopts "f:r:o:p:s:t:h" option; do + case $option in + f) INPUT_F="${OPTARG}" + ;; + r) INPUT_R="${OPTARG}" + ;; + o) OUTPUT="${OPTARG}" + ;; + p) PVALUE="${OPTARG}" + ;; + s) OVERLAP="${OPTARG}" + ;; + t) THREADS="${OPTARG}" + ;; + h | *) usage + 2>/dev/null; exit 0 + ;; + \?) echo "Invalid option: -$OPTARG" + 2>/dev/null; exit 1 + ;; + esac +done + +if [ -z "$INPUT_F" ] || [ -z "$INPUT_R" ] || [ -z "$OUTPUT" ] ; then + echo 'Missing argument' 2>/dev/null + exit 1 +fi + +module --quiet purge +module load StdEnv +module load PEAR/0.9.11-GCCcore-9.3.0 + +pear -j ${THREADS} \ + -p ${PVALUE} \ + -v ${OVERLAP} \ + -q ${QUAL} \ + -u ${UNCALLED} \ + -f ${INPUT_F} \ + -r ${INPUT_R} \ + -o ${OUTPUT} + + From aab00b2da8f277d2b6d8f52e9d82633906a3db9c Mon Sep 17 00:00:00 2001 From: Marcele Laux Date: Tue, 21 Feb 2023 11:26:49 -0300 Subject: [PATCH 03/13] Update run_merge_pear.slurm --- 1_merge/run_merge_pear.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/1_merge/run_merge_pear.slurm b/1_merge/run_merge_pear.slurm index 10ae6b5..78ff266 100644 --- a/1_merge/run_merge_pear.slurm +++ b/1_merge/run_merge_pear.slurm @@ -18,6 +18,6 @@ set -o nounset ./merge_pear.sh #or -#./merge_pear_auto.sh -f R1.fq -r R2.fq -o output -p 0.001 -s 20 -t 8 +#./1_merge_pear.sh [-f R1.fq] [-r R2.fq] [-o output] [-p 0.001] [-s 20] [-t 8] exit 0 From e8d7307a147948cb271d0c58293692ea2473b8da Mon Sep 17 00:00:00 2001 From: Marcele Laux Date: Tue, 21 Feb 2023 11:28:07 -0300 Subject: [PATCH 04/13] Update README.merge --- 1_merge/README.merge | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/1_merge/README.merge b/1_merge/README.merge index ca48133..8aab94f 100644 --- a/1_merge/README.merge +++ b/1_merge/README.merge @@ -1,6 +1,8 @@ -./merge_pear_auto.sh -h - -Usage: ./merge_pear_auto.sh [-f R1.fq] [-r R2.fq] [-o output] [-p 0.001] [-s 20] [-t 4] +./1_merge_pear.sh -h +################################################## +Merging of paired end fastq files from Illumina sequencing using Pear. + +Usage: ./1_merge_pear.sh [-f R1.fq] [-r R2.fq] [-o output] [-p 0.001] [-s 20] [-t 4] -f R1.fastq original file -r R2.fastq original file -o output name for the assembled fastq file @@ -8,8 +10,8 @@ Usage: ./merge_pear_auto.sh [-f R1.fq] [-r R2.fq] [-o output] [-p 0.001] [-s 20] -s minimum overlap size. -t threads -h print this help - ################################################## + MERGING STEP -> PEAR IF YOU WOULD LIKE TO EDIT THE SCRIPT BY YOURSELF: 1. Open the merging_pear.sh and replace the name of your original fastq files R1 and R2 From ee2b0ec668e14079c8d6ffc407926137d4ebe52a Mon Sep 17 00:00:00 2001 From: Marcele Laux Date: Tue, 21 Feb 2023 11:32:02 -0300 Subject: [PATCH 05/13] updated executables --- 2_demulti/2a_preparing_tags_LCPI.pl | 252 +++++++++++++++++++++ 2_demulti/2b_demulti_dual_index_ionboth.sh | 45 ++++ 2_demulti/2b_demulti_dual_index_linked.sh | 45 ++++ 3 files changed, 342 insertions(+) create mode 100644 2_demulti/2a_preparing_tags_LCPI.pl create mode 100644 2_demulti/2b_demulti_dual_index_ionboth.sh create mode 100644 2_demulti/2b_demulti_dual_index_linked.sh diff --git a/2_demulti/2a_preparing_tags_LCPI.pl b/2_demulti/2a_preparing_tags_LCPI.pl new file mode 100644 index 0000000..1b06b8c --- /dev/null +++ b/2_demulti/2a_preparing_tags_LCPI.pl @@ -0,0 +1,252 @@ +#!/usr/bin/perl +use diagnostics; +use warnings; +use strict; +package ECHO_MODULE; +package main; + +my $file = $ARGV[0]; +my $subname = $ARGV[1]; +open (MYFILE, $file); +my @file = ; +close (MYFILE); + +if ($subname eq 'linked') { + &linked; +} +if ($subname eq 'combinatorial') { + &combinatorial; +} +if ($subname eq 'unique') { + &unique; +} +if ($subname eq 'ion') { + &ion; +} +if ($subname eq 'ion3') { + &ion3; +} +if ($subname eq 'ion5') { + &ion5; +} +elsif ($subname eq 'ionboth') { + &ionboth; +} + +sub linked +{ +open (NEW_FILE1, '>>Tags_LA1.fa'); +open (NEW_FILE2, '>>Tags_LA2.fa'); +open (NEW_FILE3, '>>Tags_LA3.fa'); +open (NEW_FILE4, '>>Tags_LA4.fa'); +my @new_file1=(); +my @new_file2=(); +my @new_file3=(); +my @new_file4=(); + foreach my $line (@file) { + chomp ($line); + $line =~ s/\R//g; + my @fields= split(/\t/, $line); + my $sample=$fields[0]; + my $tag_F=$fields[1]; + my $tag_R=$fields[2]; + my $RCtagF=reverse $tag_F; + $RCtagF =~ tr/ATGCatgc/TACGtacg/; + $RCtagF =~ s/[^\p{PosixAlnum},]//g; + my $RCtagR=reverse $tag_R; + $RCtagR =~ tr/ATGCatgc/TACGtacg/; + $RCtagR =~ s/[^\p{PosixAlnum},]//g; + push (@new_file1, (">$sample\n^$tag_F...$tag_R\$\n")); + push (@new_file2, (">$sample\n^$RCtagR...$RCtagF\$\n")); + push (@new_file3, (">$sample\n^$tag_F...$RCtagR\$\n")); + push (@new_file4, (">$sample\n^$tag_R...$RCtagF\$\n")); + } + print NEW_FILE1 @new_file1; + print NEW_FILE2 @new_file2; + print NEW_FILE3 @new_file3; + print NEW_FILE4 @new_file4; +} +sub combinatorial +{ +open (NEW_FILE1, '>>Tags_F.fa'); +open (NEW_FILE2, '>>Tags_R.fa'); +my @new_file1=(); +my @new_file2=(); + foreach my $line (@file) { + chomp ($line); + $line =~ s/\R//g; + my @fields= split(/\t/, $line); + my $sample=$fields[0]; + my $tag_F=$fields[1]; + my $tag_R=$fields[2]; + push (@new_file1, (">$sample\n$tag_F\n")); + push (@new_file2, (">$sample\n^$tag_R\n")); + } + print NEW_FILE1 @new_file1; + print NEW_FILE2 @new_file2; +} +sub unique +{ +open (NEW_FILE1, '>>Tags_R1.fa'); +open (NEW_FILE2, '>>Tags_R2.fa'); +open (NEW_FILE3, '>>Tags_R1_RC.fa'); +open (NEW_FILE4, '>>Tags_R2_RC.fa'); +my @new_file1=(); +my @new_file2=(); +my @new_file3=(); +my @new_file4=(); + foreach my $line (@file) { + chomp ($line); + my @fields= split(/\t/, $line); + my $sample=$fields[0]; + my $tag_F=$fields[1]; + my $tag_R=$fields[2]; + my $RCtagF=reverse $tag_F; + $RCtagF =~ tr/ATGCatgc/TACGtacg/; + $RCtagF =~ s/[^\p{PosixAlnum},]//g; + my $RCtagR=reverse $tag_R; + $RCtagR =~ tr/ATGCatgc/TACGtacg/; + $RCtagR =~ s/[^\p{PosixAlnum},]//g; + push (@new_file1, (">$sample\n$tag_F\n")); + push (@new_file2, (">$sample\n$tag_R\n")); + push (@new_file3, (">$sample\n$RCtagF\n")); + push (@new_file4, (">$sample\n$RCtagR\n")); + } + print NEW_FILE1 @new_file1; + print NEW_FILE2 @new_file2; + print NEW_FILE3 @new_file3; + print NEW_FILE4 @new_file4; +} +sub ion +{ +open (ALT1, '>>Tags_alt1.fa'); +open (ALT2, '>>Tags_alt2.fa'); +open (ALT3, '>>Tags_alt3.fa'); +open (ALT4, '>>Tags_alt4.fa'); +my @tags_alt_1=(); +my @tags_alt_2=(); +my @tags_alt_3=(); +my @tags_alt_4=(); + foreach my $line (@file) { + my @fields= split(/\t/, $line); + my $sample=$fields[0]; + my $tag_F=$fields[1]; + my $tag_R=$fields[2]; + $tag_F =~ s/[^\p{PosixAlnum},]//g; + $tag_R =~ s/[^\p{PosixAlnum},]//g; + my $RCtagF=reverse $tag_F; + $RCtagF =~ tr/ATGCatgc/TACGtacg/; + $RCtagF =~ s/[^\p{PosixAlnum},]//g; + my $RCtagR=reverse $tag_R; + $RCtagR =~ tr/ATGCatgc/TACGtacg/; + $RCtagR =~ s/[^\p{PosixAlnum},]//g; + push (@tags_alt_1, (">$sample\n$tag_F...$RCtagR\n")); + push (@tags_alt_2, (">$sample\n$RCtagF...$tag_R\n")); + push (@tags_alt_3, (">$sample\n$tag_R...$RCtagF\n")); + push (@tags_alt_4, (">$sample\n$RCtagR...$tag_F\n")); + } + print ALT1 @tags_alt_1; + print ALT2 @tags_alt_2; + print ALT3 @tags_alt_3; + print ALT4 @tags_alt_4; +} +sub ion3 +{ +open (ALT1, '>>Tags_alt1_3anch.fa'); +open (ALT2, '>>Tags_alt2_3anch.fa'); +open (ALT3, '>>Tags_alt3_3anch.fa'); +open (ALT4, '>>Tags_alt4_3anch.fa'); +my @tags_alt_1_3anch=(); +my @tags_alt_2_3anch=(); +my @tags_alt_3_3anch=(); +my @tags_alt_4_3anch=(); + foreach my $line (@file) { + my @fields= split(/\t/, $line); + my $sample=$fields[0]; + my $tag_F=$fields[1]; + my $tag_R=$fields[2]; + $tag_F =~ s/[^\p{PosixAlnum},]//g; + $tag_R =~ s/[^\p{PosixAlnum},]//g; + my $RCtagF=reverse $tag_F; + $RCtagF =~ tr/ATGCatgc/TACGtacg/; + $RCtagF =~ s/[^\p{PosixAlnum},]//g; + my $RCtagR=reverse $tag_R; + $RCtagR =~ tr/ATGCatgc/TACGtacg/; + $RCtagR =~ s/[^\p{PosixAlnum},]//g; + push (@tags_alt_1_3anch, (">$sample\n$tag_F...$RCtagR\$\n")); + push (@tags_alt_2_3anch, (">$sample\n$RCtagF...$tag_R\$\n")); + push (@tags_alt_3_3anch, (">$sample\n$tag_R...$RCtagF\$\n")); + push (@tags_alt_4_3anch, (">$sample\n$RCtagR...$tag_F\$\n")); + } + print ALT1 @tags_alt_1_3anch; + print ALT2 @tags_alt_2_3anch; + print ALT3 @tags_alt_3_3anch; + print ALT4 @tags_alt_4_3anch; +} +sub ion5 +{ +open (ALT1, '>>Tags_alt1_5anch.fa'); +open (ALT2, '>>Tags_alt2_5anch.fa'); +open (ALT3, '>>Tags_alt3_5anch.fa'); +open (ALT4, '>>Tags_alt4_5anch.fa'); +my @tags_alt_1_5anch=(); +my @tags_alt_2_5anch=(); +my @tags_alt_3_5anch=(); +my @tags_alt_4_5anch=(); + foreach my $line (@file) { + my @fields= split(/\t/, $line); + my $sample=$fields[0]; + my $tag_F=$fields[1]; + my $tag_R=$fields[2]; + $tag_F =~ s/[^\p{PosixAlnum},]//g; + $tag_R =~ s/[^\p{PosixAlnum},]//g; + my $RCtagF=reverse $tag_F; + $RCtagF =~ tr/ATGCatgc/TACGtacg/; + $RCtagF =~ s/[^\p{PosixAlnum},]//g; + my $RCtagR=reverse $tag_R; + $RCtagR =~ tr/ATGCatgc/TACGtacg/; + $RCtagR =~ s/[^\p{PosixAlnum},]//g; + push (@tags_alt_1_5anch, (">$sample\n^$tag_F...$RCtagR\n")); + push (@tags_alt_2_5anch, (">$sample\n^$RCtagF...$tag_R\n")); + push (@tags_alt_3_5anch, (">$sample\n^$tag_R...$RCtagF\n")); + push (@tags_alt_4_5anch, (">$sample\n^$RCtagR...$tag_F\n")); + } + print ALT1 @tags_alt_1_5anch; + print ALT2 @tags_alt_2_5anch; + print ALT3 @tags_alt_3_5anch; + print ALT4 @tags_alt_4_5anch; +} +sub ionboth +{ +open (ALT1, '>>Tags_alt1_bothanch.fa'); +open (ALT2, '>>Tags_alt2_bothanch.fa'); +open (ALT3, '>>Tags_alt3_bothanch.fa'); +open (ALT4, '>>Tags_alt4_bothanch.fa'); +my @tags_alt_1_bothanch=(); +my @tags_alt_2_bothanch=(); +my @tags_alt_3_bothanch=(); +my @tags_alt_4_bothanch=(); + foreach my $line (@file) { + my @fields= split(/\t/, $line); + my $sample=$fields[0]; + my $tag_F=$fields[1]; + my $tag_R=$fields[2]; + $tag_F =~ s/[^\p{PosixAlnum},]//g; + $tag_R =~ s/[^\p{PosixAlnum},]//g; + my $RCtagF=reverse $tag_F; + $RCtagF =~ tr/ATGCatgc/TACGtacg/; + $RCtagF =~ s/[^\p{PosixAlnum},]//g; + my $RCtagR=reverse $tag_R; + $RCtagR =~ tr/ATGCatgc/TACGtacg/; + $RCtagR =~ s/[^\p{PosixAlnum},]//g; + push (@tags_alt_1_bothanch, (">$sample\n^$tag_F...$RCtagR\$\n")); + push (@tags_alt_2_bothanch, (">$sample\n^$RCtagF...$tag_R\$\n")); + push (@tags_alt_3_bothanch, (">$sample\n^$tag_R...$RCtagF\$\n")); + push (@tags_alt_4_bothanch, (">$sample\n^$RCtagR...$tag_F\$\n")); + } + print ALT1 @tags_alt_1_bothanch; + print ALT2 @tags_alt_2_bothanch; + print ALT3 @tags_alt_3_bothanch; + print ALT4 @tags_alt_4_bothanch; +} + diff --git a/2_demulti/2b_demulti_dual_index_ionboth.sh b/2_demulti/2b_demulti_dual_index_ionboth.sh new file mode 100644 index 0000000..1d2d76d --- /dev/null +++ b/2_demulti/2b_demulti_dual_index_ionboth.sh @@ -0,0 +1,45 @@ +#/bin/bash +##RUN preparing_tags_LCPI.pl to format your barcodes files. +#input mapping file format: +#Sample1 tagF tagR +#Sample2 ACCTGAAT ATACAGA +####tab delimited! +#check this mapping file for duplicates in excel before sending to cluster +#write sample names without space, e.g sample 23 as sample_23 or sample23. +####DO NOT USE NUMBERS in the beginning of your sample names +#perl preparing_tags_LCPI.pl + #my_mapping_file.txt + #linked +#the perl script should create 3 barcode files, Barcodes_LA1.txt, Barcodes_LA2.txt, Barcodes_LA3.txt for 'linked' +#the linked mode is 5' and 3' anchored + +#ANY CUTADAPT ISSUE OR DOUBTS, SEE: https://cutadapt.readthedocs.io/en/stable/guide.html + +module --quiet purge +module load StdEnv +module load cutadapt/2.10-GCCcore-9.3.0-Python-3.8.2 + +INPUT="${1}" +ERR="${2}" +PAIR1="Tags_alt1_bothanch.fa" +PAIR2="Tags_alt1_bothanch.fa" +PAIR3="Tags_alt1_bothanch.fa" +PAIR4="Tags_alt1_bothanch.fa" + +### demultiplex (Linked Adapter) + +cutadapt \ + --quiet \ + -a file:${PAIR1} \ + -a file:${PAIR2} \ + -a file:${PAIR3} \ + -a file:${PAIR4} \ + -o "{name}_Ion_LA.fq" \ + -e ${ERR} \ + --action=lowercase \ + ${INPUT} + +mkdir demulti_ionboth_${ERR}err +mv *.fq demulti_ionboth_${ERR}err +./count_fastq_sequences.sh demulti_ionboth_${ERR}err/*.fq > demulti_ionboth_${ERR}err_count.txt + diff --git a/2_demulti/2b_demulti_dual_index_linked.sh b/2_demulti/2b_demulti_dual_index_linked.sh new file mode 100644 index 0000000..87cd9fb --- /dev/null +++ b/2_demulti/2b_demulti_dual_index_linked.sh @@ -0,0 +1,45 @@ +#/bin/bash +##RUN preparing_tags_LCPI.pl to format your barcodes files. +#input mapping file format: +#Sample1 tagF tagR +#Sample2 ACCTGAAT ATACAGA +####tab delimited! +#check this mapping file for duplicates in excel before sending to cluster +#write sample names without space, e.g sample 23 as sample_23 or sample23. +####DO NOT USE NUMBERS in the beginning of your sample names +#perl preparing_tags_LCPI.pl + #my_mapping_file.txt + #linked +#the perl script should create 3 barcode files, Barcodes_LA1.txt, Barcodes_LA2.txt, Barcodes_LA3.txt for 'linked' +#the linked mode is 5' and 3' anchored + +#ANY CUTADAPT ISSUE OR DOUBTS, SEE: https://cutadapt.readthedocs.io/en/stable/guide.html + +module --quiet purge +module load StdEnv +module load cutadapt/2.10-GCCcore-9.3.0-Python-3.8.2 + +INPUT="${1}" +ERR="${2}" +PAIR1="Tags_LA1.fa" +PAIR2="Tags_LA2.fa" +PAIR3="Tags_LA3.fa" +PAIR4="Tags_LA4.fa" + +### demultiplex (Linked Adapter) + +cutadapt \ + --quiet \ + -a file:${PAIR1} \ + -a file:${PAIR2} \ + -a file:${PAIR3} \ + -a file:${PAIR4} \ + -o "{name}_LA.fq" \ + -e ${ERR} \ + --action=lowercase \ + ${INPUT} + +mkdir demulti_linked_${ERR}err +mv *.fq demulti_linked_${ERR}err +./count_fastq_sequences.sh demulti_linked_${ERR}err/*.fq > demulti_linked_${ERR}err_count.txt + From 2ef1081f90e28d8172a14c95b5e3e867662b92c4 Mon Sep 17 00:00:00 2001 From: Marcele Laux Date: Tue, 21 Feb 2023 11:33:25 -0300 Subject: [PATCH 06/13] Delete demulti_dual_index_linked.sh --- 2_demulti/demulti_dual_index_linked.sh | 40 -------------------------- 1 file changed, 40 deletions(-) delete mode 100644 2_demulti/demulti_dual_index_linked.sh diff --git a/2_demulti/demulti_dual_index_linked.sh b/2_demulti/demulti_dual_index_linked.sh deleted file mode 100644 index 5683a3a..0000000 --- a/2_demulti/demulti_dual_index_linked.sh +++ /dev/null @@ -1,40 +0,0 @@ -#/bin/bash -##FOR DUAL INDEX DESIGN, YOU MUST RUN preparing_tags_LCPI.pl to format your barcodes files. -#input mapping file format: -#Sample1 tagF tagR -#Sample2 ACCTGAAT ATACAGA -#tab delimited! -#check this mapping file for duplicates in excel before sending to cluster -#write sample names without space, e.g SamPLe 23 is not allowed, but SamPLe_23 is. -#do not use numbers in the beginning of your sample names -#perl preparing_tags_LCPI.pl - #my_mapping_file.txt - #linked -#the perl script should create 3 barcode files, Barcodes_LA1.txt, Barcodes_LA2.txt, Barcodes_LA3.txt for 'linked' -#the linked mode is 5' and 2' anchored by default - -#ANY CUTADAPT ISSUE OR DOUBTS, SEE: https://cutadapt.readthedocs.io/en/stable/guide.html - - -module --quiet purge -module load StdEnv -module load cutadapt/2.10-GCCcore-9.3.0-Python-3.8.2 - - -INPUT="my_training_set.assembled.fastq" -PAIR1="Barcodes_LA1.fa" -PAIR2="Barcodes_LA2.fa" -PAIR3="Barcodes_LA3.fa" - -### demultiplex (Linked Adapter) - -cutadapt \ - -a file:${PAIR1} \ - -a file:${PAIR2} \ - -a file:${PAIR3} \ - -o "{name}_LA.fq" \ - --action=lowercase \ - ${INPUT} - -mkdir demulti_linked -mv *.fq demulti_linked From ae4bdda3842202f63ad666bc68fdcb0b0a9cd67c Mon Sep 17 00:00:00 2001 From: Marcele Laux Date: Tue, 21 Feb 2023 11:33:47 -0300 Subject: [PATCH 07/13] Delete demulti_dual_index_ion.sh --- 2_demulti/demulti_dual_index_ion.sh | 34 ----------------------------- 1 file changed, 34 deletions(-) delete mode 100644 2_demulti/demulti_dual_index_ion.sh diff --git a/2_demulti/demulti_dual_index_ion.sh b/2_demulti/demulti_dual_index_ion.sh deleted file mode 100644 index 1b79661..0000000 --- a/2_demulti/demulti_dual_index_ion.sh +++ /dev/null @@ -1,34 +0,0 @@ -#/bin/bash - -##FOR ION DUAL INDEX DESIGN, YOU MUST RUN preparing_tags_LCPI.pl to format your barcodes files. -#input mapping file format tab delimited: -#Sample1 tagF tagR -#Sample2 ACCTGAAT ATACAGA -#check this mapping file for duplicates in excel before sending to cluster -#write sample names without space, e.g SamPLe 23 is not allowed, but SamPLe_23 is. -#do not use numbers in the beginning of your sample names -#perl preparing_tags_LCPI.pl - #my_mapping_file.txt - #ion -#the perl script should create 4 files Barcodes_alt1.fa to Barcodes_alt4.fa - - -INPUT="my_ion_dual_index_single_read.fastq" -PAIR1="Barcodes_alt1.fa" -PAIR2="Barcodes_alt2.fa" -PAIR3="Barcodes_alt3.fa" -PAIR4="Barcodes_alt4.fa" -### demultiplex (Linked Adapter) - -cutadapt \ - -a file:${PAIR1} \ - -a file:${PAIR2} \ - -a file:${PAIR3} \ - -a file:${PAIR4} \ - -o "{name}_ion_LA.fq" \ - --action=lowercase \ - ${INPUT} - - -mkdir demulti_iondual -mv *.fq demulti_iondual From 8633daa60b30fce6439c698db2ee90bce52fef04 Mon Sep 17 00:00:00 2001 From: Marcele Laux Date: Tue, 21 Feb 2023 11:34:05 -0300 Subject: [PATCH 08/13] Delete demulti_unique_dual_index.sh --- 2_demulti/demulti_unique_dual_index.sh | 36 -------------------------- 1 file changed, 36 deletions(-) delete mode 100644 2_demulti/demulti_unique_dual_index.sh diff --git a/2_demulti/demulti_unique_dual_index.sh b/2_demulti/demulti_unique_dual_index.sh deleted file mode 100644 index 30b456d..0000000 --- a/2_demulti/demulti_unique_dual_index.sh +++ /dev/null @@ -1,36 +0,0 @@ -#/bin/bash - -###DEMULTIPLEXING ILLUMINA DUAL INDEXED EXACT PAIRS -###THIS STRATEGY IS EXCLUSIVE FOR PRIMER DESIGN UNIQUE, NO INDEX IS SHARED, ALL SAMPLES INDEPENDENT -##The preparing_tags_LCPI.pl only format your barcodes files as two forward fasta files and two reverse complement fasta files: -#input mapping file format tab delimited: -#Sample1 tagF tagR -#Sample2 ACCTGAAT ATACAGA -#output: -#Barcode_R1.fa Barcode_R2.fa Barcode_R1_RC.fa Barcode_R2_RC.fa -#>Sample1 >Sample1 >Sample1 >Sample1 -#ACCTGAAT ACCTACAG ATTCAGGT CTGTAGGT - -#check the mapping file for duplicates in excel before sending to server -#write sample names without space, e.g SamPLe 23 is not allowed, but SamPLe_23 is. -#do not use numbers in the beginning of your sample names -#perl preparing_tags_LCPI.pl - #my_mapping_file.txt - #unique -#ANY CUTADAPT ISSUE OR DOUBTS, SEE: -#https://cutadapt.readthedocs.io/en/stable/guide.html -#https://support.illumina.com/bulletins/2018/08/understanding-unique-dual-indexes--udi--and-associated-library-p.html - -INPUT_F="My_project_my_marker_R1.fastq" -INPUT_R="My_project_my_marker_R2.fastq" -R1_TAGS="Barcode_R1.fa" -R2_TAGS="Barcode_R2.fa" -R1_TAGS_RC="Barcode_R1_RC.fa" -R2_TAGS_RC="Barcode_R2_RC.fa" - -cutadapt --pair-adapters -a file:${R1_TAGS} -a file:${R2_TAGS_RC} -G file:${R2_TAGS} -G file:${R1_TAGS_RC} -o {name}.1.fq -p {name}.2.fq ${INPUT_R1} ${INPUT_R2} --info-file matches.txt - - -mkdir demulti_samples -mv *.fq demulti_samples - From 1c3f479cc199c392060ec79a57c8dced74651aa3 Mon Sep 17 00:00:00 2001 From: Marcele Laux Date: Tue, 21 Feb 2023 11:34:16 -0300 Subject: [PATCH 09/13] Delete preparing_tags_LCPI.pl --- 2_demulti/preparing_tags_LCPI.pl | 253 ------------------------------- 1 file changed, 253 deletions(-) delete mode 100644 2_demulti/preparing_tags_LCPI.pl diff --git a/2_demulti/preparing_tags_LCPI.pl b/2_demulti/preparing_tags_LCPI.pl deleted file mode 100644 index 2b7236a..0000000 --- a/2_demulti/preparing_tags_LCPI.pl +++ /dev/null @@ -1,253 +0,0 @@ -#!/usr/bin/perl -use diagnostics; -use warnings; -use strict; -package ECHO_MODULE; -package main; - -print "#####DEMULTIPLEXING DUAL INDEXED LIBRARIES#####\n"; -print "EXPECTED INPUT:\nsample1\ttagF\ttagR\nsample2\ttagF\ttagR\n...\t#same as in your excel file\n##DO NOT INCLUDE PRIMERS NOW\n\n"; -print "Please, enter your mapping file to edit:\t"; -my $arq1 = ; -chomp $arq1; -open (MYFILE, $arq1); -my @file = ; -close (MYFILE); - -print ">>>For Illumina merged reads type 'linked'\n>>>For Illumina combinatorial type 'combinatorial'\n>>>For Illumina exact paired dual index type 'unique'\n>>>For Ion torrent dual index type 'ion'\n>>>For Ion dual index 3' anchored 'ion3'\n>>>For Ion dual index 5' anchored 'ion5'\n>>>For Ion dual index both anchored 'ion-both'\t"; -chop (my $subname = ); -if ($subname eq 'linked') { - &linked; -} -if ($subname eq 'combinatorial') { - &combinatorial; -} -if ($subname eq 'unique') { - &unique; -} -if ($subname eq 'ion') { - &ion; -} -if ($subname eq 'ion3') { - &ion3; -} -if ($subname eq 'ion5') { - &ion5; -} -elsif ($subname eq 'ion-both') { - &ionboth; -} - -sub linked -{ -open (NEW_FILE1, '>>Barcodes_LA1.fa'); -open (NEW_FILE2, '>>Barcodes_LA2.fa'); -open (NEW_FILE3, '>>Barcodes_LA3.fa'); -my @new_file1=(); -my @new_file2=(); -my @new_file3=(); - foreach my $line (@file) { - chomp ($line); - $line =~ s/\R//g; - my @fields= split(/\t/, $line); - my $sample=$fields[0]; - my $tag_F=$fields[1]; - my $tag_R=$fields[2]; - my $RCtagF=reverse $tag_F; - $RCtagF =~ tr/ATGCatgc/TACGtacg/; - $RCtagF =~ s/[^\p{PosixAlnum},]//g; - my $RCtagR=reverse $tag_R; - $RCtagR =~ tr/ATGCatgc/TACGtacg/; - $RCtagR =~ s/[^\p{PosixAlnum},]//g; - push (@new_file1, (">$sample\n^$tag_F...$tag_R\$\n")); - push (@new_file2, (">$sample\n^$RCtagR...$RCtagF\$\n")); - push (@new_file3, (">$sample\n^$tag_F...$RCtagR\$\n")); - } - print NEW_FILE1 @new_file1; - print NEW_FILE2 @new_file2; - print NEW_FILE3 @new_file3; -} -sub combinatorial -{ -open (NEW_FILE1, '>>Barcodes_F.fa'); -open (NEW_FILE2, '>>Barcodes_R.fa'); -my @new_file1=(); -my @new_file2=(); - foreach my $line (@file) { - chomp ($line); - $line =~ s/\R//g; - my @fields= split(/\t/, $line); - my $sample=$fields[0]; - my $tag_F=$fields[1]; - my $tag_R=$fields[2]; - push (@new_file1, (">$sample\n$tag_F\n")); - push (@new_file2, (">$sample\n^$tag_R\n")); - } - print NEW_FILE1 @new_file1; - print NEW_FILE2 @new_file2; -} -sub unique -{ -open (NEW_FILE1, '>>Barcode_R1.fa'); -open (NEW_FILE2, '>>Barcode_R2.fa'); -open (NEW_FILE3, '>>Barcode_R1_RC.fa'); -open (NEW_FILE4, '>>Barcode_R2_RC.fa'); -my @new_file1=(); -my @new_file2=(); -my @new_file3=(); -my @new_file4=(); - foreach my $line (@file) { - chomp ($line); - my @fields= split(/\t/, $line); - my $sample=$fields[0]; - my $tag_F=$fields[1]; - my $tag_R=$fields[2]; - my $RCtagF=reverse $tag_F; - $RCtagF =~ tr/ATGCatgc/TACGtacg/; - $RCtagF =~ s/[^\p{PosixAlnum},]//g; - my $RCtagR=reverse $tag_R; - $RCtagR =~ tr/ATGCatgc/TACGtacg/; - $RCtagR =~ s/[^\p{PosixAlnum},]//g; - push (@new_file1, (">$sample\n$tag_F\n")); - push (@new_file2, (">$sample\n$tag_R\n")); - push (@new_file3, (">$sample\n$RCtagF\n")); - push (@new_file4, (">$sample\n$RCtagR\n")); - } - print NEW_FILE1 @new_file1; - print NEW_FILE2 @new_file2; - print NEW_FILE3 @new_file3; - print NEW_FILE4 @new_file4; -} -sub ion -{ -open (ALT1, '>>Barcodes_alt1.fa'); -open (ALT2, '>>Barcodes_alt2.fa'); -open (ALT3, '>>Barcodes_alt3.fa'); -open (ALT4, '>>Barcodes_alt4.fa'); -my @tags_alt_1=(); -my @tags_alt_2=(); -my @tags_alt_3=(); -my @tags_alt_4=(); - foreach my $line (@file) { - my @fields= split(/\t/, $line); - my $sample=$fields[0]; - my $tag_F=$fields[1]; - my $tag_R=$fields[2]; - $tag_F =~ s/[^\p{PosixAlnum},]//g; - $tag_R =~ s/[^\p{PosixAlnum},]//g; - my $RCtagF=reverse $tag_F; - $RCtagF =~ tr/ATGCatgc/TACGtacg/; - $RCtagF =~ s/[^\p{PosixAlnum},]//g; - my $RCtagR=reverse $tag_R; - $RCtagR =~ tr/ATGCatgc/TACGtacg/; - $RCtagR =~ s/[^\p{PosixAlnum},]//g; - push (@tags_alt_1, (">$sample\n$tag_F...$RCtagR\n")); - push (@tags_alt_2, (">$sample\n$RCtagF...$tag_R\n")); - push (@tags_alt_3, (">$sample\n$tag_R...$RCtagF\n")); - push (@tags_alt_4, (">$sample\n$RCtagR...$tag_F\n")); - } - print ALT1 @tags_alt_1; - print ALT2 @tags_alt_2; - print ALT3 @tags_alt_3; - print ALT4 @tags_alt_4; -} -sub ion3 -{ -open (ALT1, '>>Barcodes_alt1_3anch.fa'); -open (ALT2, '>>Barcodes_alt2_3anch.fa'); -open (ALT3, '>>Barcodes_alt3_3anch.fa'); -open (ALT4, '>>Barcodes_alt4_3anch.fa'); -my @tags_alt_1_3anch=(); -my @tags_alt_2_3anch=(); -my @tags_alt_3_3anch=(); -my @tags_alt_4_3anch=(); - foreach my $line (@file) { - my @fields= split(/\t/, $line); - my $sample=$fields[0]; - my $tag_F=$fields[1]; - my $tag_R=$fields[2]; - $tag_F =~ s/[^\p{PosixAlnum},]//g; - $tag_R =~ s/[^\p{PosixAlnum},]//g; - my $RCtagF=reverse $tag_F; - $RCtagF =~ tr/ATGCatgc/TACGtacg/; - $RCtagF =~ s/[^\p{PosixAlnum},]//g; - my $RCtagR=reverse $tag_R; - $RCtagR =~ tr/ATGCatgc/TACGtacg/; - $RCtagR =~ s/[^\p{PosixAlnum},]//g; - push (@tags_alt_1_3anch, (">$sample\n$tag_F...$RCtagR\$\n")); - push (@tags_alt_2_3anch, (">$sample\n$RCtagF...$tag_R\$\n")); - push (@tags_alt_3_3anch, (">$sample\n$tag_R...$RCtagF\$\n")); - push (@tags_alt_4_3anch, (">$sample\n$RCtagR...$tag_F\$\n")); - } - print ALT1 @tags_alt_1_3anch; - print ALT2 @tags_alt_2_3anch; - print ALT3 @tags_alt_3_3anch; - print ALT4 @tags_alt_4_3anch; -} -sub ion5 -{ -open (ALT1, '>>Barcodes_alt1_5anch.fa'); -open (ALT2, '>>Barcodes_alt2_5anch.fa'); -open (ALT3, '>>Barcodes_alt3_5anch.fa'); -open (ALT4, '>>Barcodes_alt4_5anch.fa'); -my @tags_alt_1_5anch=(); -my @tags_alt_2_5anch=(); -my @tags_alt_3_5anch=(); -my @tags_alt_4_5anch=(); - foreach my $line (@file) { - my @fields= split(/\t/, $line); - my $sample=$fields[0]; - my $tag_F=$fields[1]; - my $tag_R=$fields[2]; - $tag_F =~ s/[^\p{PosixAlnum},]//g; - $tag_R =~ s/[^\p{PosixAlnum},]//g; - my $RCtagF=reverse $tag_F; - $RCtagF =~ tr/ATGCatgc/TACGtacg/; - $RCtagF =~ s/[^\p{PosixAlnum},]//g; - my $RCtagR=reverse $tag_R; - $RCtagR =~ tr/ATGCatgc/TACGtacg/; - $RCtagR =~ s/[^\p{PosixAlnum},]//g; - push (@tags_alt_1_5anch, (">$sample\n^$tag_F...$RCtagR\n")); - push (@tags_alt_2_5anch, (">$sample\n^$RCtagF...$tag_R\n")); - push (@tags_alt_3_5anch, (">$sample\n^$tag_R...$RCtagF\n")); - push (@tags_alt_4_5anch, (">$sample\n^$RCtagR...$tag_F\n")); - } - print ALT1 @tags_alt_1_5anch; - print ALT2 @tags_alt_2_5anch; - print ALT3 @tags_alt_3_5anch; - print ALT4 @tags_alt_4_5anch; -} -sub ionboth -{ -open (ALT1, '>>Barcodes_alt1_bothanch.fa'); -open (ALT2, '>>Barcodes_alt2_bothanch.fa'); -open (ALT3, '>>Barcodes_alt3_bothanch.fa'); -open (ALT4, '>>Barcodes_alt4_bothanch.fa'); -my @tags_alt_1_bothanch=(); -my @tags_alt_2_bothanch=(); -my @tags_alt_3_bothanch=(); -my @tags_alt_4_bothanch=(); - foreach my $line (@file) { - my @fields= split(/\t/, $line); - my $sample=$fields[0]; - my $tag_F=$fields[1]; - my $tag_R=$fields[2]; - $tag_F =~ s/[^\p{PosixAlnum},]//g; - $tag_R =~ s/[^\p{PosixAlnum},]//g; - my $RCtagF=reverse $tag_F; - $RCtagF =~ tr/ATGCatgc/TACGtacg/; - $RCtagF =~ s/[^\p{PosixAlnum},]//g; - my $RCtagR=reverse $tag_R; - $RCtagR =~ tr/ATGCatgc/TACGtacg/; - $RCtagR =~ s/[^\p{PosixAlnum},]//g; - push (@tags_alt_1_bothanch, (">$sample\n^$tag_F...$RCtagR\$\n")); - push (@tags_alt_2_bothanch, (">$sample\n^$RCtagF...$tag_R\$\n")); - push (@tags_alt_3_bothanch, (">$sample\n^$tag_R...$RCtagF\$\n")); - push (@tags_alt_4_bothanch, (">$sample\n^$RCtagR...$tag_F\$\n")); - } - print ALT1 @tags_alt_1_bothanch; - print ALT2 @tags_alt_2_bothanch; - print ALT3 @tags_alt_3_bothanch; - print ALT4 @tags_alt_4_bothanch; -} - From 96c846d9d32cdda99b45f656b0f3320965b9f752 Mon Sep 17 00:00:00 2001 From: Marcele Laux Date: Tue, 21 Feb 2023 11:46:58 -0300 Subject: [PATCH 10/13] Update README.demulti --- 2_demulti/README.demulti | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/2_demulti/README.demulti b/2_demulti/README.demulti index d859a3c..746340e 100644 --- a/2_demulti/README.demulti +++ b/2_demulti/README.demulti @@ -1,3 +1,9 @@ +######################################################################### +> perl 2a_preparing_tags_LCPI.pl my_mapping_file.txt linked +Output: +Tags_LA1.fa Tags_LA2.fa Tags_LA3.fa Tags_LA4.fa +> ./2b_demulti_dual_index_linked.sh my_data.assembled.fastq +######################################################################### If you are not familiar with this step, please read all the orientation: For all demultiplexing strategies based on cutadapt the mapping file must be a tab separated text file like this: @@ -6,21 +12,8 @@ sample1 AGGTACGCAATT CCTAAACTACGG sample2 ACAGCCACCCAT CCTAAACTACGG sample3 TGTCTCGCAAGC CCTAAACTACGG -then we format it according to your tag primer design using the perl script preparing_tags_LCPI.pl. The dominant tag orientation for Illumina dual index merged pairs: (3'tagF)...(5'RCtagR) -perl preparing_tags_LCPI.pl -#####DEMULTIPLEXING DUAL INDEXED LIBRARIES##### -EXPECTED INPUT: -sample1 tagF tagR -sample2 tagF tagR -... #same as in your excel file -##DO NOT INCLUDE PRIMERS NOW - -Please, enter your mapping file to edit: my_mapping_file.txt ->>>For Illumina merged reads type 'linked' ->>>For Illumina combinatorial type 'combinatorial' ->>>For Illumina exact paired dual index type 'unique' ->>>For Ion torrent dual index type 'ion' ->>> type your mode +then we format it according to your tag primer design using the perl script preparing_tags_LCPI.pl. +The dominant tag orientation for Illumina dual index merged pairs: (3'tagF)...(5'RCtagR) for merged R1-R2 Illumina files in linked mode, the tags ar formatted like this: head Barcodes_LA.fa From 8c90d73322e9d6bf3e23b306b7a322b29e965852 Mon Sep 17 00:00:00 2001 From: Marcele Laux Date: Tue, 21 Feb 2023 11:48:32 -0300 Subject: [PATCH 11/13] Update README.merge --- 1_merge/README.merge | 2 ++ 1 file changed, 2 insertions(+) diff --git a/1_merge/README.merge b/1_merge/README.merge index 8aab94f..b94c3f0 100644 --- a/1_merge/README.merge +++ b/1_merge/README.merge @@ -11,6 +11,8 @@ Usage: ./1_merge_pear.sh [-f R1.fq] [-r R2.fq] [-o output] [-p 0.001] [-s 20] [- -t threads -h print this help ################################################## +Output: +my_data.assembled.fastq MERGING STEP -> PEAR IF YOU WOULD LIKE TO EDIT THE SCRIPT BY YOURSELF: From aaac341c9c3a8aa9adba99053f83b0f1bd87ed45 Mon Sep 17 00:00:00 2001 From: Marcele Laux Date: Tue, 21 Feb 2023 11:50:36 -0300 Subject: [PATCH 12/13] updated executable ./3_tag_primer_clipping_clean_derep.sh -h --- .../3_tag_primer_clipping_clean_derep.sh | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 3_read_cleaning/3_tag_primer_clipping_clean_derep.sh diff --git a/3_read_cleaning/3_tag_primer_clipping_clean_derep.sh b/3_read_cleaning/3_tag_primer_clipping_clean_derep.sh new file mode 100644 index 0000000..2798bb0 --- /dev/null +++ b/3_read_cleaning/3_tag_primer_clipping_clean_derep.sh @@ -0,0 +1,117 @@ +#/bin/bash + +TAGS='' +PRIMER_F='' +PRIMER_R='' +PREFIX='' +DIR='' +PRIMER_F_RC='' +PRIMER_R_RC='' +MIN_F=$(( ${#PRIMER_F} * 2 / 3 )) +MIN_R=$(( ${#PRIMER_R} * 2 / 3 )) +MIN_LEN='' +EE='' + +usage () { + echo "##################################################" + echo "TAGS AND PRIMERS CLIPPING" + echo "##################################################" + echo "get your primer's reverse complement in this website:" + echo "http://arep.med.harvard.edu/labgc/adnan/projects/Utilities/revcomp.html" + echo " " + echo "Usage: ${0} [-b tags file] [-F 5' primer 'Forward'] [-R 3' primer 'Reverse'] [-f revcom 5' primer] [-r revcom 3' primer] [-e 0.1] [-p prefix] [-l amplicon min length] [-d path to demultiplexed samples]" + echo "-b barcodes/tags mapping file (used in demultiplexing step)" + echo "-F 5' primer (forward direction, like in your primers design file)" + echo "-R 3' primer (forward direction)" + echo "-f 5' primer reverse complement" + echo "-r 3' primer reverse complement" + echo "-e maximum error rate (the same used for demultiplexing)" + echo "Error rate: allowed errors/adapter or primer length. 2 errors in a 12 bp tag = 2/12 = 0.17" + echo "4 errors in a 26bp degenerated primer = 4/26 = 0.16, 2 errors in a 20bp primer = 0.1 (Cutadapt default)." + echo "-p PREFIX to quality file (it MUST be the SAME prefix throughout the ENTIRE PIPELINE)" + echo "-l minimum length for the final amplicon" + echo "-d path to demultiplexed samples directory. Use pwd." + echo "-h print this help" + echo " " + echo "##################################################" + 1>&2; exit 1; + +} +while getopts "b:F:R:f:r:e:p:d:l:h" option; do + case $option in + b) TAGS="${OPTARG}" + ;; + F) PRIMER_F="${OPTARG}" + ;; + R) PRIMER_R="${OPTARG}" + ;; + f) PRIMER_F_RC="${OPTARG}" + ;; + r) PRIMER_R_RC="${OPTARG}" + ;; + e) EE="${OPTARG}" + ;; + p) PREFIX="${OPTARG}" + ;; + d) DIR="${OPTARG}" + ;; + l) MIN_LEN="${OPTARG}" + ;; + h | *) usage + exit 0 + ;; + \?) echo "Invalid option: -$OPTARG" + exit 1 + ;; + : ) echo "Missing argument for -${OPTARG}" >&2 + exit 1 + ;; + esac +done + +module --force purge +module load StdEnv +module load cutadapt/2.10-GCCcore-9.3.0-Python-3.8.2 + +mkdir -p clip_out +mkdir -p clip_out/tag_clip +for file in "${DIR}"/*_LA.fq; do LA="$(basename $file _LA.fq)"; cutadapt -j 8 --quiet -e $EE -a file:Tags_LA1.fa -a file:Tags_LA2.fa -a file:Tags_LA3.fa --action=trim --trim-n --max-n 0 --minimum-length $MIN_LEN -o ${LA}_trim1.fq $file; done; + +for f in *_trim1.fq; do trim1="$(basename $f _trim1.fq)"; cutadapt -j 8 --quiet -e $EE -a "${PRIMER_F}...${PRIMER_R_RC}" -O $MIN_F -a "${PRIMER_R}...${PRIMER_F_RC}" -O $MIN_R --action=trim --minimum-length $MIN_LEN -o ${trim1}_trim2.fq $f; done; +mv *_trim1.fq clip_out/tag_clip + +module --force purge +module load StdEnv +module load VSEARCH/2.13.4-iccifort-2019.1.144-GCC-8.2.0-2.31.1 + +mkdir -p clip_out/primer_clip +# Discard sequences containing Ns, add expected error rates and convert to fasta +for j in *_trim2.fq; do trim2="$(basename $j _trim2.fq)"; vsearch --quiet --fastq_filter $j --relabel_sha1 --fastq_qmax 45 --eeout --fastq_maxns 0 --fastqout ${trim2}_trim3.fq; done; +mv *_trim2.fq clip_out/primer_clip + +mkdir -p clip_out/trimming +for k in *_trim3.fq; do trim3a="$(basename $k _trim3.fq)"; vsearch --quiet --fastq_filter $k --xee --fastaout ${trim3a}_trim3.fasta; done; + +#quality file +# Discard quality lines, extract hash, expected error rates and read length +QUALITY_FILE=${PREFIX}.qual +TMP_QUAL=$(mktemp --tmpdir=".") +for q in *_trim3.fq; do \ + sed 'n;n;N;d' $q | \ + awk 'BEGIN {FS = "[;=]"} + {if (/^@/) {printf "%s\t%s\t", $1, $3} else {print length($1)}}' | \ + tr -d "@" >> "${TMP_QUAL}" +done < "${TAGS}" +# Produce the final quality file +sort -k3,3n -k1,1d -k2,2n $TMP_QUAL | \ + uniq --check-chars=40 > "${QUALITY_FILE}" +rm $TMP_QUAL +mv *_trim3.fq clip_out/trimming + +mkdir dereplicated +for p in *_trim3.fasta; do trim3b="$(basename $p _trim3.fasta)"; vsearch --quiet --derep_fulllength $p --sizeout --fasta_width 0 --output ${trim3b}_dp.fasta; done; +rm *_trim3.fasta +mv *_dp.fasta dereplicated + + + From 9f6a30f0d7e045ad6737d34b375650dd5bc25965 Mon Sep 17 00:00:00 2001 From: Marcele Laux Date: Tue, 21 Feb 2023 11:51:19 -0300 Subject: [PATCH 13/13] updated executable ./3_tag_primer_clipping_clean_derep.sh -h --- .../3_tag_primer_clipping_clean_derep.sh | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 4_dereplicate/3_tag_primer_clipping_clean_derep.sh diff --git a/4_dereplicate/3_tag_primer_clipping_clean_derep.sh b/4_dereplicate/3_tag_primer_clipping_clean_derep.sh new file mode 100644 index 0000000..2798bb0 --- /dev/null +++ b/4_dereplicate/3_tag_primer_clipping_clean_derep.sh @@ -0,0 +1,117 @@ +#/bin/bash + +TAGS='' +PRIMER_F='' +PRIMER_R='' +PREFIX='' +DIR='' +PRIMER_F_RC='' +PRIMER_R_RC='' +MIN_F=$(( ${#PRIMER_F} * 2 / 3 )) +MIN_R=$(( ${#PRIMER_R} * 2 / 3 )) +MIN_LEN='' +EE='' + +usage () { + echo "##################################################" + echo "TAGS AND PRIMERS CLIPPING" + echo "##################################################" + echo "get your primer's reverse complement in this website:" + echo "http://arep.med.harvard.edu/labgc/adnan/projects/Utilities/revcomp.html" + echo " " + echo "Usage: ${0} [-b tags file] [-F 5' primer 'Forward'] [-R 3' primer 'Reverse'] [-f revcom 5' primer] [-r revcom 3' primer] [-e 0.1] [-p prefix] [-l amplicon min length] [-d path to demultiplexed samples]" + echo "-b barcodes/tags mapping file (used in demultiplexing step)" + echo "-F 5' primer (forward direction, like in your primers design file)" + echo "-R 3' primer (forward direction)" + echo "-f 5' primer reverse complement" + echo "-r 3' primer reverse complement" + echo "-e maximum error rate (the same used for demultiplexing)" + echo "Error rate: allowed errors/adapter or primer length. 2 errors in a 12 bp tag = 2/12 = 0.17" + echo "4 errors in a 26bp degenerated primer = 4/26 = 0.16, 2 errors in a 20bp primer = 0.1 (Cutadapt default)." + echo "-p PREFIX to quality file (it MUST be the SAME prefix throughout the ENTIRE PIPELINE)" + echo "-l minimum length for the final amplicon" + echo "-d path to demultiplexed samples directory. Use pwd." + echo "-h print this help" + echo " " + echo "##################################################" + 1>&2; exit 1; + +} +while getopts "b:F:R:f:r:e:p:d:l:h" option; do + case $option in + b) TAGS="${OPTARG}" + ;; + F) PRIMER_F="${OPTARG}" + ;; + R) PRIMER_R="${OPTARG}" + ;; + f) PRIMER_F_RC="${OPTARG}" + ;; + r) PRIMER_R_RC="${OPTARG}" + ;; + e) EE="${OPTARG}" + ;; + p) PREFIX="${OPTARG}" + ;; + d) DIR="${OPTARG}" + ;; + l) MIN_LEN="${OPTARG}" + ;; + h | *) usage + exit 0 + ;; + \?) echo "Invalid option: -$OPTARG" + exit 1 + ;; + : ) echo "Missing argument for -${OPTARG}" >&2 + exit 1 + ;; + esac +done + +module --force purge +module load StdEnv +module load cutadapt/2.10-GCCcore-9.3.0-Python-3.8.2 + +mkdir -p clip_out +mkdir -p clip_out/tag_clip +for file in "${DIR}"/*_LA.fq; do LA="$(basename $file _LA.fq)"; cutadapt -j 8 --quiet -e $EE -a file:Tags_LA1.fa -a file:Tags_LA2.fa -a file:Tags_LA3.fa --action=trim --trim-n --max-n 0 --minimum-length $MIN_LEN -o ${LA}_trim1.fq $file; done; + +for f in *_trim1.fq; do trim1="$(basename $f _trim1.fq)"; cutadapt -j 8 --quiet -e $EE -a "${PRIMER_F}...${PRIMER_R_RC}" -O $MIN_F -a "${PRIMER_R}...${PRIMER_F_RC}" -O $MIN_R --action=trim --minimum-length $MIN_LEN -o ${trim1}_trim2.fq $f; done; +mv *_trim1.fq clip_out/tag_clip + +module --force purge +module load StdEnv +module load VSEARCH/2.13.4-iccifort-2019.1.144-GCC-8.2.0-2.31.1 + +mkdir -p clip_out/primer_clip +# Discard sequences containing Ns, add expected error rates and convert to fasta +for j in *_trim2.fq; do trim2="$(basename $j _trim2.fq)"; vsearch --quiet --fastq_filter $j --relabel_sha1 --fastq_qmax 45 --eeout --fastq_maxns 0 --fastqout ${trim2}_trim3.fq; done; +mv *_trim2.fq clip_out/primer_clip + +mkdir -p clip_out/trimming +for k in *_trim3.fq; do trim3a="$(basename $k _trim3.fq)"; vsearch --quiet --fastq_filter $k --xee --fastaout ${trim3a}_trim3.fasta; done; + +#quality file +# Discard quality lines, extract hash, expected error rates and read length +QUALITY_FILE=${PREFIX}.qual +TMP_QUAL=$(mktemp --tmpdir=".") +for q in *_trim3.fq; do \ + sed 'n;n;N;d' $q | \ + awk 'BEGIN {FS = "[;=]"} + {if (/^@/) {printf "%s\t%s\t", $1, $3} else {print length($1)}}' | \ + tr -d "@" >> "${TMP_QUAL}" +done < "${TAGS}" +# Produce the final quality file +sort -k3,3n -k1,1d -k2,2n $TMP_QUAL | \ + uniq --check-chars=40 > "${QUALITY_FILE}" +rm $TMP_QUAL +mv *_trim3.fq clip_out/trimming + +mkdir dereplicated +for p in *_trim3.fasta; do trim3b="$(basename $p _trim3.fasta)"; vsearch --quiet --derep_fulllength $p --sizeout --fasta_width 0 --output ${trim3b}_dp.fasta; done; +rm *_trim3.fasta +mv *_dp.fasta dereplicated + + +