-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcounter.nf
192 lines (158 loc) · 6.46 KB
/
counter.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#!/usr/bin/env nextflow
// Made by: Igor Trujnara, Alessio Vignoli
nextflow.enable.dsl = 2
params.help = false
// this prints the help in case you use --help parameter in the command line and it stops the pipeline
if (params.help) {
log.info "This pipeline computes kmers of length specified by the --kmer flag. "
log.info "The input is two types of files: one FASTA file given with the --fasta option and one Phobius prediction file given with --pred."
log.info "The prediction file is a direct stdout redirection of Phobius with -short."
log.info "This means that for each FASTA file each sequence is found in the associated prediction file."
log.info "The order of FASTA and prediction files does not matter."
log.info ""
log.info "--pairing flag, when given, no matter the content, will always use the non-canonical pairing algorithm (explained afterwards)."
log.info "--pairing \"standard\" is the only case when the flag is explicitly mentioned, and in this case the canonical pairing algorithm is used."
log.info "Not mentioning the pairing flag will use the canonical pairing scheme."
log.info "Canonical pairing scheme:"
log.info "The glob pattern in the --pred flag has to encapsulate also the FASTA files."
log.info "e.g. predfile = sample1.txt, sample2.txt fastafile = sample1.fa, sample2.fa"
log.info "In this case the --pred flag will be --pred \"*.txt\" "
log.info "Since the names are identical, the pairs generated will be sample1.txt with sample1.fa etc."
log.info "WARNING: .txt and .fa must be used in this case. File names cannot contain other dots."
log.info "Non-canonical pairing scheme:"
log.info "In this case the pred file and the FASTA file name can be whatever. The only thing that links them is the content matched by the asterisk."
log.info "e.g. predfile = sample1.txt, sample2.txt fastafile = sample_1.fasta, sample_2.fasta"
log.info "In this case, the flags will be --pred \"sample*.txt\" --fasta \"sample_*.fasta\""
log.info "WARNING: Only one asterisk is recognized as glob pattern."
log.info ""
log.info "The last flag is --feature. It controls which type of phobius prediction feature is extracted. Possible ones are:"
log.info "default is s for special helix, all the possible are: [c, i, o, -, n, s, l] c = signal peptide, i = inside membrane(cytoplasm),"
log.info "o = outside membrane, - = helix (in phobius originalmodel), (only in phobius-M7or later) => -n- = normal-helix"
log.info "s- = special-helix and -l- = loop-inramembrane"
log.info "they have to be given to the pipeline in one letter code"
}
// params section
params.kmer = 3
params.feature = "s"
params.pairing = "standard"
params.fasta = "data/sequence.fa"
params.pred = "data/prediction.txt"
params.outdir = "results/"
params.CONTAINER = "python:slim-buster@sha256:fe2971bedd019d952d4458afb1fe4e222ddb810150008c1dee5a068d38bb0e43"
// include section
include { ch_pairer } from "${params.PIPES}channel_pairer.nf"
include { pairer } from "${params.PIPES}input_files_pairer.nf"
/*
process findSequences {
input:
path pyscript
path pred_file
val feature_id
output:
path "seq_ranges_*.txt", emit: ranges
stdout emit: standardout // debug porpouses
script:
suffix = "seq_ranges_${pred_file.getFileName()}".split('\\.')[0] + ".txt"
"""
python3 ${pyscript} ${pred_file} ${suffix} ${feature_id}
"""
}
process extractSequences {
input:
path pyscript
path ranges
path fasta
output:
//path "seqs_*.txt"
stdout emit: standardout // debug porpouses
script:
suffix = "${ranges.getFileName()}".split('\\.')[0] + "__" + "${fasta.getFileName()}".split('\\.')[0] + ".txt"
"""
python3 ${pyscript} ${ranges} ${fasta} ${suffix}
"""
}
*/
process findAndExtractSeq {
container params.CONTAINER
input:
path findscript
path extrscript
tuple val(matcher), path(predFile), path(fastaFile)
val featureID
output:
path "seqs_pair_*.fa", emit: sub_seqs
stdout emit: standardout
script:
suffix = "seqs_${matcher}.txt"
prefix = "${predFile}".split("${matcher}")[0].split('\\.')[0] // used later on at the final step for the output name
"""
python3 ${findscript} ${predFile} ${suffix} ${featureID}
python3 ${extrscript} ${suffix} ${fastaFile} "seqs_pair_${matcher}.fa"
echo ${prefix}
"""
}
process countKmers {
container params.CONTAINER
input:
path pyscript
path seqs
val kmer
output:
path "seq_kmers_*.txt", emit: kmers_counts
stdout emit: standardout // debug porpouses
script:
suffix = "${seqs.getFileName()}".split("seqs_pair_")[1].split('\\.')[0] + ".txt"
"""
python3 ${pyscript} ${seqs} "seq_kmers_${suffix}" ${kmer}
"""
}
process sumKmers {
container params.CONTAINER
publishDir params.outdir, mode: "move", overwrite: false
scratch true
input:
path pyscript
path kmers
val suffix
output:
path "total_kmers_*", emit: tot_kemers
stdout emit: standardout
script:
final_name = "total_kmers_" + "${suffix}_".replaceAll("\n", "") + "${kmers}".split("seq_kmers_")[1].replaceAll("bubba","")
"""
for i in `echo ${kmers}`; do cat \$i >> TMP; done
python3 ${pyscript} TMP ${final_name}
"""
}
workflow countHelixKmers {
take:
kmer
featName
prediction
fasta
main:
// script finder
findscript = params.SCRIPTS + "find_seqs.py"
extrscript = params.SCRIPTS + "extract_seqs.py"
countscript = params.SCRIPTS + "kmer_count.py"
sumscript = params.SCRIPTS + "sum_kmers.py"
// channel definitions
pairedInputs = ""
if(params.pairing == "standard"){
pairedInputs = Channel.fromFilePairs( prediction + ".{txt,fa}").map{ [it[0], it[1][1], it[1][0]] } // not to have list of lists and fa comes before txt
} else {
pairer(prediction, fasta)
pairedInputs = pairer.out.right_pairs
}
findAndExtractSeq(findscript, extrscript, pairedInputs, featName)
countKmers(countscript, findAndExtractSeq.out.sub_seqs, kmer)
sumKmers(sumscript, countKmers.out.kmers_counts.collect(), findAndExtractSeq.out.standardout.last())
emit:
final_out = sumKmers.out.tot_kemers
//stdout = sumKmers.out.standardout // for debug porpouses
}
workflow {
countHelixKmers(params.kmer, params.feature, params.pred, params.fasta)
countHelixKmers.out.final_out.view()
//countHelixKmers.out.stdout.view() // for debug porpouses
}