forked from gaschlabgit/RNA-Seq-Pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathremove3primeAFastq.py
executable file
·126 lines (94 loc) · 3.54 KB
/
remove3primeAFastq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/home/GLBRCORG/mplace/anaconda3/bin/python
"""
@Program: remove3primeAFastq.py
@Purpose: Remove all consecutive 3 prime A's from fastq file, also trims quality scores.
Remove all consecutive 3 prime A's from fastq file where pattern is something like:
nnnnnnnnnnnnnnnAAAAAAAGATCGGAAGAGC
@Input: Input file listing one fastq file per line, unzipped and the number of A's to search for
@Dependencies: Python 3
@Output: Fastq file with 3 prime A's removed
@author: Mike Place
@Date: 2/5/2015
Example:
~/scripts/remove3primeAFastq.py <listfile> <num A>
ORIGINAL:
@HWI-D00256:264:HBC0UADXX:2:1101:11620:2235 1:N:0:CGTGAT
GGGGCAGCTTTCAGATTGTACTAAGCATAATTACGTGTTTTCATAGTTTAACGCTTTCAGAACTACTTATTTAATTTTGTAAGAAGTAATTTGAGTCACAT
+
?@@?D@@;CFBFDD<CBC<<<AA<GGHEGBHIIDG1???FGBBDGGIGGGED?@GDFFHCGDHGIGGCDEEEEC?CHHEHHB>?DE@@ADEE@C@CCCCAC
@HWI-D00256:264:HBC0UADXX:2:1101:14412:2175 1:N:0:CGTGAT
GGGATGTTTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+
CCCFFFFFHHHHHJJJJJJJJJJHFDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
TRAILING A's removed:
@HWI-D00256:264:HBC0UADXX:2:1101:11620:2235 1:N:0:CGTGAT
GGGGCAGCTTTCAGATTGTACTAAGCATAATTACGTGTTTTCATAGTTTAACGCTTTCAGAACTACTTATTTAATTTTGTAAGAAGTAATTTGAGTCACAT
+
?@@?D@@;CFBFDD<CBC<<<AA<GGHEGBHIIDG1???FGBBDGGIGGGED?@GDFFHCGDHGIGGCDEEEEC?CHHEHHB>?DE@@ADEE@C@CCCCAC
@HWI-D00256:264:HBC0UADXX:2:1101:14412:2175 1:N:0:CGTGAT
GGGATGTTT -- -- SEE THE A's have been removed
+
CCCFFFFFH -- Quality score also adjusted
"""
import itertools
import re
import sys
def stripA( dna, qual ):
"""
strip A's from the end of string
"""
m = re.search("A+$", dna)
if m:
start = m.start()
trimA = dna[:start]
trimQ = qual[:start]
return trimA, trimQ
else:
return dna, qual
def stripAplusEnd( dna, qual, numA ):
"""
strip a pattern like: NNNNNN.....AAAAAAAAGATCGGAAGA from 3' end of seq
"""
#****** CHANGE THE DESIRED NUMBER OF A's HERE
exp = r"[A]{%d,}[GATC]{5,}$" %(numA)
m = re.search(exp, dna)
if m:
start = m.start()
trimA = dna[:start]
trimQ = qual[:start]
return trimA, trimQ
else:
return dna, qual
def main():
"""
main()
"""
# if no args print help
if len(sys.argv) <= 2:
print("Input file and Number of A's to remove are required.")
sys.exit(1)
else:
inFile = sys.argv[1]
numA = int(sys.argv[2])
fastqList = []
with open( inFile, 'r') as f:
for line in f:
line = line.rstrip()
fastqList.append(line)
for item in fastqList:
removeAFastq = item.replace("fastq", "rmA.fastq")
with open(removeAFastq, "w") as out:
with open(item) as f:
for line1, line2, line3, line4 in itertools.zip_longest(*[f]*4):
header = line1.rstrip()
dna = line2.rstrip()
qual = line4.rstrip()
dna_A, qual_A = stripA(dna, qual) # strip A's first
dna_End, qual_End = stripAplusEnd(dna_A, qual_A, numA) # strip AAAGATC... ends
out.write("%s\n" %(header) )
out.write("%s\n" %(dna_End) )
out.write("+\n")
out.write("%s\n" %(qual_End))
out.close()
if __name__ == "__main__":
main()