-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathshannons-stream-filter.py
135 lines (96 loc) · 2.74 KB
/
shannons-stream-filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
#theo allnutt 2021
#r3.7 calculates Shannon's information index for sample of reads, only outputs reads over threshold.
#usage
#shannons-folder.py "reads/*" filtered/ 0.8 12
#filtered/=output folder, 0.8= threshold, 12=threads
#accepts fasta, fastq and .gz
#n.b. for word size see:
#Akhter, S., Bailey, B., Salamon, P. et al. Applying Shannon's information theory to bacterial and phage genomes and metagenomes. Sci Rep 3, 1033 (2013). https://doi.org/10.1038/srep01033
import sys
import os
import re
import glob
import subprocess as sp
#import multiprocessing
import concurrent.futures
from Bio import SeqIO
import gzip
import math
import collections
from multiprocessing import Manager
digits = re.compile(r'(\d+)')
def tokenize(filename):
return tuple(int(token) if match else token for token, match in ((fragment, digits.search(fragment)) for fragment in digits.split(filename)))
def estimate_shannon_entropy(dna_sequence):
seq=dna_sequence.upper()
if len(seq)-seq.count("N") <=0:
a=0
t=0
c=0
g=0
else:
a=float(seq.count("A")/(len(seq)-seq.count("N")))
t=float(seq.count("T")/(len(seq)-seq.count("N")))
c=float(seq.count("C")/(len(seq)-seq.count("N")))
g=float(seq.count("G")/(len(seq)-seq.count("N")))
if a!=0:
a1=a * math.log(a,4)
else:
a1=0
if t!=0:
t1=t * math.log(t,4)
else:
t1=0
if c!=0:
c1=c * math.log(c,4)
else:
c1=0
if g!=0:
g1=g * math.log(g,4)
else:
g1=0
entropy_value = -1 * ( a1 + t1 + c1 + g1 )
#print(a,t,c,g, entropy_value)
return entropy_value
def shan(i,chunk):
name1=i.split("/")[-1]
print(name1)
f=open(i,'r')
outfile=open(outfolder+"/"+name1,'w')
n=0
c=0
for x in SeqIO.parse(f,'fasta'):
slen=len(x.seq)
#remainder=1+int(chunk*float("."+str(float(slen/chunk)).split(".")[-1]))
goodseq=""
for y in range(0,slen,chunk):
n=n+1
if y < slen-chunk:
checkseq=str(x.seq)[y:y+256]
shannon = estimate_shannon_entropy(checkseq)
else:
checkseq=str(x.seq)[y:]
shannon = estimate_shannon_entropy(checkseq)
if shannon >=shan_limit:
c=c+1
goodseq=goodseq+checkseq
#print(shannon,checkseq)
#print(x.id,goodseq)
outfile.write(">"+str(x.description)+"\n"+str(goodseq)+"\n")
print(i,"chunks=",n,"passed=",c,"%passed=",float((c/n)*100))
outfile.close()
folder=sys.argv[1]
filelist=glob.glob(folder)
filelist.sort(key=tokenize)
print(filelist)
outfolder = sys.argv[2] #outfolder
shan_limit=float(sys.argv[3])
chunk=int(sys.argv[4])
threads=int(sys.argv[5])
if __name__ == '__main__':
executor = concurrent.futures.ProcessPoolExecutor(threads)
futures = [executor.submit(shan, i,chunk) for i in filelist]
concurrent.futures.wait(futures)
#for i in filelist:
#shan(i,chunk)