Commit

Add files via upload
Filters DNA sequences (reads) by Shannon's information index. Word size is whole read length.
tallnuttrbgv authored Jan 31, 2022
1 parent a9cd4d5 commit 64845eb
135 changes: 135 additions & 0 deletions shannons-stream-filter.py
#!/usr/bin/env python3

#theo allnutt 2021
#r3.7 calculates Shannon's information index for chunks of each read and only writes the chunks that score over the threshold.
#usage:
#shannons-stream-filter.py "reads/*" filtered/ 0.8 256 12
#"reads/*"=input glob, filtered/=output folder, 0.8=entropy threshold, 256=word (chunk) size, 12=threads
#input must be plain fasta; fastq and .gz are not handled by this version
#n.b. for word size see:
#Akhter, S., Bailey, B., Salamon, P. et al. Applying Shannon's information theory to bacterial and phage genomes and metagenomes. Sci Rep 3, 1033 (2013). https://doi.org/10.1038/srep01033
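#worked example of the index (illustrative note, not from the reference above): with log base 4 the index runs from 0 to 1,
#H = -(pA*log4(pA) + pT*log4(pT) + pC*log4(pC) + pG*log4(pG)),
#so a chunk with equal base frequencies scores 1.0, a homopolymer scores 0.0,
#and a threshold such as 0.8 discards low-complexity chunks.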



import sys
import re
import glob
import math
import concurrent.futures
from Bio import SeqIO

#natural sort key so that e.g. "read2" orders before "read10"
digits = re.compile(r'(\d+)')
def tokenize(filename):
    return tuple(int(token) if match else token for token, match in ((fragment, digits.search(fragment)) for fragment in digits.split(filename)))

def estimate_shannon_entropy(dna_sequence):
    #Shannon index of a DNA string using log base 4, so values range from 0 to 1
    seq = dna_sequence.upper()
    if len(seq) - seq.count("N") <= 0:
        #sequence is empty or all Ns: treat every base frequency as zero
        a = t = c = g = 0
    else:
        #note: N bases still count towards the length here
        a = float(seq.count("A") / len(seq))
        t = float(seq.count("T") / len(seq))
        c = float(seq.count("C") / len(seq))
        g = float(seq.count("G") / len(seq))

    #0 * log(0) is taken as 0
    a1 = a * math.log(a, 4) if a != 0 else 0
    t1 = t * math.log(t, 4) if t != 0 else 0
    c1 = c * math.log(c, 4) if c != 0 else 0
    g1 = g * math.log(g, 4) if g != 0 else 0

    entropy_value = -1 * (a1 + t1 + c1 + g1)

    return entropy_value
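#quick sanity checks (illustrative expected values, not executed by the script):
#estimate_shannon_entropy("ACGTACGT") -> 1.0  (all four bases at frequency 0.25)
#estimate_shannon_entropy("AAAACCCC") -> 0.5  (two bases at frequency 0.5)
#estimate_shannon_entropy("AAAAAAAA") -> 0.0  (homopolymer)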

def shan(i, chunk):
    #filter one fasta file: keep only the read chunks whose Shannon index is >= shan_limit
    name1 = i.split("/")[-1]
    print(name1)

    f = open(i, 'r')
    outfile = open(outfolder + "/" + name1, 'w')

    n = 0  #chunks tested
    c = 0  #chunks passed
    for x in SeqIO.parse(f, 'fasta'):

        slen = len(x.seq)
        goodseq = ""

        #slide over the read in windows of length chunk; the final window takes the remainder
        for y in range(0, slen, chunk):
            n = n + 1

            if y < slen - chunk:
                checkseq = str(x.seq)[y:y + chunk]
            else:
                checkseq = str(x.seq)[y:]

            shannon = estimate_shannon_entropy(checkseq)

            if shannon >= shan_limit:
                c = c + 1
                goodseq = goodseq + checkseq

        outfile.write(">" + str(x.description) + "\n" + str(goodseq) + "\n")

    if n > 0:  #avoid division by zero for empty input files
        print(i, "chunks=", n, "passed=", c, "%passed=", float((c / n) * 100))

    f.close()
    outfile.close()

folder = sys.argv[1]  #input glob, e.g. "reads/*"
filelist = glob.glob(folder)
filelist.sort(key=tokenize)
print(filelist)
outfolder = sys.argv[2]  #output folder
shan_limit = float(sys.argv[3])  #entropy threshold

chunk = int(sys.argv[4])  #word (chunk) size in bases
threads = int(sys.argv[5])  #number of worker processes

if __name__ == '__main__':

    #one worker process per input file, up to the requested number of threads
    executor = concurrent.futures.ProcessPoolExecutor(threads)
    futures = [executor.submit(shan, i, chunk) for i in filelist]
    concurrent.futures.wait(futures)

    #serial alternative:
    #for i in filelist:
    #    shan(i, chunk)
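#example run (hypothetical file names):
#  python3 shannons-stream-filter.py "reads/*.fasta" filtered/ 0.8 256 12
#writes one same-named output file per input into filtered/, keeping only the chunks scoring >= 0.8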


