Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
#outputs Shannon's and kmer count entropy measures for DNA sequence file or files
#usage:
#kmercount-byread.py "input-folder/*" output-folder/ <kmerlen> <threads>
#kmercount-byread.py "mega353.fasta" ./ 8 1
#kmercount-byread.py "reads/*.fasta" kmercounts/ 8 12
#kmercount-byread.py "reads/*.fastq.gz" kmercounts/ 4 12
  • Loading branch information
tallnuttrbgv authored Jan 31, 2022
1 parent 857d5c1 commit 6b88b56
Showing 1 changed file with 165 additions and 0 deletions.
165 changes: 165 additions & 0 deletions kmercount-byread.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#!/usr/bin/env python3

#theo allnutt 2021
#outputs Shannon's and kmer count entropy measures for DNA sequence file or files
#usage:
#kmercount-byread.py "input-folder/*" output-folder/ <kmerlen> <threads>
#kmercount-byread.py "mega353.fasta" ./ 8 1
#kmercount-byread.py "reads/*.fasta" kmercounts/ 8 12
#kmercount-byread.py "reads/*.fastq.gz" kmercounts/ 4 12


import sys
import os
import re
import glob
import subprocess as sp
import concurrent.futures
from Bio import SeqIO
import gzip
import math
import collections





def tokenize(filename):
digits = re.compile(r'(\d+)')
return tuple(int(token) if match else token for token, match in ((fragment, digits.search(fragment)) for fragment in digits.split(filename)))

def estimate_shannon_entropy(dna_sequence):

seq=dna_sequence.upper()
if len(seq)-seq.count("N") <=0:
a=0
t=0
c=0
g=0
else:
a=float(seq.count("A")/(len(seq)-seq.count("N")))
t=float(seq.count("T")/(len(seq)-seq.count("N")))
c=float(seq.count("C")/(len(seq)-seq.count("N")))
g=float(seq.count("G")/(len(seq)-seq.count("N")))

if a!=0:
a1=a * math.log(a,4)
else:
a1=0
if t!=0:
t1=t * math.log(t,4)
else:
t1=0
if c!=0:
c1=c * math.log(c,4)
else:
c1=0
if g!=0:
g1=g * math.log(g,4)
else:
g1=0


entropy_value = -1 * ( a1 + t1 + c1 + g1 )

#print(a,t,c,g, entropy_value)

return entropy_value

def kmercount(dna_sequence,wordsize):

wordsizeint=int(wordsize)
kmers=[]
seqlen=float(len(dna_sequence))
wordsize=float(wordsize)



posscount=float((4**wordsize)*(1-math.exp(-(1/(4**wordsize))*seqlen)))
#print(posscount)
for x in range(len(dna_sequence)-wordsizeint):

kbit=dna_sequence[x:x+wordsizeint]
if "N" not in kbit:

#if kbit not in kmers:
#print(x,kbit)
kmers.append(kbit) #also try if to test before set see which is faster:

#if len(kmers)>=posscount:
#break
kmers=set(kmers)

kratio=float(len(kmers)/posscount)
#kratio is ratio of kmer count to maximum possilbe count for the sequence length
return (len(kmers),kratio)


def shan(i,wordn):

name1=i.split("/")[-1]
print(name1)
ext1=name1.split(".")[-1]

if ext1=="gz":
#print("gz detected")
f=gzip.open(i,'rt')
k=name1.split(".")[-2]
name1=name1[:-3]
else:
f=open(i,'r')
k=ext1

if k[-1]=="a":
fmt="fasta"
#print("fasta")
if k[-1]=="q":
fmt="fastq"
#print("fastq")
c=0

st=0
outfile=open(outfolder+"/"+name1+".shan",'w')
outfile.write("name\tbp\tShannon's\tkmer_count\tkmer_ratio\n")
for x in SeqIO.parse(f,fmt):
c=c+1

shannon = estimate_shannon_entropy(str(x.seq))
kcount=kmercount(str(x.seq),wordn)
outfile.write(str(x.description)+"\t"+str(len(x.seq))+"\t"+str(shannon)+"\t"+str(kcount[0])+"\t"+str(kcount[1])+"\n")
print(x.id,'len=',len(x.seq),'S=',shannon,'nK=',kcount[0],'rK=',kcount[1])
st=st+shannon
shanmean=float(st/c)

print(i,c,"reads",shanmean,"mean Shannons")

outfile.close()



folder=sys.argv[1]
filelist=glob.glob(folder)
filelist.sort(key=tokenize)
print(filelist)
outfolder = sys.argv[2] #outfolder
wordn=int(sys.argv[3])
threads=int(sys.argv[4])


if __name__ == '__main__':

executor = concurrent.futures.ProcessPoolExecutor(threads)
futures = [executor.submit(shan, i,wordn) for i in filelist]
concurrent.futures.wait(futures)



#for i in filelist:
#shan(i,wordn)

#print(data)





0 comments on commit 6b88b56

Please sign in to comment.