Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
tallnuttrbgv authored May 27, 2024
1 parent 27c82d2 commit e6dcde3
Showing 1 changed file with 177 additions and 0 deletions.
177 changes: 177 additions & 0 deletions kmercount-shannons.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
#!/usr/bin/env python3

#theo allnutt 2021
#see r10.2 for notes
#outputs Shannon's and kmer count entropy measures for DNA sequence file or files
#usage:
#kmercount-shannons.py "input-folder/*" output-folder/ <kmerlen> <threads>
#kmercount-shannons.py "mega353.fasta" ./ 8 1
#kmercount-shannons.py "reads/*.fasta" kmercounts/ 8 12
#kmercount-shannons.py "reads/*.fastq.gz" kmercounts/ 4 12


import sys
import os
import re
import glob
import subprocess as sp
import concurrent.futures
from Bio import SeqIO
import gzip
import math
import collections


def tokenize(filename):
digits = re.compile(r'(\d+)')
return tuple(int(token) if match else token for token, match in ((fragment, digits.search(fragment)) for fragment in digits.split(filename)))

def estimate_shannon_entropy(dna_sequence):

seq=dna_sequence.upper()
if len(seq)-seq.count("N") <=0:
a=0
t=0
c=0
g=0
else:
a=float(seq.count("A")/(len(seq)-seq.count("N")))
t=float(seq.count("T")/(len(seq)-seq.count("N")))
c=float(seq.count("C")/(len(seq)-seq.count("N")))
g=float(seq.count("G")/(len(seq)-seq.count("N")))

if a!=0:
a1=a * math.log(a,4)
else:
a1=0
if t!=0:
t1=t * math.log(t,4)
else:
t1=0
if c!=0:
c1=c * math.log(c,4)
else:
c1=0
if g!=0:
g1=g * math.log(g,4)
else:
g1=0


entropy_value = -1 * ( a1 + t1 + c1 + g1 )

#print(a,t,c,g, entropy_value)

return entropy_value

def kmercount(dna_sequence,wordsize):

wordsizeint=int(wordsize)
kmers=[]
seqlen=float(len(dna_sequence))
wordsize=float(wordsize)


#n.b. this posscount is modelled for shorter sequences where b^len count is not true
posscount=float((4**wordsize)*(1-math.exp(-(1/(4**wordsize))*seqlen)))

#use infinate len seq posscount
#posscount=4**wordsize

#print(posscount)
for x in range(len(dna_sequence)-wordsizeint):

kbit=dna_sequence[x:x+wordsizeint]
if "N" not in kbit:

#if kbit not in kmers:
#print(x,kbit)
kmers.append(kbit) #also try if to test before set see which is faster:

#if len(kmers)>=posscount:
#break
kmers=set(kmers)

kratio=float(len(kmers)/posscount)
#kratio is ratio of kmer count to maximum possilbe count for the sequence length
return (len(kmers),kratio)


def shan(i,wordn):

name0=i.split("/")[-2]
name1=i.split("/")[-1]
name1=name0+"_"+name1
#print(name1)
ext1=name1.split(".")[-1]

if ext1=="gz":
#print("gz detected")
f=gzip.open(i,'rt')
k=name1.split(".")[-2]
name1=name1[:-3]
else:
f=open(i,'r')
k=ext1

if k[-1]=="a":
fmt="fasta"
#print("fasta")
if k[-1]=="q":
fmt="fastq"
#print("fastq")
c=0

st=0
outfile=open(outfolder+"/"+name1+".shan",'w')
outfile.write("name\tbp\tShannon's\tkmer_count\tkmer_ratio\n")

for x in SeqIO.parse(f,fmt):
c=c+1

if c<=sample:

shannon = estimate_shannon_entropy(str(x.seq))
kcount=kmercount(str(x.seq),wordn)

outfile.write(str(x.description)+"\t"+str(len(x.seq))+"\t"+str(shannon)+"\t"+str(kcount[0])+"\t"+str(kcount[1])+"\n")

#print(x.id,'len=',len(x.seq),'S=',shannon,'nK=',kcount[0],'rK=',kcount[1])
st=st+shannon
else:
break

shanmean=float(st/c)

print(i,c,"reads",shanmean,"mean Shannons")

outfile.close()

#global

folder=sys.argv[1]
filelist=glob.glob(folder)
filelist.sort(key=tokenize)
print(filelist)
outfolder = sys.argv[2] #outfolder
wordn=int(sys.argv[3])
sample=int(sys.argv[4])
threads=int(sys.argv[5])

if __name__ == '__main__':

#executor = concurrent.futures.ProcessPoolExecutor(threads)
#futures = [executor.submit(shan, i,wordn) for i in filelist]
#concurrent.futures.wait(futures)



for i in filelist:
shan(i,wordn)

#print(data)





0 comments on commit e6dcde3

Please sign in to comment.