Add files via upload

Royal-Botanic-Gardens-Victoria · May 27, 2024 · e6dcde3 · e6dcde3
1 parent 27c82d2
commit e6dcde3
Showing 1 changed file with 177 additions and 0 deletions.
diff --git a/kmercount-shannons.py b/kmercount-shannons.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+
+#theo allnutt 2021
+#see r10.2 for notes
+#outputs Shannon's and kmer count entropy measures for DNA sequence file or files
+#usage:
+#kmercount-shannons.py "input-folder/*" output-folder/ <kmerlen> <threads>
+#kmercount-shannons.py "mega353.fasta" ./ 8 1
+#kmercount-shannons.py "reads/*.fasta" kmercounts/ 8 12
+#kmercount-shannons.py "reads/*.fastq.gz" kmercounts/ 4 12
+
+
+import sys
+import os
+import re
+import glob
+import subprocess as sp
+import concurrent.futures
+from Bio import SeqIO
+import gzip
+import math
+import collections
+
+
+def tokenize(filename):
+	digits = re.compile(r'(\d+)')
+	return tuple(int(token) if match else token for token, match in ((fragment, digits.search(fragment)) for fragment in digits.split(filename)))
+
+def estimate_shannon_entropy(dna_sequence):
+
+	seq=dna_sequence.upper()
+	if len(seq)-seq.count("N") <=0:
+		a=0
+		t=0
+		c=0
+		g=0
+	else:
+		a=float(seq.count("A")/(len(seq)-seq.count("N")))
+		t=float(seq.count("T")/(len(seq)-seq.count("N")))
+		c=float(seq.count("C")/(len(seq)-seq.count("N")))
+		g=float(seq.count("G")/(len(seq)-seq.count("N")))
+
+	if a!=0:
+		a1=a * math.log(a,4)
+	else: 
+		a1=0
+	if t!=0:
+		t1=t * math.log(t,4)
+	else: 
+		t1=0
+	if c!=0:
+		c1=c * math.log(c,4)
+	else: 
+		c1=0
+	if g!=0:
+		g1=g * math.log(g,4)
+	else: 
+		g1=0		
+
+
+	entropy_value = -1 * ( a1 + t1 + c1 + g1 )
+
+	#print(a,t,c,g, entropy_value)
+
+	return entropy_value
+
+def kmercount(dna_sequence,wordsize):
+
+	wordsizeint=int(wordsize)
+	kmers=[]
+	seqlen=float(len(dna_sequence))
+	wordsize=float(wordsize)
+
+
+	#n.b. this posscount is modelled for shorter sequences where b^len count is not true
+	posscount=float((4**wordsize)*(1-math.exp(-(1/(4**wordsize))*seqlen)))
+
+	#use infinate len seq posscount
+	#posscount=4**wordsize
+
+	#print(posscount)
+	for x in range(len(dna_sequence)-wordsizeint):
+
+		kbit=dna_sequence[x:x+wordsizeint]
+		if "N" not in kbit:
+
+		#if kbit not in kmers:
+			#print(x,kbit)
+			kmers.append(kbit) #also try if to test before set see which is faster: 
+
+		#if len(kmers)>=posscount:
+			#break
+	kmers=set(kmers)
+
+	kratio=float(len(kmers)/posscount)
+	#kratio is ratio of kmer count to maximum possilbe count for the sequence length
+	return (len(kmers),kratio)
+
+
+def shan(i,wordn):
+
+	name0=i.split("/")[-2]
+	name1=i.split("/")[-1]
+	name1=name0+"_"+name1
+	#print(name1)
+	ext1=name1.split(".")[-1]
+
+	if ext1=="gz":
+		#print("gz detected")
+		f=gzip.open(i,'rt')
+		k=name1.split(".")[-2]
+		name1=name1[:-3]
+	else:
+		f=open(i,'r')
+		k=ext1
+
+	if k[-1]=="a":
+		fmt="fasta"
+		#print("fasta")
+	if k[-1]=="q":
+		fmt="fastq"
+		#print("fastq")
+	c=0
+
+	st=0
+	outfile=open(outfolder+"/"+name1+".shan",'w')
+	outfile.write("name\tbp\tShannon's\tkmer_count\tkmer_ratio\n")
+
+	for x in SeqIO.parse(f,fmt):
+		c=c+1
+
+		if c<=sample:
+
+			shannon = estimate_shannon_entropy(str(x.seq))			
+			kcount=kmercount(str(x.seq),wordn)
+
+			outfile.write(str(x.description)+"\t"+str(len(x.seq))+"\t"+str(shannon)+"\t"+str(kcount[0])+"\t"+str(kcount[1])+"\n")
+
+			#print(x.id,'len=',len(x.seq),'S=',shannon,'nK=',kcount[0],'rK=',kcount[1])
+			st=st+shannon
+		else:
+			break
+
+	shanmean=float(st/c)
+
+	print(i,c,"reads",shanmean,"mean Shannons")
+
+	outfile.close()
+
+#global
+
+folder=sys.argv[1] 
+filelist=glob.glob(folder)
+filelist.sort(key=tokenize)
+print(filelist)
+outfolder = sys.argv[2] #outfolder
+wordn=int(sys.argv[3])
+sample=int(sys.argv[4])
+threads=int(sys.argv[5])
+
+if __name__ == '__main__':
+
+	#executor = concurrent.futures.ProcessPoolExecutor(threads)
+	#futures = [executor.submit(shan, i,wordn) for i in filelist]
+	#concurrent.futures.wait(futures)
+
+
+
+	for i in filelist:
+		shan(i,wordn)
+
+	#print(data)
+
+
+
+
+