-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#outputs Shannon's and kmer count entropy measures for DNA sequence file or files #usage: #kmercount-byread.py "input-folder/*" output-folder/ <kmerlen> <threads> #kmercount-byread.py "mega353.fasta" ./ 8 1 #kmercount-byread.py "reads/*.fasta" kmercounts/ 8 12 #kmercount-byread.py "reads/*.fastq.gz" kmercounts/ 4 12
- Loading branch information
1 parent
857d5c1
commit 6b88b56
Showing
1 changed file
with
165 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
#!/usr/bin/env python3 | ||
|
||
#theo allnutt 2021 | ||
#outputs Shannon's and kmer count entropy measures for DNA sequence file or files | ||
#usage: | ||
#kmercount-byread.py "input-folder/*" output-folder/ <kmerlen> <threads> | ||
#kmercount-byread.py "mega353.fasta" ./ 8 1 | ||
#kmercount-byread.py "reads/*.fasta" kmercounts/ 8 12 | ||
#kmercount-byread.py "reads/*.fastq.gz" kmercounts/ 4 12 | ||
|
||
|
||
import sys | ||
import os | ||
import re | ||
import glob | ||
import subprocess as sp | ||
import concurrent.futures | ||
from Bio import SeqIO | ||
import gzip | ||
import math | ||
import collections | ||
|
||
|
||
|
||
|
||
|
||
def tokenize(filename): | ||
digits = re.compile(r'(\d+)') | ||
return tuple(int(token) if match else token for token, match in ((fragment, digits.search(fragment)) for fragment in digits.split(filename))) | ||
|
||
def estimate_shannon_entropy(dna_sequence): | ||
|
||
seq=dna_sequence.upper() | ||
if len(seq)-seq.count("N") <=0: | ||
a=0 | ||
t=0 | ||
c=0 | ||
g=0 | ||
else: | ||
a=float(seq.count("A")/(len(seq)-seq.count("N"))) | ||
t=float(seq.count("T")/(len(seq)-seq.count("N"))) | ||
c=float(seq.count("C")/(len(seq)-seq.count("N"))) | ||
g=float(seq.count("G")/(len(seq)-seq.count("N"))) | ||
|
||
if a!=0: | ||
a1=a * math.log(a,4) | ||
else: | ||
a1=0 | ||
if t!=0: | ||
t1=t * math.log(t,4) | ||
else: | ||
t1=0 | ||
if c!=0: | ||
c1=c * math.log(c,4) | ||
else: | ||
c1=0 | ||
if g!=0: | ||
g1=g * math.log(g,4) | ||
else: | ||
g1=0 | ||
|
||
|
||
entropy_value = -1 * ( a1 + t1 + c1 + g1 ) | ||
|
||
#print(a,t,c,g, entropy_value) | ||
|
||
return entropy_value | ||
|
||
def kmercount(dna_sequence,wordsize): | ||
|
||
wordsizeint=int(wordsize) | ||
kmers=[] | ||
seqlen=float(len(dna_sequence)) | ||
wordsize=float(wordsize) | ||
|
||
|
||
|
||
posscount=float((4**wordsize)*(1-math.exp(-(1/(4**wordsize))*seqlen))) | ||
#print(posscount) | ||
for x in range(len(dna_sequence)-wordsizeint): | ||
|
||
kbit=dna_sequence[x:x+wordsizeint] | ||
if "N" not in kbit: | ||
|
||
#if kbit not in kmers: | ||
#print(x,kbit) | ||
kmers.append(kbit) #also try if to test before set see which is faster: | ||
|
||
#if len(kmers)>=posscount: | ||
#break | ||
kmers=set(kmers) | ||
|
||
kratio=float(len(kmers)/posscount) | ||
#kratio is ratio of kmer count to maximum possilbe count for the sequence length | ||
return (len(kmers),kratio) | ||
|
||
|
||
def shan(i,wordn): | ||
|
||
name1=i.split("/")[-1] | ||
print(name1) | ||
ext1=name1.split(".")[-1] | ||
|
||
if ext1=="gz": | ||
#print("gz detected") | ||
f=gzip.open(i,'rt') | ||
k=name1.split(".")[-2] | ||
name1=name1[:-3] | ||
else: | ||
f=open(i,'r') | ||
k=ext1 | ||
|
||
if k[-1]=="a": | ||
fmt="fasta" | ||
#print("fasta") | ||
if k[-1]=="q": | ||
fmt="fastq" | ||
#print("fastq") | ||
c=0 | ||
|
||
st=0 | ||
outfile=open(outfolder+"/"+name1+".shan",'w') | ||
outfile.write("name\tbp\tShannon's\tkmer_count\tkmer_ratio\n") | ||
for x in SeqIO.parse(f,fmt): | ||
c=c+1 | ||
|
||
shannon = estimate_shannon_entropy(str(x.seq)) | ||
kcount=kmercount(str(x.seq),wordn) | ||
outfile.write(str(x.description)+"\t"+str(len(x.seq))+"\t"+str(shannon)+"\t"+str(kcount[0])+"\t"+str(kcount[1])+"\n") | ||
print(x.id,'len=',len(x.seq),'S=',shannon,'nK=',kcount[0],'rK=',kcount[1]) | ||
st=st+shannon | ||
shanmean=float(st/c) | ||
|
||
print(i,c,"reads",shanmean,"mean Shannons") | ||
|
||
outfile.close() | ||
|
||
|
||
|
||
folder=sys.argv[1] | ||
filelist=glob.glob(folder) | ||
filelist.sort(key=tokenize) | ||
print(filelist) | ||
outfolder = sys.argv[2] #outfolder | ||
wordn=int(sys.argv[3]) | ||
threads=int(sys.argv[4]) | ||
|
||
|
||
if __name__ == '__main__': | ||
|
||
executor = concurrent.futures.ProcessPoolExecutor(threads) | ||
futures = [executor.submit(shan, i,wordn) for i in filelist] | ||
concurrent.futures.wait(futures) | ||
|
||
|
||
|
||
#for i in filelist: | ||
#shan(i,wordn) | ||
|
||
#print(data) | ||
|
||
|
||
|
||
|
||
|