From 34a2bc425e050b275963129462248b4dda11871d Mon Sep 17 00:00:00 2001
From: pedrogs99 <44211126+pedrogs99@users.noreply.github.com>
Date: Sun, 9 Jan 2022 19:30:40 +0100
Subject: [PATCH] Add files via upload

---
 main.py | 374 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 374 insertions(+)
 create mode 100644 main.py

diff --git a/main.py b/main.py
new file mode 100644
index 0000000..9de4a26
--- /dev/null
+++ b/main.py
@@ -0,0 +1,374 @@
+import os
+from os import listdir
+from math import log2
+import json
+import csv
+
+from ete3 import NCBITaxa
+from Bio import SeqIO
+from Bio import motifs
+from Bio import Entrez
+from tqdm import trange
+
+ncbi = NCBITaxa()
+#ncbi.update_taxonomy_database()
+Entrez.email = 'pedro.garciasu@autonoma.cat'
+
+
+class HitResult:
+    """Container for one motif hit and the annotation of its closest gene."""
+
+    def __init__(self, gen_ID, organism, tf_ID, location, score, dist_to_gene, type, gene):
+        self.tf_ID = tf_ID
+        self.organism = organism
+        self.genome_ID = gen_ID
+        self.score = score
+        self.location = location
+        if gene:
+            if 'locus_tag' in gene.qualifiers:
+                self.gene_locus_tag = gene.qualifiers['locus_tag']
+            elif 'note' in gene.qualifiers:
+                self.gene_locus_tag = gene.qualifiers['note']
+            else:
+                self.gene_locus_tag = 'undefined'
+
+            if 'product' in gene.qualifiers:
+                self.gene_product = gene.qualifiers['product']
+            else:
+                self.gene_product = 'undefined'
+
+            self.gene_start = gene.location.start
+            self.gene_stop = gene.location.end
+            self.gene_strand = gene.strand
+            self.dist_to_gene = dist_to_gene
+            self.type = type
+        else:
+            self.gene_locus_tag = 'undefined'
+            self.gene_product = 'undefined'
+            self.gene_start = 'undefined'
+            self.gene_stop = 'undefined'
+            self.gene_strand = 'undefined'
+            self.dist_to_gene = 'undefined'
+            self.type = 'undefined'
+
+
+def readJSON(filename):
+    #--------Read the run settings from the JSON configuration file-----------
+    print(filename)
+    with open(filename) as f:
+        json_obj = json.load(f)
+    out_path = json_obj['Output_path']
+    gnome_path = json_obj['genomes_path']
+    motifs_path = json_obj['motifs_path']
+    motifs_file = json_obj['TF_filename']
+    gen_file = json_obj['Genome_filename']
+    clade = json_obj['clade']
+    up_dist = json_obj['upstream_dist']
+    down_dist = json_obj['downstream_dist']
+    pseudo = json_obj['pseudocounts']
+    return out_path, motifs_path, gnome_path, motifs_file, gen_file, clade, up_dist, down_dist, pseudo
+
+
+def readTF(filename, pseudo):
+    motifs_parsed = SeqIO.parse(filename, "fasta")
+
+    #------Convert the aligned binding sites into a motif object---------
+    list_seq = []
+    for record in motifs_parsed:
+        list_seq.append(record.seq.upper())
+    motif = motifs.create(list_seq)
+
+    #------PSSM: weight matrix with pseudocounts, log-odds scores and Patser threshold----------
+    pwm = motif.counts.normalize(pseudocounts=pseudo)
+    pssm = pwm.log_odds()
+    reverse_pssm = pssm.reverse_complement()
+    distribution = pssm.distribution(background=None, precision=10**4)
+    threshold = distribution.threshold_patser()
+
+    return motif, threshold, pssm, reverse_pssm
+
+
+def readGB(filename):
+    gb_object = SeqIO.read(filename, 'genbank')
+    seq = gb_object.seq
+    organism = gb_object.annotations['organism']
+    gen_ID = gb_object.name
+
+    return gb_object, seq, organism, gen_ID
+
+
+def passThreshold(seq, motif, pssm, reverse_pssm, threshold):
+    hits = []
+    scores = {}
+
+    #---------Keep the positions whose combined forward/reverse score passes the threshold----------
+    for i in trange(len(seq) - len(motif), desc='Finding Hits'):
+        site = seq[i:i + len(motif)]
+        #combine both strands: log2(2**forward_score + 2**reverse_score)
+        score = log2(2**pssm.calculate(site) + 2**reverse_pssm.calculate(site))
+        if score >= threshold:
+            hits.append(i)
+            scores[i] = score
+
+    return hits, scores
+
+
+def check_overlap(hits, motif, scores):
+    #---------Two hits closer than half a motif length overlap: keep the better-scoring one----------
+    if hits:
+        to_delete = []
+        for i in range(len(hits) - 1):
+            if (hits[i] + len(motif) / 2) >= hits[i + 1]:
+                if scores[hits[i]] >= scores[hits[i + 1]]:
+                    to_delete.append(i + 1)
+                else:
+                    to_delete.append(i)
+        for i in sorted(set(to_delete), reverse=True):
+            del scores[hits[i]]
+            del hits[i]
+
+    return hits, scores
+
+
+def dist_classify(hits, seq, gb_object, up_dist, down_dist):
+    dist_to_genes = {}
+    gene_locations = {}
+    genes = {}
+    classification = {}
+
+    for pos in hits:
+        dist_final = len(seq)
+
+        #-----Signed distance to the closest CDS: measured from the gene start on the + strand, from the gene end on the - strand----------
+        for f in gb_object.features:
+            if f.type == 'CDS':
+                if f.strand == 1:
+                    dist1 = pos - f.location.start
+                elif f.strand == -1:
+                    dist1 = pos - f.location.end
+                else:
+                    continue
+                dist_aux = abs(dist1)
+                if dist_aux < abs(dist_final):
+                    dist_final = dist1
+                    genes[pos] = f
+                    gene_locations[pos] = f.location
+        dist_to_genes[pos] = dist_final
+
+        #------Classify the hit by its position relative to the closest gene
+        if genes:
+            gene_len = gene_locations[pos].end - gene_locations[pos].start
+            if genes[pos].strand == 1:
+                if dist_to_genes[pos] >= -(up_dist) and dist_to_genes[pos] <= down_dist:
+                    classification[pos] = "OPERATOR"
+                elif dist_to_genes[pos] > down_dist and dist_to_genes[pos] <= gene_len:
+                    classification[pos] = "INTRAGENETIC"
+                else:
+                    classification[pos] = "INTERGENETIC"
+            elif genes[pos].strand == -1:
+                if dist_to_genes[pos] <= up_dist and dist_to_genes[pos] >= -(down_dist):
+                    classification[pos] = "OPERATOR"
+                elif dist_to_genes[pos] < -(down_dist) and dist_to_genes[pos] >= -gene_len:
+                    classification[pos] = "INTRAGENETIC"
+                else:
+                    classification[pos] = "INTERGENETIC"
+
+    return dist_to_genes, genes, classification
+
+
+def save_to_csv(results, path):
+    header = [
+        "Motif",
+        "Genome",
+        "Genome ID",
+        "Score",
+        "Abs. Position",
+        "Gene locus_tag",
+        "Gene Product",
+        "Gene start",
+        "Gene stop",
+        "Gene Strand",
+        "Position relative to gene",
+        "Region"
+    ]
+    filepath = os.path.join(path, "results.csv")
+    with open(filepath, 'w', newline='') as f:
+        writer = csv.DictWriter(f, fieldnames=header, delimiter=',')
+        writer.writeheader()
+        for genome_dict in results.values():
+            for hit_list in genome_dict.values():
+                for hit in hit_list:
+                    row = {
+                        'Motif': hit.tf_ID,
+                        'Genome': hit.organism,
+                        'Genome ID': hit.genome_ID,
+                        'Score': hit.score,
+                        'Abs. Position': hit.location,
+                        'Gene locus_tag': hit.gene_locus_tag,
+                        'Gene Product': hit.gene_product,
+                        'Gene start': hit.gene_start,
+                        'Gene stop': hit.gene_stop,
+                        'Gene Strand': hit.gene_strand,
+                        'Position relative to gene': hit.dist_to_gene,
+                        'Region': hit.type
+                    }
+                    writer.writerow(row)
+
+
+def main():
+    for motif_file in tf_filenames:
+        print('=====MOTIF FILE: ' + motif_file + ' ================')
+        motif_filepath = os.path.join(motifs_path, motif_file)
+        motif, threshold, pssm, reverse_pssm = readTF(motif_filepath, pseudo)
+        result_dic = {}
+        #gen_filenames=["MW546072.gb","KJ206559.gb","JX080301.gb"]
+        i = 0
+        organisms = []
+        for gnome_file in gen_filenames:
+            i = i + 1
+            print('=====GEN FILE: ' + gnome_file + ' ================')
+            print('File ', i, '/', len(gen_filenames))
+            gnome_filepath = os.path.join(gnome_path, gnome_file)
+
+            #--------Read the GenBank file-----------
+            gb_object, seq, organism, gen_ID = readGB(gnome_filepath)
+            if organism not in organisms:
+                organisms.append(organism)
+            print("LENGTH IN BP: ", len(seq))
+
+            #==========PROCESSING================#
+            hits, scores = passThreshold(seq, motif, pssm, reverse_pssm, threshold)
+
+            #---------Check for overlapping hits-------------------
+            hits, scores = check_overlap(hits, motif, scores)
+            print("Hits found: ", len(hits))
+
+            #---------Classify the sites by their position relative to genes---------
+            dist_to_genes, genes, classification = dist_classify(hits, seq, gb_object, up_dist, down_dist)
+
+            #===============RESULTS=====================#
+
+            #----------Save the results as a list of HitResult objects--------------
+            results = []
+            for hit in hits:
+                if genes:
+                    result = HitResult(gen_ID, organism, motif_file, hit, scores[hit], dist_to_genes[hit], classification[hit], genes[hit])
+                else:
+                    result = HitResult(gen_ID, organism, motif_file, hit, scores[hit], dist_to_genes, classification, genes)
+                results.append(result)
+
+            result_dic[gnome_file] = results
+            #if i==10:
+                #break
+
+        result_dic_GLOBAL[motif_file] = result_dic
+        save_to_csv(result_dic_GLOBAL, out_path)
+
+
+def get_genomes():
+
+    #===========Match hosts with descendants of the clade===============
+
+    #get the descendants of the clade down to genus level
+    descendants = ncbi.get_descendant_taxa(clade, intermediate_nodes=True, rank_limit='genus')
+
+    print("===================================================")
+    descendants = ncbi.translate_to_names(descendants)
+
+    for i in range(len(descendants)):
+        if 'Candidatus' in descendants[i]:
+            descendants[i] = descendants[i].replace('Candidatus ', '')
+
+    input_filepath = os.path.join(gnome_path, gen_file)
+
+    #open the tsv with the phage accessions and their hosts
+    with open(input_filepath, newline='') as tsvfile:
+        phages_hosts_df = list(csv.reader(tsvfile, delimiter='\t'))
+
+    #locate the 'Host' column in the header row
+    fields = phages_hosts_df[0]
+    host_column = fields.index('Host')
+
+    print("===================================================")
+    #find descendants in the host column and save the accession of each match
+    accessions = []
+    descriptions = []
+    for row in phages_hosts_df[1:]:
+        if row[host_column] in descendants:
+            accessions.append(row[0])
+            descriptions.append(row[1])
+
+    #===============DOWNLOAD GENOMES============
+
+    dir_contents = listdir(gnome_path)
+    filenames = []
+    for accession in accessions:
+        filenames.append(accession + '.gb')
+        print("Processing: " + accession)
+        if (accession + '.gb') not in dir_contents:
+            print('--> Downloading: ' + accession)
+            net_handle = Entrez.efetch(db='nuccore', id=accession, rettype='gbwithparts', retmode='txt')
+            genome_record = net_handle.read()
+            net_handle.close()
+            with open(os.path.join(gnome_path, accession + ".gb"), "w") as out_handle:
+                out_handle.write(genome_record)
+
+    return filenames
+
+
+def get_motifs():
+    input_filepath = os.path.join(motifs_path, motifs_file)
+    with open(input_filepath, newline='') as csvfile:
+        tf_motifs = list(csv.reader(csvfile, delimiter=','))
+
+    #locate the 'Motif_file' column in the header row
+    fields = tf_motifs[0]
+    filename_column = fields.index('Motif_file')
+
+    #collect the motif filenames, skipping the header row
+    filenames = []
+    for row in tf_motifs[1:]:
+        filenames.append(row[filename_column])
+    return filenames
+
+
+json_filename = 'jsonExample.json'
+out_path, motifs_path, gnome_path, motifs_file, gen_file, clade, up_dist, down_dist, pseudo = readJSON(json_filename)
+gen_filenames = get_genomes()
+tf_filenames = get_motifs()
+
+result_dic_GLOBAL = {}
+print("=====FINDING MATCHES========")
+main()
+print("==========FINISHED======")
\ No newline at end of file
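
Usage note: main.py reads its settings from a jsonExample.json file placed next to the script. Below is a minimal sketch of a hypothetical helper that writes such a file; every path and value is a placeholder chosen for illustration, and only the key names come from readJSON() in the patch above.

    #make_config.py (hypothetical helper, not part of the patch): writes an example jsonExample.json
    import json

    config = {
        "Output_path": "./output",              # directory where results.csv will be written
        "genomes_path": "./genomes",            # directory holding (and receiving) the .gb genome files
        "motifs_path": "./motifs",              # directory holding the motif FASTA files
        "TF_filename": "motifs.csv",            # CSV with a 'Motif_file' column, read by get_motifs()
        "Genome_filename": "phages_hosts.tsv",  # TSV with a 'Host' column, read by get_genomes()
        "clade": "Bacillus",                    # any taxon name resolvable by ete3's NCBITaxa
        "upstream_dist": 150,                   # bp upstream of a gene treated as operator region
        "downstream_dist": 50,                  # bp downstream of a gene treated as operator region
        "pseudocounts": 0.5                     # pseudocounts used when normalizing the motif counts
    }

    with open("jsonExample.json", "w") as handle:
        json.dump(config, handle, indent=4)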