Skip to content

Commit

Permalink
Basically just fixing everything to look nicer and making use of SeqI…
Browse files Browse the repository at this point in the history
…O instead of parsing it manually
  • Loading branch information
hannahn281 committed Feb 13, 2023
1 parent 87286ec commit 1621736
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 105 deletions.
98 changes: 44 additions & 54 deletions class_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,112 +39,104 @@ def load_json(self):


def reg_BLAST_search(self):
# check the length of the input_records.

# Make sure there is at least one protein to do BLAST
if len(self.ref_proteins) < 1:
raise Exception("Need at least one protein record to conduct BLAST search.")

# The final list which we will add the BLAST hits to
hits = []

# Gets the accession numbers for all the hits in the BLAST search and adds to hits[] with every unique record.
# Gets protein records of the reference protein to use in future BLAST
for ref_protein in self.ref_proteins:
# Fetches the protein record based off the accession number
print("Getting protein record...")

for i in range(self.entrez_param["retry_number"]):

try:
handle = Entrez.efetch("protein", id=ref_protein["prot_id"], rettype="fasta",
retmode="text")
time.sleep(self.entrez_param["sleep_time"])
break

except:
print("\tNCBI exception raised on attempt " + str(i) + "\n\treattempting now...")
print("\tNCBI exception raised on attempt " + str(i+1) + "\n\treattempting now...")
time.sleep(self.entrez_param["sleep_time"])
if i == (self.entrez_param["retry_number"] - 1):
print("\tCould not download record after " + str(self.entrez_param["retry_number"]) + " attempts")

# Fetches the protein sequence (from record) to be used in the BLAST search
print("Getting protein sequence...\n")
# Gets the protein sequence (from record) to be used in the BLAST search
print("Getting protein sequence from record...\n")
input_seq = (SeqIO.read(handle, "fasta")).seq

# Performs the BLAST using parameters from json file
print("Performing BLAST search: " + str(ref_protein["prot_id"]) + ", this may take a while...")

# Uses taxon parameter in BLAST search if target_clade is given
if self.target_clade is not None:
# Holds the parameter for the target clade
taxon = "txid" + str(self.target_clade) + "[orgn]"

for i in range(self.entrez_param["retry_number"]):

try:
result_handle = NCBIWWW.qblast("blastp", "nr", input_seq,
result_handle = NCBIWWW.qblast(program="blastp", database="nr", sequence=input_seq,
entrez_query=taxon, expect=self.blast_param["e_value"],
hitlist_size=self.blast_param["max_results"])
# Parses the resulting hits as a list
# Reads and parses the resulting hits
print("\tGetting records")
blast_records = list(NCBIXML.parse(result_handle))
blast_records = NCBIXML.read(result_handle)
time.sleep(self.entrez_param["sleep_time"])
break

except:
print("\tNCBI exception raised on attempt " + str(i) + "\n\treattempting now...")
print("\tNCBI exception raised on attempt " + str(i+1) + "\n\treattempting now...")
time.sleep(self.entrez_param["sleep_time"])
if i == (self.entrez_param["retry_number"] - 1):
print("\tCould not download record after " + str(self.entrez_param["retry_number"]) + " attempts")

# if taxon is not given, BLAST search is done without taxon
else:

for i in range(self.entrez_param["retry_number"]):

try:
result_handle = NCBIWWW.qblast(program="blastp", database=self.blast_param["database"],
sequence=input_seq, expect=self.blast_param["e_value"],
hitlist_size=self.blast_param["max_results"])
time.sleep(self.entrez_param["sleep_time"])
# Parses the resulting hits as a list
# Reads and parses the resulting hits
print("\tGetting records")
blast_records = list(NCBIXML.parse(result_handle))
blast_records = NCBIXML.read(result_handle)
break

except:
print("\tNCBI exception raised on attempt " + str(i) + "\n\t\treattempting now...")
print("\tNCBI exception raised on attempt " + str(i+1) + "\n\t\treattempting now...")
time.sleep(self.entrez_param["sleep_time"])
if i == (self.entrez_param["retry_number"] - 1):
print("\tCould not download record after " + str(self.entrez_param["retry_number"]) + " attempts")

# Creates a list of hits to add BLAST results to
hits = []
# Adds each unique accession number to hits[]
for record in blast_records[0].alignments:
curr_hit_rec = record.hit_id.split('|')[-2]
for record in blast_records.alignments:
hit_rec = record.hit_id.split('|')[-2]
print("\t\tAnalyzing hit " + str(curr_hit_rec))

for hit in record.hsps:

# Checks if hit meets the minimum coverage if provided
if self.blast_param["query_coverage"]:
cov = (hit.query_end - hit.query_start + 1) / (len(input_seq))

if (cov >= self.blast_param["query_coverage"]):

# Check if the hit is already in the return list
# Adds hit only if its not already in return list
if len(hits) == 0:
print("\t\tAdding first hit (Coverage = " + str(cov) + "): " + str(curr_hit_rec))
hits.append((ref_protein, curr_hit_rec))

elif (not (curr_hit_rec in list(zip(*hits))[1])):
print("\t\tAdding hit (Coverage = " + str(cov) + "): " + str(curr_hit_rec))
hits.append((ref_protein, curr_hit_rec))

print("\t\tAdding first hit (Coverage = " + str(cov) + "): " + str(hit_rec))
hits.append((ref_protein, hit_rec))
elif ((ref_protein, hit_rec) not in hits):
print("\t\tAdding hit (Coverage = " + str(cov) + "): " + str(hit_rec))
hits.append((ref_protein, hit_rec))
# Prints error if the minimum coverage is not met
else:
print("\t\tHit did not meet coverage (Coverage = " + str(cov) + ") requirement: " + str(
curr_hit_rec))
print("\t\tHit did not meet coverage (Coverage = " + str(cov) + ") requirement: " + str(hit_rec))

# if no minimum coverage is provided
else:
# Check if the hit is already in the return list
# Adds hit only if its not already in return list
if len(hits) == 0:
print("\t\tAdding first hit: " + str(curr_hit_rec))
hits.append((ref_protein, curr_hit_rec))

elif (not (curr_hit_rec in list(zip(*hits))[1])):
print("\t\tAdding hit: " + str(curr_hit_rec))
hits.append((ref_protein, curr_hit_rec))
print("\t\tAdding first hit: " + str(hit_rec))
hits.append((ref_protein, hit_rec))
elif ((ref_protein, hit_rec) not in hits):
print("\t\tAdding hit (Coverage = " + str(cov) + "): " + str(hit_rec))
hits.append((ref_protein, hit_rec))

print("\tReturning " + str(len(hits)) + " unique hits\n")
return hits
Expand All @@ -153,10 +145,8 @@ def reg_BLAST_search(self):
def clustered_BLAST_search(self):
save_database = self.blast_param["database"]
self.blast_param["database"] = "clustered_nr"

hits = self.reg_BLAST_search()
self.blast_param["database"] = save_database

return hits


Expand All @@ -170,26 +160,26 @@ def hierarchical_BLAST_search(self):
desired_descendants = []

# getting all descendants under taxa and putting them into list
for i in range(len(descendants)):
for descendant in descendants:
rank = ""
rank_dict = NCBITaxa().get_rank([descendants[i]])
rank_dict = NCBITaxa().get_rank([descendant])

for key in rank_dict:
rank = rank_dict[key]

if str(rank) == (self.blast_param["hierarchical_taxon_level"]):
desired_descendants.append(descendants[i])
desired_descendants.append(descendant)

# saving original parameters of the search
target_clade = self.target_clade

# BLAST search for each group in the desired descendants
original_hits = []
for i in range(len(desired_descendants)):
self.target_clade = desired_descendants[i]
for descendant in (desired_descendants):
self.target_clade = descendant
original_hits.append(self.reg_BLAST_search())

# make the target clade back to what it was originally (before search)
# make the target clade back to what it was originally before BLAST search
self.target_clade = target_clade

# deleting duplicates from list
Expand Down
49 changes: 22 additions & 27 deletions class_orthologs.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# File Name: class_input.py
# Author: Hannah Nguyen
# Date: 9/22/22
# Description: Input class , in information from json file
# Description: orthologs class

from Bio import Seq, Entrez, SeqIO
from Bio.SeqRecord import SeqRecord
Expand All @@ -26,20 +26,18 @@ def __init__(self, the_input, hit):

def get_nuc_rec(self):
print("Getting IPG nucelotide records for " + str(self.hit[1]) + "...")
# Fetch the protein record
print("\tFetching the protein records...")

# Fetches protein records for each hit
for i in range(self.entrez_param["retry_number"]):

try:
records = Entrez.read(Entrez.efetch(db="protein", id=self.hit[1], rettype="ipg",
retmode="xml"))
handle = Entrez.efetch(db="protein", id=self.hit[1], rettype="ipg",
retmode="xml")
records = Entrez.read(handle)
time.sleep(self.entrez_param["sleep_time"])
break

except:
print("\t\tNCBI exception raised on attempt " + str(i) + "\n\t\treattempting now...")

time.sleep(self.entrez_param["sleep_time"])
if i == (self.entrez_param["retry_number"] - 1):
print("\t\tCould not download record after " + str(self.entrez_param["retry_number"]) + " attempts")
print("\tNo gene records found for " + str(self.hit[1]))
Expand All @@ -57,7 +55,6 @@ def get_nuc_rec(self):

# Gene records are appended to this list
genome_list = []

print("\tRecording the gene records with priorities...")
if 'ProteinList' in records['IPGReport'].keys():
for idprotein in records['IPGReport']['ProteinList']:
Expand Down Expand Up @@ -102,11 +99,11 @@ def get_nuc_rec(self):

def get_nuc_seq(self):
if self.nuc_rec is None:
print(self.nuc_rec)
print("\tRecord is not valid")
return None

print("Get nucleotide sequence for " + str(self.nuc_rec['acc']) + "...")

# grabs upstream region some parameter base pairs away from start to check for operons
if self.nuc_rec['strand'] == '+':
s_start = int(self.nuc_rec['start']) - self.blast_param["base_pairs_to_grab"]
Expand All @@ -119,30 +116,28 @@ def get_nuc_seq(self):
s_start = int(self.nuc_rec['start']) + self.blast_param["base_pairs_to_grab"]
s_strand = 2


print("start " + str(s_start))
print("stop " + str(s_stop))

print("\t\tGetting genbank record...")

# Fetch and read the annotated GenBank record
for i in range(self.entrez_param["retry_number"]):
try:
handle = Entrez.efetch(db="nuccore", id=self.nuc_rec['acc'], strand=s_strand,
seq_start=s_start, seq_stop=s_stop,
rettype='gbwithparts', retmode="xml")

genome_record = Entrez.read(handle, "xml")
rettype='gbwithparts', retmode="text")
time.sleep(self.entrez_param["sleep_time"])
genome_record = SeqIO.read(handle, "genbank")
break

except:

print("\t\tNCBI exception raised on attempt " + str(i) + "\n\t\treattempting now...")

time.sleep(self.entrez_param["sleep_time"])
if i == (self.entrez_param["retry_number"] - 1):
print("\t\tCould not download record after " + str(self.entrez_param["retry_number"]) + " attempts")

sequence = genome_record.seq

print(genome_record.features)

'''
print("\t\tParsing intervals for coding regions...")
# Find all coding regions in the returned GenBank sequence.
coding_intervals = []
Expand All @@ -168,7 +163,7 @@ def get_nuc_seq(self):
#Setting up the description for the FASTA record
return_description = "Original_Query_Protein " + str(self.hit[0]) + " BLAST_Hit_Accession " + str(self.hit[1])

'''
# If there is only one coding region in the selected sequence, then
# the sequence is returned unmodified.
if len(coding_intervals) == 1:
Expand All @@ -188,13 +183,13 @@ def get_nuc_seq(self):
# LATER make sure to implement a way to add 10,000 more when you dont reach the end
for i in range(len(coding_intervals)):
if i > 0:
if coding_intervals[i][1] - coding_intervals[i-1][0] <= self.blast_param["min_intergenic_distance"]:
print("poop ye " + str(coding_intervals[i-1][1] - coding_intervals[i][0]))
if coding_intervals[i-1][1] - coding_intervals[i][0] > 0:
print("poop ye " + str(coding_intervals[i][0] - coding_intervals[i-1][1]))
if coding_intervals[i][0] - coding_intervals[i-1][1] <= self.blast_param["min_intergenic_distance"]:
if coding_intervals[i][0] - coding_intervals[i-1][1] > 0:
upstream_regions.append((coding_intervals[i-1][1], coding_intervals[i][0]))
elif coding_intervals[i][2] == coding_intervals[i-1][2]:
if coding_intervals[i][1] - coding_intervals[i-1][0] > 0:
upstream_regions.append((coding_intervals[i-1][1], coding_intervals[i][0]))
elif coding_intervals[i-1][2] == coding_intervals[i][2]:
if coding_intervals[i-1][1] - coding_intervals[i][0] > 0:
upstream_regions.append((coding_intervals[i - 1][1], coding_intervals[i][0]))
else:
the_end = coding_intervals[i][0]
break
Expand Down
10 changes: 5 additions & 5 deletions input.json
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
{
"reference_proteins" : [{"prot_id" : "WP_074541780.1", "sequence" : "mnedeiyrhi"}],
"target_clade" : 91347,
"reference_proteins" : [{"prot_id" : "WP_011079176"}],
"target_clade" : 135623,
"BLAST_parameters" : {
"e_value" : 10e-05,
"query_coverage" : 0.75,
"database" : "nr",
"max_results" : 200,
"min_intergenic_distance" : 150,
"max_results" : 10,
"min_intergenic_distance" : 100,
"base_pairs_to_grab" : 10000,
"hierarchical_taxon_level" : "None"

},
"entrez_parameters" : {
"email" : "[email protected]",
Expand Down
26 changes: 7 additions & 19 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,36 +21,24 @@
# loads json file
the_input.load_json()

'''
# do blast search (either regular, clustered, or hierarchical)
hits = []
'''
if the_input.blast_param["hierarchical_taxon_level"] != "None":
hits = the_input.hierarchical_BLAST_search()
else:
hits = the_input.reg_BLAST_search()
'''
#hits = [({'prot_id': 'WP_011079176'}, 'WP_011079176.1')]

#hits = [({'prot_id': 'WP_011079176'}, 'WP_039469417.1')]

'''
hits = [({'prot_id': 'WP_011079176'}, 'WP_053542922.1')]

print(hits)

# treats each hit from the blast search as a potential ortholog and adds to list of orthologs
orthologs = []
for i in range(len(hits)):
orthologs.append(theOrtholog(the_input, hits[i]))
print()

for i in range(len(orthologs)):
orthologs[i].get_nuc_rec() #gets nucleotide record for each ortholog
print(orthologs[i].nuc_rec)
orthologs[i].get_nuc_seq() #get nucleotide sequence from the record for each ortholog





for hit in hits:
orthologs.append(theOrtholog(the_input, hit))

for potential_olog in orthologs:
potential_olog.get_nuc_rec() #gets nucleotide record for each ortholog
potential_olog.get_nuc_seq() #get nucleotide sequence from the record for each ortholog

0 comments on commit 1621736

Please sign in to comment.