Basically just fixing everything to look nicer and making use of SeqI…

…O instead of parsing it manually
ErillLab · Feb 13, 2023 · 1621736 · 1621736
1 parent 87286ec
commit 1621736
Show file tree

Hide file tree

Showing 4 changed files with 78 additions and 105 deletions.
diff --git a/class_input.py b/class_input.py
@@ -39,112 +39,104 @@ def load_json(self):
 
 
     def reg_BLAST_search(self):
-        # check the length of the input_records.
+
+        # Make sure there is at least one protein to do BLAST
         if len(self.ref_proteins) < 1:
             raise Exception("Need at least one protein record to conduct BLAST search.")
 
-        # The final list which we will add the BLAST hits to
-        hits = []
-
-        # Gets the accession numbers for all the hits in the BLAST search and adds to hits[] with every unique record.
+        # Gets protein records of the reference protein to use in future BLAST
         for ref_protein in self.ref_proteins:
             # Fetches the protein record based off the accession number
             print("Getting protein record...")
-
             for i in range(self.entrez_param["retry_number"]):
-
                 try:
                     handle = Entrez.efetch("protein", id=ref_protein["prot_id"], rettype="fasta",
                                            retmode="text")
                     time.sleep(self.entrez_param["sleep_time"])
                     break
-
                 except:
-                    print("\tNCBI exception raised on attempt " + str(i) + "\n\treattempting now...")
+                    print("\tNCBI exception raised on attempt " + str(i+1) + "\n\treattempting now...")
+                    time.sleep(self.entrez_param["sleep_time"])
                     if i == (self.entrez_param["retry_number"] - 1):
                         print("\tCould not download record after " + str(self.entrez_param["retry_number"]) + " attempts")
 
-            # Fetches the protein sequence (from record) to be used in the BLAST search
-            print("Getting protein sequence...\n")
+            # Gets the protein sequence (from record) to be used in the BLAST search
+            print("Getting protein sequence from record...\n")
             input_seq = (SeqIO.read(handle, "fasta")).seq
 
             # Performs the BLAST using parameters from json file
             print("Performing BLAST search: " + str(ref_protein["prot_id"]) + ", this may take a while...")
+
+            # Uses taxon parameter in BLAST search if target_clade is given
             if self.target_clade is not None:
                 # Holds the parameter for the target clade
                 taxon = "txid" + str(self.target_clade) + "[orgn]"
-
                 for i in range(self.entrez_param["retry_number"]):
-
                     try:
-                        result_handle = NCBIWWW.qblast("blastp", "nr", input_seq,
+                        result_handle = NCBIWWW.qblast(program="blastp", database="nr", sequence=input_seq,
                                                        entrez_query=taxon, expect=self.blast_param["e_value"],
                                                        hitlist_size=self.blast_param["max_results"])
-                        # Parses the resulting hits as a list
+                        # Reads and parses the resulting hits
                         print("\tGetting records")
-                        blast_records = list(NCBIXML.parse(result_handle))
+                        blast_records = NCBIXML.read(result_handle)
                         time.sleep(self.entrez_param["sleep_time"])
                         break
-
                     except:
-                        print("\tNCBI exception raised on attempt " + str(i) + "\n\treattempting now...")
+                        print("\tNCBI exception raised on attempt " + str(i+1) + "\n\treattempting now...")
+                        time.sleep(self.entrez_param["sleep_time"])
                         if i == (self.entrez_param["retry_number"] - 1):
                             print("\tCould not download record after " + str(self.entrez_param["retry_number"]) + " attempts")
 
+            # if taxon is not given, BLAST search is done without taxon
             else:
-
                 for i in range(self.entrez_param["retry_number"]):
-
                     try:
                         result_handle = NCBIWWW.qblast(program="blastp", database=self.blast_param["database"],
                                                        sequence=input_seq, expect=self.blast_param["e_value"],
                                                        hitlist_size=self.blast_param["max_results"])
                         time.sleep(self.entrez_param["sleep_time"])
-                        # Parses the resulting hits as a list
+                        # Reads and parses the resulting hits
                         print("\tGetting records")
-                        blast_records = list(NCBIXML.parse(result_handle))
+                        blast_records = NCBIXML.read(result_handle)
                         break
-
                     except:
-                        print("\tNCBI exception raised on attempt " + str(i) + "\n\t\treattempting now...")
+                        print("\tNCBI exception raised on attempt " + str(i+1) + "\n\t\treattempting now...")
+                        time.sleep(self.entrez_param["sleep_time"])
                         if i == (self.entrez_param["retry_number"] - 1):
                             print("\tCould not download record after " + str(self.entrez_param["retry_number"]) + " attempts")
 
+            # Creates a list of hits to add BLAST results to
+            hits = []
             # Adds each unique accession number to hits[]
-            for record in blast_records[0].alignments:
-                curr_hit_rec = record.hit_id.split('|')[-2]
+            for record in blast_records.alignments:
+                hit_rec = record.hit_id.split('|')[-2]
                 print("\t\tAnalyzing hit " + str(curr_hit_rec))
 
                 for hit in record.hsps:
-
                     # Checks if hit meets the minimum coverage if provided
                     if self.blast_param["query_coverage"]:
                         cov = (hit.query_end - hit.query_start + 1) / (len(input_seq))
-
                         if (cov >= self.blast_param["query_coverage"]):
-
-                            # Check if the hit is already in the return list
+                            # Adds hit only if its not already in return list
                             if len(hits) == 0:
-                                print("\t\tAdding first hit (Coverage = " + str(cov) + "): " + str(curr_hit_rec))
-                                hits.append((ref_protein, curr_hit_rec))
-
-                            elif (not (curr_hit_rec in list(zip(*hits))[1])):
-                                print("\t\tAdding hit (Coverage = " + str(cov) + "): " + str(curr_hit_rec))
-                                hits.append((ref_protein, curr_hit_rec))
-
+                                print("\t\tAdding first hit (Coverage = " + str(cov) + "): " + str(hit_rec))
+                                hits.append((ref_protein, hit_rec))
+                            elif ((ref_protein, hit_rec) not in hits):
+                                print("\t\tAdding hit (Coverage = " + str(cov) + "): " + str(hit_rec))
+                                hits.append((ref_protein, hit_rec))
                         # Prints error if the minimum coverage is not met
                         else:
-                            print("\t\tHit did not meet coverage (Coverage = " + str(cov) + ") requirement: " + str(
-                                curr_hit_rec))
+                            print("\t\tHit did not meet coverage (Coverage = " + str(cov) + ") requirement: " + str(hit_rec))
+
+                    # if no minimum coverage is provided
                     else:
-                        # Check if the hit is already in the return list
+                        # Adds hit only if its not already in return list
                         if len(hits) == 0:
-                            print("\t\tAdding first hit: " + str(curr_hit_rec))
-                            hits.append((ref_protein, curr_hit_rec))
-
-                        elif (not (curr_hit_rec in list(zip(*hits))[1])):
-                            print("\t\tAdding hit: " + str(curr_hit_rec))
-                            hits.append((ref_protein, curr_hit_rec))
+                            print("\t\tAdding first hit: " + str(hit_rec))
+                            hits.append((ref_protein, hit_rec))
+                        elif ((ref_protein, hit_rec) not in hits):
+                            print("\t\tAdding hit (Coverage = " + str(cov) + "): " + str(hit_rec))
+                            hits.append((ref_protein, hit_rec))
 
         print("\tReturning " + str(len(hits)) + " unique hits\n")
         return hits
@@ -153,10 +145,8 @@ def reg_BLAST_search(self):
     def clustered_BLAST_search(self):
         save_database = self.blast_param["database"]
         self.blast_param["database"] = "clustered_nr"
-
         hits = self.reg_BLAST_search()
         self.blast_param["database"] = save_database
-
         return hits
 
 
@@ -170,26 +160,26 @@ def hierarchical_BLAST_search(self):
         desired_descendants = []
 
         # getting all descendants under taxa and putting them into list
-        for i in range(len(descendants)):
+        for descendant in descendants:
             rank = ""
-            rank_dict = NCBITaxa().get_rank([descendants[i]])
+            rank_dict = NCBITaxa().get_rank([descendant])
 
             for key in rank_dict:
                 rank = rank_dict[key]
 
             if str(rank) == (self.blast_param["hierarchical_taxon_level"]):
-                desired_descendants.append(descendants[i])
+                desired_descendants.append(descendant)
 
         # saving original parameters of the search
         target_clade = self.target_clade
 
         # BLAST search for each group in the desired descendants
         original_hits = []
-        for i in range(len(desired_descendants)):
-            self.target_clade = desired_descendants[i]
+        for descendant in (desired_descendants):
+            self.target_clade = descendant
             original_hits.append(self.reg_BLAST_search())
 
-        # make the target clade back to what it was originally (before search)
+        # make the target clade back to what it was originally before BLAST search
         self.target_clade = target_clade
 
         # deleting duplicates from list

diff --git a/class_orthologs.py b/class_orthologs.py
@@ -1,7 +1,7 @@
 # File Name: class_input.py
 # Author: Hannah Nguyen
 # Date: 9/22/22
-# Description: Input class  , in information from json file
+# Description: orthologs class
 
 from Bio import Seq, Entrez, SeqIO
 from Bio.SeqRecord import SeqRecord
@@ -26,20 +26,18 @@ def __init__(self, the_input, hit):
 
     def get_nuc_rec(self):
         print("Getting IPG nucelotide records for " + str(self.hit[1]) + "...")
-        # Fetch the protein record
         print("\tFetching the protein records...")
-
+        # Fetches protein records for each hit
         for i in range(self.entrez_param["retry_number"]):
-
             try:
-                records = Entrez.read(Entrez.efetch(db="protein", id=self.hit[1], rettype="ipg",
-                                                    retmode="xml"))
+                handle = Entrez.efetch(db="protein", id=self.hit[1], rettype="ipg",
+                                                    retmode="xml")
+                records = Entrez.read(handle)
                 time.sleep(self.entrez_param["sleep_time"])
                 break
-
             except:
                 print("\t\tNCBI exception raised on attempt " + str(i) + "\n\t\treattempting now...")
-
+                time.sleep(self.entrez_param["sleep_time"])
                 if i == (self.entrez_param["retry_number"] - 1):
                     print("\t\tCould not download record after " + str(self.entrez_param["retry_number"]) + " attempts")
                     print("\tNo gene records found for " + str(self.hit[1]))
@@ -57,7 +55,6 @@ def get_nuc_rec(self):
 
         # Gene records are appended to this list
         genome_list = []
-
         print("\tRecording the gene records with priorities...")
         if 'ProteinList' in records['IPGReport'].keys():
             for idprotein in records['IPGReport']['ProteinList']:
@@ -102,11 +99,11 @@ def get_nuc_rec(self):
 
     def get_nuc_seq(self):
         if self.nuc_rec is None:
+            print(self.nuc_rec)
             print("\tRecord is not valid")
             return None
 
         print("Get nucleotide sequence for " + str(self.nuc_rec['acc']) + "...")
-
         # grabs upstream region some parameter base pairs away from start to check for operons
         if self.nuc_rec['strand'] == '+':
             s_start = int(self.nuc_rec['start']) - self.blast_param["base_pairs_to_grab"]
@@ -119,30 +116,28 @@ def get_nuc_seq(self):
             s_start = int(self.nuc_rec['start']) + self.blast_param["base_pairs_to_grab"]
             s_strand = 2
 
-
-        print("start " + str(s_start))
-        print("stop " + str(s_stop))
-
         print("\t\tGetting genbank record...")
 
         # Fetch and read the annotated GenBank record
         for i in range(self.entrez_param["retry_number"]):
             try:
                 handle = Entrez.efetch(db="nuccore", id=self.nuc_rec['acc'], strand=s_strand,
                                        seq_start=s_start, seq_stop=s_stop,
-                                       rettype='gbwithparts', retmode="xml")
-
-                genome_record = Entrez.read(handle, "xml")
+                                       rettype='gbwithparts', retmode="text")
                 time.sleep(self.entrez_param["sleep_time"])
+                genome_record = SeqIO.read(handle, "genbank")
                 break
-
             except:
-
                 print("\t\tNCBI exception raised on attempt " + str(i) + "\n\t\treattempting now...")
-
+                time.sleep(self.entrez_param["sleep_time"])
                 if i == (self.entrez_param["retry_number"] - 1):
                     print("\t\tCould not download record after " + str(self.entrez_param["retry_number"]) + " attempts")
 
+        sequence = genome_record.seq
+
+        print(genome_record.features)
+
+'''
         print("\t\tParsing intervals for coding regions...")
         # Find all coding regions in the returned GenBank sequence.
         coding_intervals = []
@@ -168,7 +163,7 @@ def get_nuc_seq(self):
         #Setting up the description for the FASTA record
         return_description = "Original_Query_Protein " + str(self.hit[0]) + " BLAST_Hit_Accession " + str(self.hit[1])
 
-
+'''
         # If there is only one coding region in the selected sequence, then
         # the sequence is returned unmodified.
         if len(coding_intervals) == 1:
@@ -188,13 +183,13 @@ def get_nuc_seq(self):
         # LATER make sure to implement a way to add 10,000 more when you dont reach the end
         for i in range(len(coding_intervals)):
             if i > 0:
-                if coding_intervals[i][1] - coding_intervals[i-1][0] <= self.blast_param["min_intergenic_distance"]:
-                    print("poop ye " + str(coding_intervals[i-1][1] - coding_intervals[i][0]))
-                    if coding_intervals[i-1][1] - coding_intervals[i][0] > 0:
+                print("poop ye " + str(coding_intervals[i][0] - coding_intervals[i-1][1]))
+                if coding_intervals[i][0] - coding_intervals[i-1][1] <= self.blast_param["min_intergenic_distance"]:
+                    if coding_intervals[i][0] - coding_intervals[i-1][1] > 0:
+                        upstream_regions.append((coding_intervals[i-1][1], coding_intervals[i][0]))
+                elif coding_intervals[i][2] == coding_intervals[i-1][2]:
+                    if coding_intervals[i][1] - coding_intervals[i-1][0] > 0:
                         upstream_regions.append((coding_intervals[i-1][1], coding_intervals[i][0]))
-                elif coding_intervals[i-1][2] == coding_intervals[i][2]:
-                    if coding_intervals[i-1][1] - coding_intervals[i][0] > 0:
-                        upstream_regions.append((coding_intervals[i - 1][1], coding_intervals[i][0]))
                 else:
                     the_end = coding_intervals[i][0]
                     break

diff --git a/input.json b/input.json
@@ -1,14 +1,14 @@
 {
-	"reference_proteins" : [{"prot_id" : "WP_074541780.1", "sequence" : "mnedeiyrhi"}],
-	"target_clade" : 91347,
+	"reference_proteins" : [{"prot_id" : "WP_011079176"}],
+	"target_clade" : 135623,
 	"BLAST_parameters" : {
 			"e_value" : 10e-05,
 			"query_coverage" : 0.75,
 			"database" : "nr",
-			"max_results" : 200,
-			"min_intergenic_distance" : 150,
+			"max_results" : 10,
+			"min_intergenic_distance" : 100,
+			"base_pairs_to_grab" : 10000,
 			"hierarchical_taxon_level" : "None"
-
 	},
 	"entrez_parameters" : {
 					"email" : "[email protected]",

diff --git a/main.py b/main.py
@@ -21,36 +21,24 @@
     # loads json file
     the_input.load_json()
 
+    '''
     # do blast search (either regular, clustered, or hierarchical)
     hits = []
-    '''
     if the_input.blast_param["hierarchical_taxon_level"] != "None":
         hits = the_input.hierarchical_BLAST_search()
     else:
         hits = the_input.reg_BLAST_search()
-    '''
 
     #hits = [({'prot_id': 'WP_011079176'}, 'WP_011079176.1')]
-
     #hits = [({'prot_id': 'WP_011079176'}, 'WP_039469417.1')]
-
+    '''
     hits = [({'prot_id': 'WP_011079176'}, 'WP_053542922.1')]
 
-    print(hits)
-
     # treats each hit from the blast search as a potential ortholog and adds to list of orthologs
     orthologs = []
-    for i in range(len(hits)):
-        orthologs.append(theOrtholog(the_input, hits[i]))
-    print()
-
-    for i in range(len(orthologs)):
-        orthologs[i].get_nuc_rec() #gets nucleotide record for each ortholog
-        print(orthologs[i].nuc_rec)
-        orthologs[i].get_nuc_seq() #get nucleotide sequence from the record for each ortholog
-
-
-
-
-
+    for hit in hits:
+        orthologs.append(theOrtholog(the_input, hit))
 
+    for potential_olog in orthologs:
+        potential_olog.get_nuc_rec() #gets nucleotide record for each ortholog
+        potential_olog.get_nuc_seq() #get nucleotide sequence from the record for each ortholog