
Commit

changed to the new documentation and changed some stuff around to make it look nicer
hannahn281 committed Mar 16, 2023
1 parent 1621736 commit 2d1d9b9
Showing 4 changed files with 166 additions and 51 deletions.
28 changes: 28 additions & 0 deletions align_test.py
@@ -0,0 +1,28 @@
from Bio import Align

def align_sequence():
aligner = Align.PairwiseAligner(mode='global', match_score=2, mismatch_score=-1)
alignments = aligner.align("GTACACACTC", "ATACAATATAT")
alignment = alignments[0]
counts = alignment.counts() # identities, mismatches, and gaps in one call
print(alignment)
print(counts.identities)
print(counts.mismatches)
print(counts.gaps)

'''
aligner2 = Align.PairwiseAligner()
aligner2.match_score = 0
aligner2.mismatch_score = 1
aligner2.gap_score = 0
alignments = aligner2.align("GTACACACTC", "ATACAATATAT")
alignment = alignments[0]
print(alignment)
print(alignment.score)
'''


if __name__ == "__main__":
align_sequence()
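As a small companion sketch, here is how the counts above fold into a single percent-identity value, mirroring the matches / (matches + mismatches) formula used by percent_similarity in class_orthologs.py below; the helper name percent_identity and the default aligner settings are illustrative additions, not part of the committed test file.

from Bio import Align

def percent_identity(seq1, seq2):
    # Align the two sequences and compute identities / (identities + mismatches),
    # ignoring gap columns, as in class_orthologs.percent_similarity.
    aligner = Align.PairwiseAligner()
    alignment = aligner.align(seq1, seq2)[0]
    counts = alignment.counts()
    return counts.identities / (counts.identities + counts.mismatches)

print(percent_identity("GTACACACTC", "ATACAATATAT"))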
120 changes: 72 additions & 48 deletions class_orthologs.py
@@ -3,9 +3,8 @@
# Date: 9/22/22
# Description: orthologs class

from Bio import Seq, Entrez, SeqIO
from Bio import Seq, Entrez, SeqIO, Align
from Bio.SeqRecord import SeqRecord
from Bio.Blast import NCBIWWW, NCBIXML
import time

class theOrtholog:
@@ -36,7 +35,7 @@ def get_nuc_rec(self):
time.sleep(self.entrez_param["sleep_time"])
break
except:
print("\t\tNCBI exception raised on attempt " + str(i) + "\n\t\treattempting now...")
print("\t\tNCBI exception raised on attempt " + str(i+1) + "\n\t\treattempting now...")
time.sleep(self.entrez_param["sleep_time"])
if i == (self.entrez_param["retry_number"] - 1):
print("\t\tCould not download record after " + str(self.entrez_param["retry_number"]) + " attempts")
@@ -87,6 +86,9 @@ def get_nuc_rec(self):
if genome["p_score"] > max_p_record["p_score"]:
max_p_record = genome

print("Record Name: " + max_p_record['acc'])
print("Original Region: " + str(max_p_record['start']) + ", " + str(max_p_record['stop']))

print("\tReturning gene record for " + self.hit[1] + ". p-score: " +
str(max_p_record["p_score"]))
self.nuc_rec = max_p_record
@@ -109,13 +111,14 @@ def get_nuc_seq(self):
s_start = int(self.nuc_rec['start']) - self.blast_param["base_pairs_to_grab"]
if s_start <= 0:
s_start = 1
s_stop = int(self.nuc_rec['stop'])
s_stop = int(self.nuc_rec['start'])
s_strand = 1
else:
s_stop = int(self.nuc_rec['stop'])
s_start = int(self.nuc_rec['start']) + self.blast_param["base_pairs_to_grab"]
s_stop = int(self.nuc_rec['stop']) + self.blast_param["base_pairs_to_grab"]
s_start = int(self.nuc_rec['stop'])
s_strand = 2

print("Region after going upstream: " + str(s_start) + ", " + str(s_stop))
print("\t\tGetting genbank record...")

# Fetch and read the annotated GenBank record
@@ -128,78 +131,99 @@
genome_record = SeqIO.read(handle, "genbank")
break
except:
print("\t\tNCBI exception raised on attempt " + str(i) + "\n\t\treattempting now...")
print("\t\tNCBI exception raised on attempt " + str(i+1) + "\n\t\treattempting now...")
time.sleep(self.entrez_param["sleep_time"])
if i == (self.entrez_param["retry_number"] - 1):
print("\t\tCould not download record after " + str(self.entrez_param["retry_number"]) + " attempts")

sequence = genome_record.seq

print(genome_record.features)

'''
print("\t\tParsing intervals for coding regions...")
# Find all coding regions in the returned GenBank sequence.
coding_intervals = []
sequence = genome_record[0]['GBSeq_sequence']
for feature in genome_record[0]['GBSeq_feature-table']:
if feature['GBFeature_key'] == 'gene':
if feature['GBFeature_location'].startswith("complement"):
strand = 2
else:
strand = 1
if "GBInterval_from" in feature['GBFeature_intervals'][0]:
coding_start = feature['GBFeature_intervals'][0]['GBInterval_from']
coding_end = feature['GBFeature_intervals'][0]['GBInterval_to']
coding_intervals.append((int(coding_start), int(coding_end), strand))
print(coding_intervals)
#print(genome_record.features)

# setting up return id
return_id = "p_" + str(self.nuc_rec['acc'])

#Setting up the description for the FASTA record
return_description = "Original_Query_Protein " + str(self.hit[0]) + " BLAST_Hit_Accession " + str(self.hit[1])

'''
# If there is only one coding region in the selected sequence, then
# the sequence is returned unmodified.
if len(coding_intervals) == 1:
gb_features = genome_record.features
if len(gb_features) == 1:
# Appends information to record description
print("\t\tOnly one coding regions found. Returning full sequence...")
return SeqRecord(Seq.Seq(sequence), id=return_id, description=return_description)

# If no coding intervals are identified, None is returned.
elif len(coding_intervals) == 0:
elif len(gb_features) == 0:
print("\t\t|~> No coding intervals found for record: " + str(self.nuc_rec['acc']) + ".")
return None

the_end = 0 #for now
upstream_regions = []
upstream_sequences = []
genes = []
for i in range(len(gb_features)):
if gb_features[i].type == 'gene':
genes.append(gb_features[i])

# LATER make sure to implement a way to add 10,000 more when you don't reach the end
for i in range(len(coding_intervals)):
if i > 0:
print("poop ye " + str(coding_intervals[i][0] - coding_intervals[i-1][1]))
if coding_intervals[i][0] - coding_intervals[i-1][1] <= self.blast_param["min_intergenic_distance"]:
if coding_intervals[i][0] - coding_intervals[i-1][1] > 0:
upstream_regions.append((coding_intervals[i-1][1], coding_intervals[i][0]))
elif coding_intervals[i][2] == coding_intervals[i-1][2]:
if coding_intervals[i][1] - coding_intervals[i-1][0] > 0:
upstream_regions.append((coding_intervals[i-1][1], coding_intervals[i][0]))
upstream_regions = []
if self.nuc_rec['strand'] == '+':
# starts at end of list of features since we are going upstream
gene = len(genes) - 1
# LATER make sure to implement a way to add 10,000 more when you don't reach the end
while gene > 0:
print("here")
if len(upstream_regions) == 0:
upstream_regions.append((genes[gene - 1].location.end, genes[gene].location.start))
elif genes[gene].location.strand == genes[gene - 1].location.strand:
if abs(genes[gene].location.start - genes[gene - 1].location.end) <= self.blast_param["min_intergenic_distance"]:
if abs(genes[gene].location.start - genes[gene - 1].location.end) > 0:
upstream_regions.append((genes[gene - 1].location.end, genes[gene].location.start))
else:
break
else:
the_end = coding_intervals[i][0]
break
gene -= 1
else:
# starts at beginning of list of features since we are going upstream on reverse strand
gene = 0
# LATER make sure to implement a way to add 10,000 more when you don't reach the end
while gene < len(genes) - 1:
distance = genes[gene].location.start - genes[gene + 1].location.end
if len(upstream_regions) == 0:
upstream_regions.append((genes[gene + 1].location.end, genes[gene].location.start))
elif genes[gene].location.strand == genes[gene + 1].location.strand:
if distance <= self.blast_param["min_intergenic_distance"]:
print("region: ")
print(genes[gene+1].location.end, genes[gene].location.start)
print("distance: " + str(genes[gene].location.start - genes[gene + 1].location.end))
if (genes[gene].location.start - genes[gene + 1].location.end) > 0:
upstream_regions.append((genes[gene + 1].location.end, genes[gene].location.start))
else:
break
else:
break
gene += 1

print("Upstream Regions to Take: ")
print(upstream_regions)

# get sequences of each upstream region
upstream_sequences = []
for i in range(len(upstream_regions)):
print("poop")
upstream_sequence = sequence[upstream_regions[i][0]:upstream_regions[i][1]]
upstream_sequences.append(SeqRecord(upstream_sequence, id=return_id, description=return_description)) # the slice of a Seq is already a Seq, no re-wrapping needed

print(upstream_sequences)
return upstream_sequences
print(upstream_sequences[0].seq.complement().reverse_complement())
return upstream_sequence

def percent_similarity(self, ortholog2):
# percent_similarity = matches / (matches + mismatches)
aligner = Align.PairwiseAligner()
alignments = aligner.align(self.nuc_seq, ortholog2.nuc_seq)
alignment = alignments[0]

# counts number of matches and mismatches to show percent similarity
matches = alignment.counts().identities
mismatches = alignment.counts().mismatches
per_similarity = (matches / (matches+mismatches))
return per_similarity
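To make the upstream walk in get_nuc_seq easier to follow, here is a standalone sketch of the same idea on the forward strand: collect the gaps between consecutive genes, moving upstream from the last gene, until a gap wider than the intergenic-distance cutoff appears. The gene coordinates and the cutoff are toy values, and the helper is illustrative rather than part of the class.

def upstream_intergenic_regions(genes, min_intergenic_distance):
    # genes: (start, end) coordinate pairs on the forward strand, ordered along the sequence
    regions = []
    for i in range(len(genes) - 1, 0, -1):
        gap_start, gap_end = genes[i - 1][1], genes[i][0]
        if gap_end - gap_start > min_intergenic_distance:
            break  # a large gap ends the upstream walk
        if gap_end - gap_start > 0:
            regions.append((gap_start, gap_end))
    return regions

# three closely spaced genes preceded by a large gap
print(upstream_intergenic_regions([(0, 100), (5000, 5400), (5450, 5600), (5700, 5900)], 150))
# [(5600, 5700), (5400, 5450)]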

28 changes: 25 additions & 3 deletions main.py
@@ -29,16 +29,38 @@
else:
hits = the_input.reg_BLAST_search()
'''
#hits = [({'prot_id': 'WP_011079176'}, 'WP_011079176.1')]
hits = [({'prot_id': 'WP_011079176'}, 'WP_272923659.1')]
#hits = [({'prot_id': 'WP_011079176'}, 'WP_039469417.1')]
'''
hits = [({'prot_id': 'WP_011079176'}, 'WP_053542922.1')]
#hits = [({'prot_id': 'WP_011079176'}, 'WP_053542922.1')]

# treats each hit from the blast search as a potential ortholog and adds to list of orthologs
orthologs = []
for hit in hits:
orthologs.append(theOrtholog(the_input, hit))

# prepares an ortholog dictionary for later
orthologs_dict = {}
#goes through orthologs list to add to ortho dictionary and get records/sequences
for potential_olog in orthologs:
orthologs_dict[potential_olog.hit[1]] = "" # key on the hit accession string, since the full hit tuple holds a dict and is not hashable
potential_olog.get_nuc_rec() #gets nucleotide record for each ortholog
potential_olog.get_nuc_seq() #get nucleotide sequence from the record for each ortholog

# goes through list of orthologs and does popping algorithm that marks sequences that are too similar
for first_ortholog in range(len(orthologs)):
compared_ortholog = first_ortholog+1 # compares first ortholog with ortholog after it
while compared_ortholog < len(orthologs):
# gets percent similarity and marks in dictionary if above the threshold
per_similarity = orthologs[first_ortholog].percent_similarity(orthologs[compared_ortholog])
if per_similarity > 0.8: # marks the compared ortholog for removal when similarity exceeds the threshold (0.8 is an assumed cutoff)
orthologs_dict[orthologs[compared_ortholog].hit[1]] = "DELETE"
compared_ortholog += 1

# goes through ortholog dictionary to finalize which orthologs can stay
for ortholog_key in orthologs_dict:
if orthologs_dict[ortholog_key] == "DELETE":
for ortholog in orthologs:
if (ortholog.hit[1] == ortholog_key):
orthologs.remove(ortholog)
break # stop once the marked ortholog is removed, so the list is not mutated mid-scan
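The marking pass above could also be written as a single filter that keeps an ortholog only when it is not too similar to anything already kept. A rough sketch, reusing percent_similarity and assuming a 0.8 cutoff (an assumed value, not taken from the committed code):

def filter_similar(orthologs, threshold=0.8):
    # Keep an ortholog only if it stays below the similarity threshold against every ortholog kept so far.
    kept = []
    for candidate in orthologs:
        if all(kept_olog.percent_similarity(candidate) <= threshold for kept_olog in kept):
            kept.append(candidate)
    return kept

# usage: orthologs = filter_similar(orthologs)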
41 changes: 41 additions & 0 deletions pop_test.py
@@ -0,0 +1,41 @@
import timeit

if __name__ == "__main__":

my_list = [45, 23, 18, 23, 8, 39, 23, 29, 2, 11, 78, 7,
82, 33, 18, 28, 5, 54, 21, 61, 893, 20, 3]
print(my_list)

start = timeit.default_timer()

for i in range(len(my_list)):
j = i+1
while j < len(my_list):
if abs(my_list[i] - my_list[j]) < 8: # values within 8 of each other count as "too similar"
print(my_list[j], j)
my_list.pop(j) # remove by index so duplicate values earlier in the list are untouched
j -= 1
j += 1

stop = timeit.default_timer()

print('Time: ', stop - start)

print(my_list)

my_list = [20, 12, 43, 6, 78, 30, 14, 23, 9, 30, 29, 10, 40, 62, 73, 4, 2, 35, 14, 22, 0, 10, 21, 80, 90,
79, 54, 2, 34, 26, 56, 87, 3, 20, 65, 34, 1, 98, 75, 57, 45, 23, 18, 23, 8, 39, 23, 29, 2, 11, 78, 7,
82, 33, 18, 28, 5, 54, 21, 61]
print(my_list)

'''
start = timeit.default_timer()
for i in range(len(my_list)):
j = i+1
while j < len(my_list):
if (my_list[i] - my_list[j]) < 8 and (my_list[i] - my_list[j]) > -8:
my_list[j] =
j += 1
'''
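One way the commented-out marking variant above might be finished, using None as a stand-in marker (an assumed choice) and filtering the marked entries out in a single pass at the end:

start = timeit.default_timer()
for i in range(len(my_list)):
    if my_list[i] is None:
        continue  # already marked, skip it
    j = i + 1
    while j < len(my_list):
        if my_list[j] is not None and abs(my_list[i] - my_list[j]) < 8:
            my_list[j] = None  # mark instead of removing, so indices stay stable
        j += 1
stop = timeit.default_timer()

my_list = [value for value in my_list if value is not None]  # drop the marked entries
print('Time: ', stop - start)
print(my_list)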
