
Commit

changed to the new documentation and changed some stuff around to make it look nicer
hannahn281 committed Mar 16, 2023
1 parent 1621736 commit 2d1d9b9
Showing 4 changed files with 166 additions and 51 deletions.
28 changes: 28 additions & 0 deletions align_test.py
@@ -0,0 +1,28 @@
from Bio import Align

def align_sequence():
aligner = Align.PairwiseAligner(mode='global', match_score=2, mismatch_score=-1)
alignments = aligner.align("GTACACACTC", "ATACAATATAT")
alignment = alignments[0]
counts = alignment.counts() # identities, mismatches, and gaps in one call
print(alignment)
print(counts.identities)
print(counts.mismatches)
print(counts.gaps)

'''
aligner2 = Align.PairwiseAligner()
aligner2.match_score = 0
aligner2.mismatch_score = 1
aligner2.gap_score = 0
alignments = aligner2.align("GTACACACTC", "ATACAATATAT")
alignment = alignments[0]
print(alignment)
print(alignment.score)
'''


if __name__ == "__main__":
align_sequence()
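As a small companion sketch, here is how the counts above fold into a single percent-identity value, mirroring the matches / (matches + mismatches) formula used by percent_similarity in class_orthologs.py below; the helper name percent_identity and the default aligner settings are illustrative additions, not part of the committed test file.

from Bio import Align

def percent_identity(seq1, seq2):
    # Align the two sequences and compute identities / (identities + mismatches),
    # ignoring gap columns, as in class_orthologs.percent_similarity.
    aligner = Align.PairwiseAligner()
    alignment = aligner.align(seq1, seq2)[0]
    counts = alignment.counts()
    return counts.identities / (counts.identities + counts.mismatches)

print(percent_identity("GTACACACTC", "ATACAATATAT"))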
120 changes: 72 additions & 48 deletions class_orthologs.py
@@ -3,9 +3,8 @@
# Date: 9/22/22
# Description: orthologs class

from Bio import Seq, Entrez, SeqIO
from Bio import Seq, Entrez, SeqIO, Align
from Bio.SeqRecord import SeqRecord
from Bio.Blast import NCBIWWW, NCBIXML
import time

class theOrtholog:
@@ -36,7 +35,7 @@ def get_nuc_rec(self):
time.sleep(self.entrez_param["sleep_time"])
break
except:
print("\t\tNCBI exception raised on attempt " + str(i) + "\n\t\treattempting now...")
print("\t\tNCBI exception raised on attempt " + str(i+1) + "\n\t\treattempting now...")
time.sleep(self.entrez_param["sleep_time"])
if i == (self.entrez_param["retry_number"] - 1):
print("\t\tCould not download record after " + str(self.entrez_param["retry_number"]) + " attempts")
@@ -87,6 +86,9 @@ def get_nuc_rec(self):
if genome["p_score"] > max_p_record["p_score"]:
max_p_record = genome

print("Record Name: " + max_p_record['acc'])
print("Original Region: " + str(max_p_record['start']) + ", " + str(max_p_record['stop']))

print("\tReturning gene record for " + self.hit[1] + ". p-score: " +
str(max_p_record["p_score"]))
self.nuc_rec = max_p_record
@@ -109,13 +111,14 @@ def get_nuc_seq(self):
s_start = int(self.nuc_rec['start']) - self.blast_param["base_pairs_to_grab"]
if s_start <= 0:
s_start = 1
s_stop = int(self.nuc_rec['stop'])
s_stop = int(self.nuc_rec['start'])
s_strand = 1
else:
s_stop = int(self.nuc_rec['stop'])
s_start = int(self.nuc_rec['start']) + self.blast_param["base_pairs_to_grab"]
s_stop = int(self.nuc_rec['stop']) + self.blast_param["base_pairs_to_grab"]
s_start = int(self.nuc_rec['stop'])
s_strand = 2

print("Region after going upstream: " + str(s_start) + ", " + str(s_stop))
print("\t\tGetting genbank record...")

# Fetch and read the annotated GenBank record
@@ -128,78 +131,99 @@
genome_record = SeqIO.read(handle, "genbank")
break
except:
print("\t\tNCBI exception raised on attempt " + str(i) + "\n\t\treattempting now...")
print("\t\tNCBI exception raised on attempt " + str(i+1) + "\n\t\treattempting now...")
time.sleep(self.entrez_param["sleep_time"])
if i == (self.entrez_param["retry_number"] - 1):
print("\t\tCould not download record after " + str(self.entrez_param["retry_number"]) + " attempts")

sequence = genome_record.seq

print(genome_record.features)

'''
print("\t\tParsing intervals for coding regions...")
# Find all coding regions in the returned GenBank sequence.
coding_intervals = []
sequence = genome_record[0]['GBSeq_sequence']
for feature in genome_record[0]['GBSeq_feature-table']:
if feature['GBFeature_key'] == 'gene':
if feature['GBFeature_location'].startswith("complement"):
strand = 2
else:
strand = 1
if "GBInterval_from" in feature['GBFeature_intervals'][0]:
coding_start = feature['GBFeature_intervals'][0]['GBInterval_from']
coding_end = feature['GBFeature_intervals'][0]['GBInterval_to']
coding_intervals.append((int(coding_start), int(coding_end), strand))
print(coding_intervals)
#print(genome_record.features)

# setting up return id
return_id = "p_" + str(self.nuc_rec['acc'])

#Setting up the description for the FASTA record
return_description = "Original_Query_Protein " + str(self.hit[0]) + " BLAST_Hit_Accession " + str(self.hit[1])

'''
# If there is only one coding region in the selected sequence, then
# the sequence is returned unmodified.
if len(coding_intervals) == 1:
gb_features = genome_record.features
if len(gb_features) == 1:
# Appends information to record description
print("\t\tOnly one coding regions found. Returning full sequence...")
return SeqRecord(Seq.Seq(sequence), id=return_id, description=return_description)

# If no coding intervals are identified, None is returned.
elif len(coding_intervals) == 0:
elif len(gb_features) == 0:
print("\t\t|~> No coding intervals found for record: " + str(self.nuc_rec['acc']) + ".")
return None

the_end = 0 #for now
upstream_regions = []
upstream_sequences = []
genes = []
for i in range(len(gb_features)):
if gb_features[i].type == 'gene':
genes.append(gb_features[i])

# LATER make sure to implement a way to add 10,000 more when you don't reach the end
for i in range(len(coding_intervals)):
if i > 0:
print("poop ye " + str(coding_intervals[i][0] - coding_intervals[i-1][1]))
if coding_intervals[i][0] - coding_intervals[i-1][1] <= self.blast_param["min_intergenic_distance"]:
if coding_intervals[i][0] - coding_intervals[i-1][1] > 0:
upstream_regions.append((coding_intervals[i-1][1], coding_intervals[i][0]))
elif coding_intervals[i][2] == coding_intervals[i-1][2]:
if coding_intervals[i][1] - coding_intervals[i-1][0] > 0:
upstream_regions.append((coding_intervals[i-1][1], coding_intervals[i][0]))
upstream_regions = []
if self.nuc_rec['strand'] == '+':
# starts at end of list of features since we are going upstream
gene = len(genes) - 1
# LATER make sure to implement a way to add 10,000 more when you don't reach the end
while gene > 0:
print("here")
if len(upstream_regions) == 0:
upstream_regions.append((genes[gene - 1].location.end, genes[gene].location.start))
elif genes[gene].location.strand == genes[gene - 1].location.strand:
if abs(genes[gene].location.start - genes[gene - 1].location.end) <= self.blast_param["min_intergenic_distance"]:
if abs(genes[gene].location.start - genes[gene - 1].location.end) > 0:
upstream_regions.append((genes[gene - 1].location.end, genes[gene].location.start))
else:
break
else:
the_end = coding_intervals[i][0]
break
gene -= 1
else:
# starts at beginning of list of features since we are going upstream on reverse strand
gene = 0
# LATER make sure to implement a way to add 10,000 more when you don't reach the end
while gene < len(genes) - 1:
distance = genes[gene].location.start - genes[gene + 1].location.end
if len(upstream_regions) == 0:
upstream_regions.append((genes[gene + 1].location.end, genes[gene].location.start))
elif genes[gene].location.strand == genes[gene + 1].location.strand:
if distance <= self.blast_param["min_intergenic_distance"]:
print("region: ")
print(genes[gene+1].location.end, genes[gene].location.start)
print("distance: " + str(genes[gene].location.start - genes[gene + 1].location.end))
if (genes[gene].location.start - genes[gene + 1].location.end) > 0:
upstream_regions.append((genes[gene + 1].location.end, genes[gene].location.start))
else:
break
else:
break
gene += 1

print("Upstream Regions to Take: ")
print(upstream_regions)

# get sequences of each upstream region
upstream_sequences = []
for i in range(len(upstream_regions)):
print("poop")
upstream_sequence = sequence[upstream_regions[i][0]:upstream_regions[i][1]]
upstream_sequences.append(SeqRecord(upstream_sequence, id=return_id, description=return_description)) # the slice of a Seq is already a Seq, no re-wrapping needed

print(upstream_sequences)
return upstream_sequences
print(upstream_sequences[0].seq.complement().reverse_complement())
return upstream_sequence

def percent_similarity(self, ortholog2):
# percent_similarity = matches / (matches + mismatches)
aligner = Align.PairwiseAligner()
alignments = aligner.align(self.nuc_seq, ortholog2.nuc_seq)
alignment = alignments[0]

# counts number of matches and mismatches to show percent similarity
matches = alignment.counts().identities
mismatches = alignment.counts().mismatches
per_similarity = (matches / (matches+mismatches))
return per_similarity
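To make the upstream walk in get_nuc_seq easier to follow, here is a standalone sketch of the same idea on the forward strand: collect the gaps between consecutive genes, moving upstream from the last gene, until a gap wider than the intergenic-distance cutoff appears. The gene coordinates and the cutoff are toy values, and the helper is illustrative rather than part of the class.

def upstream_intergenic_regions(genes, min_intergenic_distance):
    # genes: (start, end) coordinate pairs on the forward strand, ordered along the sequence
    regions = []
    for i in range(len(genes) - 1, 0, -1):
        gap_start, gap_end = genes[i - 1][1], genes[i][0]
        if gap_end - gap_start > min_intergenic_distance:
            break  # a large gap ends the upstream walk
        if gap_end - gap_start > 0:
            regions.append((gap_start, gap_end))
    return regions

# three closely spaced genes preceded by a large gap
print(upstream_intergenic_regions([(0, 100), (5000, 5400), (5450, 5600), (5700, 5900)], 150))
# [(5600, 5700), (5400, 5450)]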

28 changes: 25 additions & 3 deletions main.py
@@ -29,16 +29,38 @@
else:
hits = the_input.reg_BLAST_search()
'''
#hits = [({'prot_id': 'WP_011079176'}, 'WP_011079176.1')]
hits = [({'prot_id': 'WP_011079176'}, 'WP_272923659.1')]
#hits = [({'prot_id': 'WP_011079176'}, 'WP_039469417.1')]
'''
hits = [({'prot_id': 'WP_011079176'}, 'WP_053542922.1')]
#hits = [({'prot_id': 'WP_011079176'}, 'WP_053542922.1')]

# treats each hit from the blast search as a potential ortholog and adds to list of orthologs
orthologs = []
for hit in hits:
orthologs.append(theOrtholog(the_input, hit))

# prepares an ortholog dictionary for later
orthologs_dict = {}
#goes through orthologs list to add to ortho dictionary and get records/sequences
for potential_olog in orthologs:
orthologs_dict[potential_olog.hit[1]] = "" # key on the hit accession string, since the full hit tuple holds a dict and is not hashable
potential_olog.get_nuc_rec() #gets nucleotide record for each ortholog
potential_olog.get_nuc_seq() #get nucleotide sequence from the record for each ortholog

# goes through list of orthologs and does popping algorithm that marks sequences that are too similar
for first_ortholog in range(len(orthologs)):
compared_ortholog = first_ortholog+1 # compares first ortholog with ortholog after it
while compared_ortholog < len(orthologs):
# gets percent similarity and marks in dictionary if above the threshold
per_similarity = orthologs[first_ortholog].percent_similarity(orthologs[compared_ortholog])
if per_similarity > 0.8: # marks the compared ortholog for removal when similarity exceeds the threshold (0.8 is an assumed cutoff)
orthologs_dict[orthologs[compared_ortholog].hit[1]] = "DELETE"
compared_ortholog += 1

# goes through ortholog dictionary to finalize which orthologs can stay
for ortholog_key in orthologs_dict:
if orthologs_dict[ortholog_key] == "DELETE":
for ortholog in orthologs:
if (ortholog.hit[1] == ortholog_key):
orthologs.remove(ortholog)
break # stop once the marked ortholog is removed, so the list is not mutated mid-scan
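The marking pass above could also be written as a single filter that keeps an ortholog only when it is not too similar to anything already kept. A rough sketch, reusing percent_similarity and assuming a 0.8 cutoff (an assumed value, not taken from the committed code):

def filter_similar(orthologs, threshold=0.8):
    # Keep an ortholog only if it stays below the similarity threshold against every ortholog kept so far.
    kept = []
    for candidate in orthologs:
        if all(kept_olog.percent_similarity(candidate) <= threshold for kept_olog in kept):
            kept.append(candidate)
    return kept

# usage: orthologs = filter_similar(orthologs)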
41 changes: 41 additions & 0 deletions pop_test.py
@@ -0,0 +1,41 @@
import timeit

if __name__ == "__main__":

my_list = [45, 23, 18, 23, 8, 39, 23, 29, 2, 11, 78, 7,
82, 33, 18, 28, 5, 54, 21, 61, 893, 20, 3]
print(my_list)

start = timeit.default_timer()

for i in range(len(my_list)):
j = i+1
while j < len(my_list):
if abs(my_list[i] - my_list[j]) < 8: # values within 8 of each other count as "too similar"
print(my_list[j], j)
my_list.pop(j) # remove by index so duplicate values earlier in the list are untouched
j -= 1
j += 1

stop = timeit.default_timer()

print('Time: ', stop - start)

print(my_list)

my_list = [20, 12, 43, 6, 78, 30, 14, 23, 9, 30, 29, 10, 40, 62, 73, 4, 2, 35, 14, 22, 0, 10, 21, 80, 90,
79, 54, 2, 34, 26, 56, 87, 3, 20, 65, 34, 1, 98, 75, 57, 45, 23, 18, 23, 8, 39, 23, 29, 2, 11, 78, 7,
82, 33, 18, 28, 5, 54, 21, 61]
print(my_list)

'''
start = timeit.default_timer()
for i in range(len(my_list)):
j = i+1
while j < len(my_list):
if (my_list[i] - my_list[j]) < 8 and (my_list[i] - my_list[j]) > -8:
my_list[j] =
j += 1
'''
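One way the commented-out marking variant above might be finished, using None as a stand-in marker (an assumed choice) and filtering the marked entries out in a single pass at the end:

start = timeit.default_timer()
for i in range(len(my_list)):
    if my_list[i] is None:
        continue  # already marked, skip it
    j = i + 1
    while j < len(my_list):
        if my_list[j] is not None and abs(my_list[i] - my_list[j]) < 8:
            my_list[j] = None  # mark instead of removing, so indices stay stable
        j += 1
stop = timeit.default_timer()

my_list = [value for value in my_list if value is not None]  # drop the marked entries
print('Time: ', stop - start)
print(my_list)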
