Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Timetree stuff #77

Open
wants to merge 2 commits into
base: v1
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions data_processing/compare_trees.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from Bio import Phylo

# Compare two Newick trees clade by clade and print every pair of clades
# whose branch lengths differ.  Both trees are walked with
# ``find_clades()`` and paired purely by position in that traversal.
tree1_path = "new2.nwk"
tree2_path = "done_bp.newick"
# The bare "B" lines are progress markers printed between the loading steps.
print("B")
tree1 = Phylo.read(tree1_path, "newick")
print("B")
tree2 = Phylo.read(tree2_path, "newick")
print("B")
tree1_preorder = tree1.find_clades()
tree2_preorder = tree2.find_clades()
print("B")
for node1 in tree1_preorder:
    # Supplying a default keeps a shorter second tree from aborting the
    # script with an uncaught StopIteration.
    node2 = next(tree2_preorder, None)
    if node2 is None:
        print("tree2 has fewer clades than tree1; stopping comparison")
        break
    # Clades of the second tree that carry no branch length are skipped
    # rather than reported as differences.
    if node2.branch_length is None:
        continue
    if node1.branch_length != node2.branch_length:
        print(node1.name, node2.name, node1.branch_length, node2.branch_length)
114 changes: 114 additions & 0 deletions data_processing/get_tree_from_proto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import dendropy
import parsimony_pb2, tqdm

from Bio import Phylo


class UsherMutationAnnotatedTree:
    """Mutation-annotated tree held as a ``dendropy.Tree``.

    The parsed ``parsimony_pb2.data`` message supplies the Newick string,
    a ``node_mutations`` sequence indexed by preorder position, and the
    ``condensed_nodes`` entries.  After construction every node carries its
    mutation record as ``nuc_mutations``, edge lengths equal the number of
    mutations on the node, and condensed leaves have been replaced by the
    leaves they stand for.
    """

    def __init__(self, tree_file):
        """Parse ``tree_file`` and build the annotated, expanded tree.

        Args:
            tree_file: Object whose ``read()`` result is handed to
                ``ParseFromString`` (presumably a file opened in binary
                mode -- confirm against callers).
        """
        self.data = parsimony_pb2.data()
        self.data.ParseFromString(tree_file.read())
        self.condensed_nodes_dict = self.get_condensed_nodes_dict(
            self.data.condensed_nodes)

        self.tree = dendropy.Tree.get(data=self.data.newick, schema="newick")
        # Mutations must be attached before edge lengths can be derived from
        # them, and expansion asserts on those edge lengths.
        self.annotate_mutations()
        self.set_branch_lengths()
        self.expand_condensed_nodes()

    def annotate_mutations(self):
        """Attach ``node_mutations[i]`` to the i-th node in preorder."""
        mutation_records = self.data.node_mutations
        for position, tree_node in enumerate(self.tree.preorder_node_iter()):
            tree_node.nuc_mutations = mutation_records[position]

    def expand_condensed_nodes(self):
        """Swap each condensed leaf for one new leaf per condensed label.

        The new leaves are added to the condensed leaf's parent, after
        which the condensed leaf itself is removed from that parent.
        """
        lookup = self.condensed_nodes_dict
        progress = tqdm.tqdm(enumerate(self.tree.leaf_nodes()),
                             desc="Expanding condensed nodes")
        for _position, leaf in progress:
            # Leaves without a taxon, or whose label is not condensed, are
            # left untouched.
            if not leaf.taxon or leaf.taxon.label not in lookup:
                continue

            assert leaf.edge_length == 0
            for label in lookup[leaf.taxon.label]:
                replacement = dendropy.Node(taxon=dendropy.Taxon(label))
                leaf.parent_node.add_child(replacement)
            leaf.parent_node.remove_child(leaf)

    def get_condensed_nodes_dict(self, condensed_nodes_dict):
        """Map condensed node names to their ``condensed_leaves``.

        Underscores in each node name are replaced by spaces before the
        name is used as a key (presumably to match how dendropy reports
        labels parsed from Newick -- confirm).

        Args:
            condensed_nodes_dict: Iterable of entries exposing
                ``node_name`` and ``condensed_leaves``.

        Returns:
            dict keyed by the space-separated node name.
        """
        entries = tqdm.tqdm(condensed_nodes_dict,
                            desc="Reading condensed nodes dict")
        return {
            entry.node_name.replace("_", " "): entry.condensed_leaves
            for entry in entries
        }

    def set_branch_lengths(self):
        """Set each node's edge length to its number of mutations."""
        for tree_node in self.tree.preorder_node_iter():
            tree_node.edge_length = len(tree_node.nuc_mutations.mutation)


import io


def get_parent(tree, child_clade):
    """Return the parent clade of ``child_clade`` within ``tree``.

    Args:
        tree: Tree object exposing ``get_path(target)`` and ``root``.
        child_clade: Clade whose parent is wanted.

    Returns:
        The clade immediately above ``child_clade``.  For a direct child of
        the root this is ``tree.root``.

    Raises:
        IndexError: If the path to ``child_clade`` is empty.
    """
    node_path = tree.get_path(child_clade)
    # A one-element path holds only the child itself, so its parent is the
    # root, which the path does not include.
    if node_path is not None and len(node_path) == 1:
        return tree.root
    return node_path[-2]


# The same but using BioPython
class UsherMutationAnnotatedTreeBioPython:
    """Mutation-annotated tree held as a Biopython ``Phylo`` tree.

    The parsed ``parsimony_pb2.data`` message supplies the Newick string,
    a ``node_mutations`` sequence indexed by position in ``find_clades()``
    order, and the ``condensed_nodes`` entries.  After construction every
    clade carries its mutation record as ``nuc_mutations``, branch lengths
    equal the number of mutations on the clade, unnamed clades are called
    ``node_<index>``, and condensed clades have been replaced by one clade
    per condensed leaf.
    """

    def __init__(self, tree_file):
        """Parse ``tree_file`` and build the annotated, expanded tree.

        Args:
            tree_file: Object whose ``read()`` result is handed to
                ``ParseFromString`` (presumably a file opened in binary
                mode -- confirm against callers).
        """
        self.data = parsimony_pb2.data()
        self.data.ParseFromString(tree_file.read())
        self.condensed_nodes_dict = self.get_condensed_nodes_dict(
            self.data.condensed_nodes)
        self.tree = Phylo.read(io.StringIO(self.data.newick), "newick")
        # Mutations must be attached before branch lengths can be derived
        # from them; expansion asserts on those branch lengths and runs
        # last, so clades it adds are neither annotated nor renamed.
        self.annotate_mutations()
        self.set_branch_lengths()
        self.name_nodes()
        self.expand_condensed_nodes()

    def annotate_mutations(self):
        """Attach ``node_mutations[i]`` to the i-th clade of the traversal."""
        for i, node in enumerate(self.tree.find_clades()):
            node.nuc_mutations = self.data.node_mutations[i]

    def name_nodes(self):
        """Give every clade without a name the name ``node_<index>``."""
        for i, node in enumerate(self.tree.find_clades()):
            if not node.name:
                node.name = f"node_{i}"

    def expand_condensed_nodes(self):
        """Replace each condensed child clade by one clade per condensed leaf.

        New clades are created with only a name and appended to the parent's
        ``clades`` list; the condensed clade is then removed from it.
        """
        for parent in tqdm.tqdm(self.tree.find_clades(),
                                desc="Expanding condensed nodes"):
            # Walk a snapshot of the children: appending to and removing
            # from ``parent.clades`` while iterating that same list makes
            # the loop skip the sibling after every removed clade.
            for node in list(parent.clades):
                if node.name not in self.condensed_nodes_dict:
                    continue
                assert node.branch_length == 0
                for new_node_label in self.condensed_nodes_dict[node.name]:
                    new_node = Phylo.BaseTree.Clade(name=new_node_label)
                    parent.clades.append(new_node)
                parent.clades.remove(node)

    def get_condensed_nodes_dict(self, condensed_nodes_dict):
        """Map condensed node names to their ``condensed_leaves``.

        Args:
            condensed_nodes_dict: Iterable of entries exposing
                ``node_name`` and ``condensed_leaves``.

        Returns:
            dict keyed by ``node_name`` exactly as stored in the entry.
        """
        output_dict = {}
        for condensed_node in tqdm.tqdm(condensed_nodes_dict,
                                        desc="Reading condensed nodes dict"):
            output_dict[
                condensed_node.node_name] = condensed_node.condensed_leaves
        return output_dict

    def set_branch_lengths(self):
        """Set each clade's branch length to its number of mutations."""
        for node in self.tree.find_clades():
            node.branch_length = len(node.nuc_mutations.mutation)


# f = open("./public-2021-09-15.all.masked.pb", "rb")
# mat = UsherMutationAnnotatedTree(f)
# mat.tree.write(path="./done.newick", schema="newick")

# Build the Biopython tree from the protobuf file and write it out as Newick.
# The constructor reads the whole file during construction, so the handle
# can be closed as soon as the object exists.
with open("./public-2021-09-15.all.masked.pb", "rb") as f:
    mat = UsherMutationAnnotatedTreeBioPython(f)
Phylo.write(mat.tree, "./done_bp.newick", "newick")
34 changes: 20 additions & 14 deletions data_processing/parsimony_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading