Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Uniprot sourcename fix #108

Merged
merged 11 commits into from
Feb 8, 2024
1 change: 1 addition & 0 deletions kg_microbe/transform_utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@
ORGANISM_TO_ENZYME_EDGE = "biolink:expresses"
ENZYME_CATEGORY = "biolink:Enzyme"
CHEMICAL_TO_ENZYME_EDGE = "biolink:binds_to"
UNIPROT_GENOME_FEATURES = "uniprot_genome_features"
UNIPROT_BASE_URL = "https://rest.uniprot.org/uniprotkb/"
UNIPROT_FIELDS = ["organism_id", "id", "accession", "protein_name", "ec", "ft_binding"]
UNIPROT_KEYWORDS = ["Reference+proteome"]
Expand Down
23 changes: 21 additions & 2 deletions kg_microbe/transform_utils/uniprot/uniprot.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
ENZYME_CATEGORY,
NCBITAXON_PREFIX,
ORGANISM_TO_ENZYME_EDGE,
UNIPROT_GENOME_FEATURES,
UNIPROT_ORG_ID_COLUMN_NAME,
UNIPROT_PREFIX,
)
Expand Down Expand Up @@ -43,7 +44,7 @@ def __init__(self, input_dir: Optional[Path] = None, output_dir: Optional[Path]
"""
self.__enz_data = {}

source_name = "uniprot_genome_features"
source_name = UNIPROT_GENOME_FEATURES
super().__init__(source_name, input_dir, output_dir)

def run(self, data_file: Union[Optional[Path], Optional[str]] = None):
Expand All @@ -70,7 +71,6 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None):
input_dir, ncbi_organisms, self.source_name, node_writer, edge_writer
)


drop_duplicates(self.output_node_file)
drop_duplicates(self.output_edge_file)

Expand Down Expand Up @@ -152,6 +152,25 @@ def write_to_df(self, uniprot_values, edge_writer, node_writer):
else None
)

# Use primary accession number as its ID does not change, as opposed to Entry Name
if "Entry" in entry.keys():
self.__enz_data["id"] = entry["Entry"]

# example response with multiple protein names:
# {
# "Organism (ID)": "100",
# "Entry Name": "A0A4R1H4N5_ANCAQ",
# "Entry": "A0A4R1H4N5",
# "Protein names": "Ubiquinone biosynthesis O-methyltransferase
# (2-polyprenyl-6-hydroxyphenol methylase) (EC 2.1.1.222)
# (3-demethylubiquinone 3-O-methyltransferase) (EC 2.1.1.64)",
# "EC number": "2.1.1.222; 2.1.1.64",
# }
if "Protein names" in entry:
self.__enz_data["name"] = entry["Protein names"].split("(EC")[0]

organism_id = entry["Organism (ID)"] if "Organism (ID)" in entry.keys() else None

# Use primary accession number as its ID does not change, as opposed to Entry Name
if "Entry" in entry.keys():
self.__enz_data["id"] = entry["Entry"]
Expand Down
8 changes: 4 additions & 4 deletions merge.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,13 @@ merged_graph:
filename:
- data/transformed/traits/nodes.tsv
- data/transformed/traits/edges.tsv
uniprot:
uniprot_genome_features:
input:
name: "uniprot"
name: "uniprot_genome_features"
format: tsv
filename:
- data/transformed/uniprot/nodes.tsv
- data/transformed/uniprot/edges.tsv
- data/transformed/uniprot_genome_features/nodes.tsv
- data/transformed/uniprot_genome_features/edges.tsv
operations:
- name: kgx.graph_operations.summarize_graph.generate_graph_stats
args:
Expand Down
Loading