From a3f67c7334a4e2f5e03fc82baf99a8a3e6ea2b72 Mon Sep 17 00:00:00 2001 From: Patrick Kalita Date: Wed, 4 Dec 2024 16:46:55 -0800 Subject: [PATCH 1/2] Track all individual root types; use them to filter input/output associations --- src/gocam/translation/minerva_wrapper.py | 197 ++++++++++++----------- 1 file changed, 105 insertions(+), 92 deletions(-) diff --git a/src/gocam/translation/minerva_wrapper.py b/src/gocam/translation/minerva_wrapper.py index 25136cd..ed24b1e 100644 --- a/src/gocam/translation/minerva_wrapper.py +++ b/src/gocam/translation/minerva_wrapper.py @@ -1,7 +1,7 @@ import logging from collections import defaultdict from dataclasses import dataclass, field -from typing import DefaultDict, Dict, Iterator, List, Optional, Set, Tuple +from typing import DefaultDict, Dict, Iterator, List, Optional, Tuple import requests import yaml @@ -74,19 +74,14 @@ def _setattr_with_warning(obj, attr, value): setattr(obj, attr, value) -MAIN_TYPES = [ - "molecular_function", - "biological_process", - "cellular_component", - "information biomacromolecule", - "evidence", - "chemical entity", - "anatomical entity", -] - -COMPLEX_TYPES = [ - "protein-containing complex", -] +MOLECULAR_FUNCTION = "GO:0003674" +BIOLOGICAL_PROCESS = "GO:0008150" +CELLULAR_COMPONENT = "GO:0005575" +INFORMATION_BIOMACROMOLECULE = "CHEBI:33695" +PROTEIN_CONTAINING_COMPLEX = "GO:0032991" +EVIDENCE = "ECO:0000000" +CHEMICAL_ENTITY = "CHEBI:24431" +ANATOMICAL_ENTITY = "UBERON:0001062" @dataclass @@ -176,26 +171,14 @@ def minerva_object_to_model(obj: Dict) -> Model: # Bookkeeping variables # individual ID to "root" type / category, e.g Evidence, BP - individual_to_type: Dict[str, Optional[str]] = {} + individual_to_root_types: Dict[str, List[str]] = {} individual_to_term: Dict[str, str] = {} individual_to_annotations: Dict[str, Dict] = {} - complex_individuals: Set[str] = set() - id2obj: Dict[str, Dict] = {} + objects_by_id: Dict[str, Dict] = {} activities: List[Activity] = [] activities_by_mf_id: DefaultDict[str, List[Activity]] = defaultdict(list) facts_by_property: DefaultDict[str, List[Dict]] = defaultdict(list) - def _cls(obj: Dict) -> Optional[str]: - if obj.get("type", None) == "complement": - logger.warning(f"Ignoring Complement: {obj}") - # class expression representing NOT - return None - if "id" not in obj: - raise ValueError(f"No ID for {obj}") - id = obj["id"] - id2obj[id] = obj - return id - def _evidence_from_fact(fact: Dict) -> List[EvidenceItem]: anns_mv = _annotations_multivalued(fact) evidence_inst_ids = anns_mv.get("evidence", []) @@ -227,41 +210,40 @@ def _iter_activities_by_fact_subject( fact_property: str, ) -> Iterator[Tuple[Activity, str, List[EvidenceItem]]]: for fact in facts_by_property.get(fact_property, []): - s, o = fact["subject"], fact["object"] - if o not in individual_to_term: - logger.warning(f"Missing {o} in {individual_to_term}") + subject, object_ = fact["subject"], fact["object"] + if object_ not in individual_to_term: + logger.warning(f"Missing {object_} in {individual_to_term}") continue - for activity in activities_by_mf_id.get(s, []): + for activity in activities_by_mf_id.get(subject, []): evs = _evidence_from_fact(fact) - yield activity, individual_to_term[o], evs + yield activity, object_, evs + + def _has_molecule_root_type(individual_id: str) -> bool: + root_types = individual_to_root_types.get(individual_id, []) + return ( + CHEMICAL_ENTITY in root_types + and INFORMATION_BIOMACROMOLECULE not in root_types + ) for individual in obj["individuals"]: - typs = [x["label"] for x in individual.get("root-type", []) if x] - typ: Optional[str] = None - for t in typs: - if t in MAIN_TYPES: - typ = t - break - if not typ: - logger.warning(f"Could not find type for {individual}") - continue - individual_to_type[individual["id"]] = typ - - # Check to see if one of the types is a complex type - for t in typs: - if t in COMPLEX_TYPES: - complex_individuals.add(individual["id"]) - break - - terms = list(filter(None, (_cls(x) for x in individual.get("type", [])))) - if len(terms) > 1: - logger.warning(f"Multiple terms for {individual}: {terms}") - if not terms: - logger.warning(f"No terms for {individual}") - continue - individual_to_term[individual["id"]] = terms[0] - anns = _annotations(individual) - individual_to_annotations[individual["id"]] = anns + root_types = [x["id"] for x in individual.get("root-type", []) if x] + individual_to_root_types[individual["id"]] = root_types + + term_id: Optional[str] = None + for type_ in individual.get("type", []): + if type_.get("type") == "complement": + # class expression representing NOT + continue + type_id = type_.get("id") + if type_id is None: + continue + objects_by_id[type_id] = type_ + term_id = type_id + + individual_to_term[individual["id"]] = term_id + if "annotations" in individual: + anns = _annotations(individual) + individual_to_annotations[individual["id"]] = anns for fact in obj["facts"]: facts_by_property[fact["property"]].append(fact) @@ -270,22 +252,23 @@ def _iter_activities_by_fact_subject( if not enabled_by_facts: raise ValueError(f"Missing {ENABLED_BY} in {facts_by_property}") for fact in enabled_by_facts: - s, o = fact["subject"], fact["object"] - if s not in individual_to_term: - logger.warning(f"Missing {s} in {individual_to_term}") + subject, object_ = fact["subject"], fact["object"] + if subject not in individual_to_term: + logger.warning(f"Missing {subject} in {individual_to_term}") continue - if o not in individual_to_term: - logger.warning(f"Missing {o} in {individual_to_term}") + if object_ not in individual_to_term: + logger.warning(f"Missing {object_} in {individual_to_term}") continue - gene_id = individual_to_term[o] + gene_id = individual_to_term[object_] + root_types = individual_to_root_types.get(object_, []) evs = _evidence_from_fact(fact) enabled_by_association: EnabledByAssociation - if o in complex_individuals: + if PROTEIN_CONTAINING_COMPLEX in root_types: has_part_facts = [ fact for fact in facts_by_property.get(HAS_PART, []) - if fact["subject"] == o + if fact["subject"] == object_ ] members = [ individual_to_term[fact["object"]] @@ -295,72 +278,95 @@ def _iter_activities_by_fact_subject( enabled_by_association = EnabledByProteinComplexAssociation( term=gene_id, members=members ) - else: + elif INFORMATION_BIOMACROMOLECULE in root_types: enabled_by_association = EnabledByGeneProductAssociation(term=gene_id) + else: + continue + activity = Activity( - id=s, + id=subject, enabled_by=enabled_by_association, molecular_function=MolecularFunctionAssociation( - term=individual_to_term[s], evidence=evs + term=individual_to_term[subject], evidence=evs ), ) activities.append(activity) - activities_by_mf_id[s].append(activity) + activities_by_mf_id[subject].append(activity) - for activity, term, evs in _iter_activities_by_fact_subject( + for activity, individual, evs in _iter_activities_by_fact_subject( fact_property=PART_OF ): - association = BiologicalProcessAssociation(term=term, evidence=evs) + association = BiologicalProcessAssociation( + term=individual_to_term[individual], evidence=evs + ) _setattr_with_warning(activity, "part_of", association) - for activity, term, evs in _iter_activities_by_fact_subject( + for activity, individual, evs in _iter_activities_by_fact_subject( fact_property=OCCURS_IN ): - association = CellularAnatomicalEntityAssociation(term=term, evidence=evs) + association = CellularAnatomicalEntityAssociation( + term=individual_to_term[individual], evidence=evs + ) _setattr_with_warning(activity, "occurs_in", association) - for activity, term, evs in _iter_activities_by_fact_subject( + for activity, individual, evs in _iter_activities_by_fact_subject( fact_property=HAS_INPUT ): + if not _has_molecule_root_type(individual): + continue if activity.has_input is None: activity.has_input = [] - activity.has_input.append(MoleculeAssociation(term=term, evidence=evs)) + activity.has_input.append( + MoleculeAssociation(term=individual_to_term[individual], evidence=evs) + ) - for activity, term, evs in _iter_activities_by_fact_subject( + for activity, individual, evs in _iter_activities_by_fact_subject( fact_property=HAS_PRIMARY_INPUT ): - association = MoleculeAssociation(term=term, evidence=evs) + if not _has_molecule_root_type(individual): + continue + association = MoleculeAssociation( + term=individual_to_term[individual], evidence=evs + ) _setattr_with_warning(activity, "has_primary_input", association) - for activity, term, evs in _iter_activities_by_fact_subject( + for activity, individual, evs in _iter_activities_by_fact_subject( fact_property=HAS_OUTPUT ): + if not _has_molecule_root_type(individual): + continue if activity.has_output is None: activity.has_output = [] - activity.has_output.append(MoleculeAssociation(term=term, evidence=evs)) + activity.has_output.append( + MoleculeAssociation(term=individual_to_term[individual], evidence=evs) + ) - for activity, term, evs in _iter_activities_by_fact_subject( + for activity, individual, evs in _iter_activities_by_fact_subject( fact_property=HAS_PRIMARY_OUTPUT ): - association = MoleculeAssociation(term=term, evidence=evs) + if not _has_molecule_root_type(individual): + continue + association = MoleculeAssociation( + term=individual_to_term[individual], evidence=evs + ) _setattr_with_warning(activity, "has_primary_output", association) for fact_property, facts in facts_by_property.items(): for fact in facts: - s, o = fact["subject"], fact["object"] - subject_activities = activities_by_mf_id.get(s, []) - object_activities = activities_by_mf_id.get(o, []) + subject, object_ = fact["subject"], fact["object"] + subject_activities = activities_by_mf_id.get(subject, []) + object_activities = activities_by_mf_id.get(object_, []) if not subject_activities or not object_activities: continue - if individual_to_type.get(s, None) != "molecular_function": + if MOLECULAR_FUNCTION not in individual_to_root_types.get(subject, []): continue - if individual_to_type.get(o, None) != "molecular_function": + if MOLECULAR_FUNCTION not in individual_to_root_types.get(object_, []): continue if len(subject_activities) > 1: - logger.warning(f"Multiple activities for subject: {s}") + logger.warning(f"Multiple activities for subject: {subject}") if len(object_activities) > 1: - logger.warning(f"Multiple activities for object: {o}") + logger.warning(f"Multiple activities for object: {object_}") subject_activity = subject_activities[0] object_activity = object_activities[0] @@ -376,7 +382,14 @@ def _iter_activities_by_fact_subject( annotations = _annotations(obj) annotations_mv = _annotations_multivalued(obj) - objs = [Object(id=obj["id"], label=obj["label"]) for obj in id2obj.values()] + + objects: List[Object] = [] + for obj in objects_by_id.values(): + object_ = Object(id=obj["id"]) + if "label" in obj: + object_.label = obj["label"] + objects.append(object_) + cam = Model( id=id, title=annotations["title"], @@ -384,6 +397,6 @@ def _iter_activities_by_fact_subject( comments=annotations_mv.get("comment", None), taxon=annotations.get("in_taxon", None), activities=activities, - objects=objs, + objects=objects, ) return cam From 53bd9396674d3478acdad6c2ef3c8b6049a2a73f Mon Sep 17 00:00:00 2001 From: Patrick Kalita Date: Wed, 4 Dec 2024 16:47:08 -0800 Subject: [PATCH 2/2] Update test input files --- tests/input/Model-63f809ec00000701.yaml | 87 +++++++------------------ tests/input/Model-6606056e00002011.yaml | 59 ----------------- 2 files changed, 22 insertions(+), 124 deletions(-) diff --git a/tests/input/Model-63f809ec00000701.yaml b/tests/input/Model-63f809ec00000701.yaml index eda5ec3..c3ce4ca 100644 --- a/tests/input/Model-63f809ec00000701.yaml +++ b/tests/input/Model-63f809ec00000701.yaml @@ -5,41 +5,9 @@ title: tRNA repair and recycling by ANKZF1, ELAC1 and TRNT1 following activity o taxon: NCBITaxon:9606 status: production activities: -- id: gomodel:63f809ec00000701/63f809ec00000742 - enabled_by: - type: EnabledByGeneProductAssociation - evidence: [] - provenances: [] - term: UniProtKB:Q96Q11 - molecular_function: - type: MolecularFunctionAssociation - evidence: - - term: ECO:0000314 - reference: PMID:32075755 - provenances: - - contributor: https://orcid.org/0000-0001-7299-6685 - date: '2023-03-01' - provenances: [] - term: GO:0004810 - part_of: - type: BiologicalProcessAssociation - evidence: - - term: ECO:0000314 - reference: PMID:32075755 - provenances: - - contributor: https://orcid.org/0000-0001-7299-6685 - date: '2023-03-01' - provenances: [] - term: GO:0001680 - has_input: [] - has_output: [] - causal_associations: [] - provenances: [] - id: gomodel:63f809ec00000701/63f809ec00000726 enabled_by: type: EnabledByGeneProductAssociation - evidence: [] - provenances: [] term: UniProtKB:Q9H8Y5 molecular_function: type: MolecularFunctionAssociation @@ -49,7 +17,6 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7299-6685 date: '2023-03-01' - provenances: [] term: GO:0004521 part_of: type: BiologicalProcessAssociation @@ -59,9 +26,7 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7299-6685 date: '2023-03-01' - provenances: [] term: GO:0072344 - has_input: [] has_output: - type: MoleculeAssociation evidence: @@ -70,7 +35,6 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7299-6685 date: '2023-03-01' - provenances: [] term: CHEBI:10668 causal_associations: - type: CausalAssociation @@ -80,15 +44,33 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7299-6685 date: '2023-03-01' - provenances: [] predicate: RO:0002629 downstream_activity: gomodel:63f809ec00000701/63f809ec00000735 - provenances: [] +- id: gomodel:63f809ec00000701/63f809ec00000742 + enabled_by: + type: EnabledByGeneProductAssociation + term: UniProtKB:Q96Q11 + molecular_function: + type: MolecularFunctionAssociation + evidence: + - term: ECO:0000314 + reference: PMID:32075755 + provenances: + - contributor: https://orcid.org/0000-0001-7299-6685 + date: '2023-03-01' + term: GO:0004810 + part_of: + type: BiologicalProcessAssociation + evidence: + - term: ECO:0000314 + reference: PMID:32075755 + provenances: + - contributor: https://orcid.org/0000-0001-7299-6685 + date: '2023-03-01' + term: GO:0001680 - id: gomodel:63f809ec00000701/63f809ec00000735 enabled_by: type: EnabledByGeneProductAssociation - evidence: [] - provenances: [] term: UniProtKB:Q9H777 molecular_function: type: MolecularFunctionAssociation @@ -98,7 +80,6 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7299-6685 date: '2023-03-01' - provenances: [] term: GO:0004549 part_of: type: BiologicalProcessAssociation @@ -108,7 +89,6 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7299-6685 date: '2023-03-01' - provenances: [] term: GO:0042780 has_input: - type: MoleculeAssociation @@ -118,9 +98,7 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7299-6685 date: '2023-03-01' - provenances: [] term: CHEBI:10668 - has_output: [] causal_associations: - type: CausalAssociation evidence: @@ -129,15 +107,11 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7299-6685 date: '2023-03-01' - provenances: [] predicate: RO:0002629 downstream_activity: gomodel:63f809ec00000701/63f809ec00000742 - provenances: [] - id: gomodel:63f809ec00000701/63f809ec00000706 enabled_by: type: EnabledByGeneProductAssociation - evidence: [] - provenances: [] term: UniProtKB:O60524 molecular_function: type: MolecularFunctionAssociation @@ -147,7 +121,6 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7299-6685 date: '2023-03-01' - provenances: [] term: GO:1904678 occurs_in: type: CellularAnatomicalEntityAssociation @@ -157,7 +130,6 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7299-6685 date: '2023-03-01' - provenances: [] term: GO:0022626 part_of: type: BiologicalProcessAssociation @@ -167,19 +139,7 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7299-6685 date: '2023-03-01' - provenances: [] term: GO:0140708 - has_input: - - type: MoleculeAssociation - evidence: - - term: ECO:0000314 - reference: PMID:33909987 - provenances: - - contributor: https://orcid.org/0000-0001-7299-6685 - date: '2023-03-01' - provenances: [] - term: CHEBI:17732 - has_output: [] causal_associations: - type: CausalAssociation evidence: @@ -188,10 +148,8 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7299-6685 date: '2023-03-02' - provenances: [] predicate: RO:0002304 downstream_activity: gomodel:63f809ec00000701/63f809ec00000726 - provenances: [] objects: - id: GO:1904678 label: alpha-aminoacyl-tRNA binding @@ -241,5 +199,4 @@ objects: - id: GO:0001680 label: tRNA 3'-terminal CCA addition type: gocam:Object -provenances: [] diff --git a/tests/input/Model-6606056e00002011.yaml b/tests/input/Model-6606056e00002011.yaml index 4b3be1a..66b03e0 100644 --- a/tests/input/Model-6606056e00002011.yaml +++ b/tests/input/Model-6606056e00002011.yaml @@ -7,8 +7,6 @@ activities: - id: gomodel:6606056e00002011/662af8fa00002857 enabled_by: type: EnabledByProteinComplexAssociation - evidence: [] - provenances: [] term: GO:0019815 members: - UniProtKB:P40259 @@ -23,7 +21,6 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7646-0052 date: '2024-05-07' - provenances: [] term: GO:0004888 occurs_in: type: CellularAnatomicalEntityAssociation @@ -33,7 +30,6 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7646-0052 date: '2024-05-07' - provenances: [] term: GO:0005886 part_of: type: BiologicalProcessAssociation @@ -45,21 +41,7 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7646-0052 date: '2024-05-07' - provenances: [] term: GO:0050853 - has_input: - - type: MoleculeAssociation - evidence: - - term: ECO:0000250 - reference: GO_REF:0000024 - with_objects: - - MGI:96892 - provenances: - - contributor: https://orcid.org/0000-0001-7646-0052 - date: '2024-05-07' - provenances: [] - term: UniProtKB:P07948 - has_output: [] causal_associations: - type: CausalAssociation evidence: @@ -70,15 +52,11 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7646-0052 date: '2024-05-07' - provenances: [] predicate: RO:0002629 downstream_activity: gomodel:6606056e00002011/6606056e00002040 - provenances: [] - id: gomodel:6606056e00002011/6606056e00002049 enabled_by: type: EnabledByGeneProductAssociation - evidence: [] - provenances: [] term: UniProtKB:P29350 molecular_function: type: MolecularFunctionAssociation @@ -88,7 +66,6 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7646-0052 date: '2024-04-09' - provenances: [] term: GO:0004725 occurs_in: type: CellularAnatomicalEntityAssociation @@ -98,7 +75,6 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7646-0052 date: '2024-04-09' - provenances: [] term: GO:0005737 part_of: type: BiologicalProcessAssociation @@ -108,17 +84,10 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7646-0052 date: '2024-04-09' - provenances: [] term: GO:0050859 - has_input: [] - has_output: [] - causal_associations: [] - provenances: [] - id: gomodel:6606056e00002011/6606056e00002040 enabled_by: type: EnabledByGeneProductAssociation - evidence: [] - provenances: [] term: UniProtKB:P07948 molecular_function: type: MolecularFunctionAssociation @@ -128,7 +97,6 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7646-0052 date: '2024-04-09' - provenances: [] term: GO:0004713 occurs_in: type: CellularAnatomicalEntityAssociation @@ -138,7 +106,6 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7646-0052 date: '2024-04-09' - provenances: [] term: GO:0005886 part_of: type: BiologicalProcessAssociation @@ -150,21 +117,7 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7646-0052 date: '2024-04-09' - provenances: [] term: GO:0001782 - has_input: - - type: MoleculeAssociation - evidence: - - term: ECO:0000250 - reference: GO_REF:0000024 - with_objects: - - MGI:96892 - provenances: - - contributor: https://orcid.org/0000-0001-7646-0052 - date: '2024-05-07' - provenances: [] - term: UniProtKB:P21854 - has_output: [] causal_associations: - type: CausalAssociation evidence: @@ -173,15 +126,11 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7646-0052 date: '2024-04-09' - provenances: [] predicate: RO:0002629 downstream_activity: gomodel:6606056e00002011/6606056e00002014 - provenances: [] - id: gomodel:6606056e00002011/6606056e00002014 enabled_by: type: EnabledByGeneProductAssociation - evidence: [] - provenances: [] term: UniProtKB:P21854 molecular_function: type: MolecularFunctionAssociation @@ -191,7 +140,6 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7646-0052 date: '2024-04-09' - provenances: [] term: GO:0004888 occurs_in: type: CellularAnatomicalEntityAssociation @@ -201,7 +149,6 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7646-0052 date: '2024-04-09' - provenances: [] term: GO:0005886 part_of: type: BiologicalProcessAssociation @@ -211,10 +158,7 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7646-0052 date: '2024-04-09' - provenances: [] term: GO:0050859 - has_input: [] - has_output: [] causal_associations: - type: CausalAssociation evidence: @@ -223,10 +167,8 @@ activities: provenances: - contributor: https://orcid.org/0000-0001-7646-0052 date: '2024-04-09' - provenances: [] predicate: RO:0002629 downstream_activity: gomodel:6606056e00002011/6606056e00002049 - provenances: [] objects: - id: GO:0005615 label: extracellular space @@ -282,5 +224,4 @@ objects: - id: CHEBI:166824 label: peptide antigen type: gocam:Object -provenances: []