From 06f1220066748df3dfd707b13caa2c41ddf6590c Mon Sep 17 00:00:00 2001 From: Chris Mungall Date: Thu, 28 Mar 2024 12:02:30 -0700 Subject: [PATCH] synonymizer refactor2 (#728) * synonymizer refactoring. Moving synonymizer from mapping rules to its own data model. Moving logic from CLI to utils * Command line * Add missing --- Makefile | 5 + src/oaklib/cli.py | 148 ++-- .../datamodels/mapping_rules_datamodel.py | 82 +-- .../datamodels/mapping_rules_datamodel.yaml | 6 +- .../datamodels/synonymizer_datamodel.py | 697 ++++++++++++++++++ .../datamodels/synonymizer_datamodel.yaml | 94 +++ src/oaklib/datamodels/vocabulary.py | 7 + src/oaklib/utilities/kgcl_utilities.py | 2 + .../utilities/lexical/lexical_indexer.py | 34 +- src/oaklib/utilities/lexical/synonymizer.py | 145 ++++ tests/input/cli-synonymizer-rules.yaml | 13 + tests/input/matcher_rules.yaml | 4 +- tests/test_cli.py | 5 +- tests/test_utilities/test_lexical_index.py | 10 +- tests/test_utilities/test_synonymizer.py | 99 +++ 15 files changed, 1189 insertions(+), 162 deletions(-) create mode 100644 src/oaklib/datamodels/synonymizer_datamodel.py create mode 100644 src/oaklib/datamodels/synonymizer_datamodel.yaml create mode 100644 src/oaklib/utilities/lexical/synonymizer.py create mode 100644 tests/input/cli-synonymizer-rules.yaml create mode 100644 tests/test_utilities/test_synonymizer.py diff --git a/Makefile b/Makefile index 7e3b6ad85..0cc53b3fb 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,11 @@ src/oaklib/datamodels/%.py: src/oaklib/datamodels/%.yaml # $(RUN) gen-pydantic $< > $@.tmp && mv $@.tmp $@ $(RUN) gen-python $< > $@.tmp && mv $@.tmp $@ $(RUN) tox -e lint + +src/oaklib/datamodels/synonymizer.py: src/oaklib/datamodels/synonymizer.yaml + $(RUN) gen-pydantic $< > $@.tmp && mv $@.tmp $@ + + src/oaklib/datamodels/%.schema.json: src/oaklib/datamodels/%.yaml $(RUN) gen-json-schema $< > $@.tmp && mv $@.tmp $@ src/oaklib/datamodels/%.owl.ttl: src/oaklib/datamodels/%.yaml diff --git a/src/oaklib/cli.py 
b/src/oaklib/cli.py index 0517fa76b..411b06614 100644 --- a/src/oaklib/cli.py +++ b/src/oaklib/cli.py @@ -52,9 +52,9 @@ import oaklib.datamodels.taxon_constraints as tcdm from oaklib import datamodels from oaklib.converters.logical_definition_flattener import LogicalDefinitionFlattener +from oaklib.datamodels import synonymizer_datamodel from oaklib.datamodels.association import RollupGroup from oaklib.datamodels.cross_ontology_diff import DiffCategory -from oaklib.datamodels.lexical_index import LexicalTransformation, TransformationType from oaklib.datamodels.obograph import ( BasicPropertyValue, Edge, @@ -167,11 +167,9 @@ parse_kgcl_files, write_kgcl, ) -from oaklib.utilities.lexical import patternizer +from oaklib.utilities.lexical import patternizer, synonymizer from oaklib.utilities.lexical.lexical_indexer import ( - DEFAULT_QUALIFIER, add_labels_from_uris, - apply_transformation, create_lexical_index, lexical_index_to_sssom, load_lexical_index, @@ -6496,46 +6494,114 @@ def generate_synonyms(terms, rules_file, apply_patch, patch, patch_format, outpu else: writer = _get_writer(output_type, impl, StreamingKGCLWriter, kgcl) writer.output = output - # TODO: Eventually get this from settings as above + ruleset = synonymizer_datamodel.RuleSet(**yaml.safe_load(open(rules_file))) + change_list = [] + curie_iter = query_terms_iterator(terms, impl) + for change in synonymizer.apply_synonymizer_to_terms(impl, curie_iter, ruleset): + change_list.append(change) + writer.emit(change) + + writer.finish() + if apply_patch and len(change_list) > 0: + if output: + impl.resource.slug = output + _apply_changes(impl, change_list) + + +@main.command() +@click.argument("terms", nargs=-1) +@click.option( + "--rules-file", + "-R", + help="path to rules file. Conforms to rules_datamodel.\ + e.g. 
https://github.com/INCATools/ontology-access-kit/blob/main/tests/input/matcher_rules.yaml", +) +@click.option( + "--rules-expression", + "-Y", + multiple=True, + help="YAML encoding of a rules expression", +) +@click.option( + "--apply-patch/--no-apply-patch", + default=False, + show_default=True, + help="Apply KGCL syntax generated based on the synonymizer rules file.", +) +@click.option( + "--patch", + type=click.File(mode="w"), + default=sys.stdout, + help="Path to where patch file will be written.", +) +@click.option( + "--patch-format", + help="Output syntax for patches.", +) +@output_option +@output_type_option +def generate_lexical_replacements( + terms, rules_file, rules_expression, apply_patch, patch, patch_format, output, output_type +): + """ + Generate lexical replacements based on a set of synonymizer rules. + + + If the `--apply-patch` flag is set, the output will be an ontology file with the changes + applied. Pass the `--patch` argument to lso get the patch file in KGCL format. + + Example: + ------- + + runoak -i foo.obo generate-lexical-replacements -R foo_rules.yaml\ + --patch patch.kgcl --apply-patch -o foo_syn.obo + + If the `apply-patch` flag is NOT set then the main input will be KGCL commands + + Example: + ------- + + runoak -i foo.obo generate-lexical-replacements -R foo_rules.yaml -o changes.kgcl + + + You can also pass the expressions directly as YAML + + Example: + ------- + + runoak -i foo.obo generate-lexical-replacements \ + -Y '{match: "nuclear (\\w+)", replacement: "\\1 nucleus"}' .all + + see https://github.com/INCATools/kgcl. 
+ + Note: this command is very similar to generate-synonyms, but the main use case here + is replacing terms, and applying rules to other elements such as definitions + + """ + impl = settings.impl + if apply_patch: + writer = _get_writer(patch_format, impl, StreamingKGCLWriter, kgcl) + writer.output = patch + else: + writer = _get_writer(output_type, impl, StreamingKGCLWriter, kgcl) + writer.output = output if rules_file: - ruleset = load_mapping_rules(rules_file) + ruleset = synonymizer_datamodel.RuleSet(**yaml.safe_load(open(rules_file))) + elif rules_expression: + ruleset = synonymizer_datamodel.RuleSet() + for rule_expression in rules_expression: + rule = synonymizer_datamodel.Synonymizer(**yaml.safe_load(rule_expression)) + ruleset.rules.append(rule) else: - ruleset = None - if not isinstance(impl, OboGraphInterface): - raise NotImplementedError - syn_rules = [x.synonymizer for x in ruleset.rules if x.synonymizer] - terms_to_synonymize = {} + raise ValueError("Must specify either --rules-file or --rules-expression") change_list = [] - for curie in query_terms_iterator(terms, impl): - # for rule in syn_rules: - for _, aliases in impl.entity_alias_map(curie).items(): - matches = [] - if aliases is not None: - # matches.extend([x for x in aliases if re.search(eval(rule.match), x) is not None]) - for alias in aliases: - if alias: - synonymized, new_alias, qualifier = apply_transformation( - alias, - LexicalTransformation( - TransformationType.Synonymization, params=syn_rules - ), - ) - if synonymized: - matches.append(new_alias) - - if len(matches) > 0: - if qualifier is None or qualifier == "": - qualifier = DEFAULT_QUALIFIER - terms_to_synonymize[curie] = matches - change = kgcl.NewSynonym( - id="kgcl_change_id_" + str(len(terms_to_synonymize)), - about_node=curie, - old_value=alias, - new_value=new_alias, - qualifier=qualifier, - ) - change_list.append(change) - writer.emit(change) + curie_iter = query_terms_iterator(terms, impl) + for change in 
synonymizer.apply_synonymizer_to_terms( + impl, curie_iter, ruleset, include_all=True + ): + change_list.append(change) + writer.emit(change) + writer.finish() if apply_patch and len(change_list) > 0: if output: diff --git a/src/oaklib/datamodels/mapping_rules_datamodel.py b/src/oaklib/datamodels/mapping_rules_datamodel.py index 20b33d294..9dc5fe8de 100644 --- a/src/oaklib/datamodels/mapping_rules_datamodel.py +++ b/src/oaklib/datamodels/mapping_rules_datamodel.py @@ -7,39 +7,33 @@ # license: https://creativecommons.org/publicdomain/zero/1.0/ import dataclasses -import re -import sys from dataclasses import dataclass from typing import Any, ClassVar, Dict, List, Optional, Union -from jsonasobj2 import JsonObj, as_dict +from jsonasobj2 import as_dict from linkml_runtime.linkml_model.meta import ( EnumDefinition, PermissibleValue, - PvFormulaOptions, ) -from linkml_runtime.linkml_model.types import Boolean, Float, String, Uriorcurie from linkml_runtime.utils.curienamespace import CurieNamespace from linkml_runtime.utils.dataclass_extensions_376 import ( dataclasses_init_fn_with_kwargs, ) from linkml_runtime.utils.enumerations import EnumDefinitionImpl -from linkml_runtime.utils.formatutils import camelcase, sfx, underscore from linkml_runtime.utils.metamodelcore import ( Bool, URIorCURIE, - bnode, empty_dict, empty_list, ) from linkml_runtime.utils.slot import Slot from linkml_runtime.utils.yamlutils import ( YAMLRoot, - extended_float, - extended_int, extended_str, ) -from rdflib import Namespace, URIRef +from rdflib import URIRef + +from oaklib.datamodels.synonymizer_datamodel import Synonymizer, Test metamodel_version = "1.7.0" version = None @@ -253,74 +247,6 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): super().__post_init__(**kwargs) -@dataclass -class Synonymizer(YAMLRoot): - _inherited_slots: ClassVar[List[str]] = [] - - class_class_uri: ClassVar[URIRef] = MAPPINGRULES.Synonymizer - class_class_curie: ClassVar[str] = 
"mappingrules:Synonymizer" - class_name: ClassVar[str] = "Synonymizer" - class_model_uri: ClassVar[URIRef] = MAPPINGRULES.Synonymizer - - the_rule: Optional[str] = None - match: Optional[str] = None - match_scope: Optional[str] = None - replacement: Optional[str] = None - qualifier: Optional[str] = None - prefix: Optional[str] = None - tests: Optional[Union[dict, "Test"]] = None - - def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): - if self.the_rule is not None and not isinstance(self.the_rule, str): - self.the_rule = str(self.the_rule) - - if self.match is not None and not isinstance(self.match, str): - self.match = str(self.match) - - if self.match_scope is not None and not isinstance(self.match_scope, str): - self.match_scope = str(self.match_scope) - - if self.replacement is not None and not isinstance(self.replacement, str): - self.replacement = str(self.replacement) - - if self.qualifier is not None and not isinstance(self.qualifier, str): - self.qualifier = str(self.qualifier) - - if self.prefix is not None and not isinstance(self.prefix, str): - self.prefix = str(self.prefix) - - if self.tests is not None and not isinstance(self.tests, Test): - self.tests = Test(**as_dict(self.tests)) - - super().__post_init__(**kwargs) - - -@dataclass -class Test(YAMLRoot): - _inherited_slots: ClassVar[List[str]] = [] - - class_class_uri: ClassVar[URIRef] = MAPPINGRULES.Test - class_class_curie: ClassVar[str] = "mappingrules:Test" - class_name: ClassVar[str] = "Test" - class_model_uri: ClassVar[URIRef] = MAPPINGRULES.Test - - input: Optional[str] = None - output: Optional[str] = None - prefix: Optional[str] = None - - def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): - if self.input is not None and not isinstance(self.input, str): - self.input = str(self.input) - - if self.output is not None and not isinstance(self.output, str): - self.output = str(self.output) - - if self.prefix is not None and not isinstance(self.prefix, str): - 
self.prefix = str(self.prefix) - - super().__post_init__(**kwargs) - - @dataclass class LexicalIndex(YAMLRoot): """ diff --git a/src/oaklib/datamodels/mapping_rules_datamodel.yaml b/src/oaklib/datamodels/mapping_rules_datamodel.yaml index 39d683388..5dbd7e68b 100644 --- a/src/oaklib/datamodels/mapping_rules_datamodel.yaml +++ b/src/oaklib/datamodels/mapping_rules_datamodel.yaml @@ -2,7 +2,7 @@ id: https://w3id.org/oak/mapping-rules-datamodel title: Mapping Rules Datamodel name: mapping-rules-datamodel description: >- - A datamodel for specifying lexical mapping rules. + A datamodel for specifying lexical mapping rules license: https://creativecommons.org/publicdomain/zero/1.0/ prefixes: @@ -30,6 +30,7 @@ emit_prefixes: imports: - linkml:types - lexical_index + - synonymizer_datamodel @@ -86,7 +87,6 @@ classes: predicate_id_one_of: multivalued: true - Postcondition: attributes: predicate_id: @@ -101,7 +101,7 @@ classes: Synonymizer: attributes: - the_rule: + description: description: Description of the rule. 
range: string match: diff --git a/src/oaklib/datamodels/synonymizer_datamodel.py b/src/oaklib/datamodels/synonymizer_datamodel.py new file mode 100644 index 000000000..5149caad7 --- /dev/null +++ b/src/oaklib/datamodels/synonymizer_datamodel.py @@ -0,0 +1,697 @@ +# Auto generated from synonymizer_datamodel.yaml by pythongen.py version: 0.0.1 +# Generation date: 2024-03-27T14:21:27 +# Schema: synonymizer_datamodel +# +# id: https://w3id.org/oak/synonymizer-datamodel +# description: A datamodel for specifying synonymization rules +# license: https://creativecommons.org/publicdomain/zero/1.0/ + +import dataclasses +import re +from jsonasobj2 import JsonObj, as_dict +from typing import Optional, List, Union, Dict, ClassVar, Any +from dataclasses import dataclass +from linkml_runtime.linkml_model.meta import EnumDefinition, PermissibleValue, PvFormulaOptions + +from linkml_runtime.utils.slot import Slot +from linkml_runtime.utils.metamodelcore import empty_list, empty_dict, bnode +from linkml_runtime.utils.yamlutils import YAMLRoot, extended_str, extended_float, extended_int +from linkml_runtime.utils.dataclass_extensions_376 import dataclasses_init_fn_with_kwargs +from linkml_runtime.utils.formatutils import camelcase, underscore, sfx +from linkml_runtime.utils.enumerations import EnumDefinitionImpl +from rdflib import Namespace, URIRef +from linkml_runtime.utils.curienamespace import CurieNamespace +from linkml_runtime.linkml_model.types import Boolean, String, Uriorcurie +from linkml_runtime.utils.metamodelcore import Bool, URIorCURIE + +metamodel_version = "1.7.0" +version = None + +# Overwrite dataclasses _init_fn to add **kwargs in __init__ +dataclasses._init_fn = dataclasses_init_fn_with_kwargs + +# Namespaces +LINKML = CurieNamespace("linkml", "https://w3id.org/linkml/") +ONTOLEXINDEX = CurieNamespace("ontolexindex", "https://w3id.org/oak/lexical-index/") +OWL = CurieNamespace("owl", "http://www.w3.org/2002/07/owl#") +PAV = CurieNamespace("pav", 
"http://purl.org/pav/") +PROV = CurieNamespace("prov", "http://www.w3.org/ns/prov#") +RDF = CurieNamespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#") +RDFS = CurieNamespace("rdfs", "http://www.w3.org/2000/01/rdf-schema#") +SCHEMA = CurieNamespace("schema", "http://schema.org/") +SH = CurieNamespace("sh", "https://w3id.org/shacl/") +SKOS = CurieNamespace("skos", "http://www.w3.org/2004/02/skos/core#") +SYNONYMIZER = CurieNamespace("synonymizer", "https://w3id.org/oak/synonymizer-datamodel/") +XSD = CurieNamespace("xsd", "http://www.w3.org/2001/XMLSchema#") +DEFAULT_ = SYNONYMIZER + + +# Types +class RegularExpressionString(String): + type_class_uri = XSD["string"] + type_class_curie = "xsd:string" + type_name = "RegularExpressionString" + type_model_uri = SYNONYMIZER.RegularExpressionString + + +# Class references +class LexicalGroupingTerm(extended_str): + pass + + +class LexicalTransformationPipelineName(extended_str): + pass + + +@dataclass +class RuleSet(YAMLRoot): + """ + A set of rules for generating synonyms or alternate lexical elements. 
+ """ + + _inherited_slots: ClassVar[List[str]] = [] + + class_class_uri: ClassVar[URIRef] = SYNONYMIZER["RuleSet"] + class_class_curie: ClassVar[str] = "synonymizer:RuleSet" + class_name: ClassVar[str] = "RuleSet" + class_model_uri: ClassVar[URIRef] = SYNONYMIZER.RuleSet + + rules: Optional[Union[Union[dict, "Synonymizer"], List[Union[dict, "Synonymizer"]]]] = ( + empty_list() + ) + prefix: Optional[str] = None + + def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): + if not isinstance(self.rules, list): + self.rules = [self.rules] if self.rules is not None else [] + self.rules = [ + v if isinstance(v, Synonymizer) else Synonymizer(**as_dict(v)) for v in self.rules + ] + + if self.prefix is not None and not isinstance(self.prefix, str): + self.prefix = str(self.prefix) + + super().__post_init__(**kwargs) + + +@dataclass +class Synonymizer(YAMLRoot): + """ + Specification of a rule for generating a synonym or alternate lexical element. + """ + + _inherited_slots: ClassVar[List[str]] = [] + + class_class_uri: ClassVar[URIRef] = SYNONYMIZER["Synonymizer"] + class_class_curie: ClassVar[str] = "synonymizer:Synonymizer" + class_name: ClassVar[str] = "Synonymizer" + class_model_uri: ClassVar[URIRef] = SYNONYMIZER.Synonymizer + + description: Optional[str] = None + match: Optional[Union[str, RegularExpressionString]] = None + match_scope: Optional[str] = None + replacement: Optional[Union[str, RegularExpressionString]] = None + qualifier: Optional[str] = None + prefix: Optional[str] = None + in_place: Optional[Union[bool, Bool]] = None + tests: Optional[Union[Union[dict, "Test"], List[Union[dict, "Test"]]]] = empty_list() + + def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): + if self.description is not None and not isinstance(self.description, str): + self.description = str(self.description) + + if self.match is not None and not isinstance(self.match, RegularExpressionString): + self.match = RegularExpressionString(self.match) + + if 
self.match_scope is not None and not isinstance(self.match_scope, str): + self.match_scope = str(self.match_scope) + + if self.replacement is not None and not isinstance( + self.replacement, RegularExpressionString + ): + self.replacement = RegularExpressionString(self.replacement) + + if self.qualifier is not None and not isinstance(self.qualifier, str): + self.qualifier = str(self.qualifier) + + if self.prefix is not None and not isinstance(self.prefix, str): + self.prefix = str(self.prefix) + + if self.in_place is not None and not isinstance(self.in_place, Bool): + self.in_place = Bool(self.in_place) + + if not isinstance(self.tests, list): + self.tests = [self.tests] if self.tests is not None else [] + self.tests = [v if isinstance(v, Test) else Test(**as_dict(v)) for v in self.tests] + + super().__post_init__(**kwargs) + + +@dataclass +class Test(YAMLRoot): + """ + A unit test for a rule, specifies an intended output for an input + """ + + _inherited_slots: ClassVar[List[str]] = [] + + class_class_uri: ClassVar[URIRef] = SYNONYMIZER["Test"] + class_class_curie: ClassVar[str] = "synonymizer:Test" + class_name: ClassVar[str] = "Test" + class_model_uri: ClassVar[URIRef] = SYNONYMIZER.Test + + input: Optional[str] = None + output: Optional[str] = None + prefix: Optional[str] = None + + def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): + if self.input is not None and not isinstance(self.input, str): + self.input = str(self.input) + + if self.output is not None and not isinstance(self.output, str): + self.output = str(self.output) + + if self.prefix is not None and not isinstance(self.prefix, str): + self.prefix = str(self.prefix) + + super().__post_init__(**kwargs) + + +@dataclass +class LexicalIndex(YAMLRoot): + """ + An index over an ontology keyed by lexical unit + """ + + _inherited_slots: ClassVar[List[str]] = [] + + class_class_uri: ClassVar[URIRef] = ONTOLEXINDEX["LexicalIndex"] + class_class_curie: ClassVar[str] = "ontolexindex:LexicalIndex" 
+ class_name: ClassVar[str] = "LexicalIndex" + class_model_uri: ClassVar[URIRef] = SYNONYMIZER.LexicalIndex + + groupings: Optional[ + Union[ + Dict[Union[str, LexicalGroupingTerm], Union[dict, "LexicalGrouping"]], + List[Union[dict, "LexicalGrouping"]], + ] + ] = empty_dict() + pipelines: Optional[ + Union[ + Dict[ + Union[str, LexicalTransformationPipelineName], + Union[dict, "LexicalTransformationPipeline"], + ], + List[Union[dict, "LexicalTransformationPipeline"]], + ] + ] = empty_dict() + + def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): + self._normalize_inlined_as_dict( + slot_name="groupings", slot_type=LexicalGrouping, key_name="term", keyed=True + ) + + self._normalize_inlined_as_dict( + slot_name="pipelines", + slot_type=LexicalTransformationPipeline, + key_name="name", + keyed=True, + ) + + super().__post_init__(**kwargs) + + +@dataclass +class LexicalGrouping(YAMLRoot): + """ + A grouping of ontology elements by a shared lexical term + """ + + _inherited_slots: ClassVar[List[str]] = [] + + class_class_uri: ClassVar[URIRef] = ONTOLEXINDEX["LexicalGrouping"] + class_class_curie: ClassVar[str] = "ontolexindex:LexicalGrouping" + class_name: ClassVar[str] = "LexicalGrouping" + class_model_uri: ClassVar[URIRef] = SYNONYMIZER.LexicalGrouping + + term: Union[str, LexicalGroupingTerm] = None + relationships: Optional[ + Union[Union[dict, "RelationshipToTerm"], List[Union[dict, "RelationshipToTerm"]]] + ] = empty_list() + + def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): + if self._is_empty(self.term): + self.MissingRequiredField("term") + if not isinstance(self.term, LexicalGroupingTerm): + self.term = LexicalGroupingTerm(self.term) + + if not isinstance(self.relationships, list): + self.relationships = [self.relationships] if self.relationships is not None else [] + self.relationships = [ + v if isinstance(v, RelationshipToTerm) else RelationshipToTerm(**as_dict(v)) + for v in self.relationships + ] + + 
super().__post_init__(**kwargs) + + +@dataclass +class RelationshipToTerm(YAMLRoot): + """ + A relationship of an ontology element to a lexical term + """ + + _inherited_slots: ClassVar[List[str]] = [] + + class_class_uri: ClassVar[URIRef] = ONTOLEXINDEX["RelationshipToTerm"] + class_class_curie: ClassVar[str] = "ontolexindex:RelationshipToTerm" + class_name: ClassVar[str] = "RelationshipToTerm" + class_model_uri: ClassVar[URIRef] = SYNONYMIZER.RelationshipToTerm + + predicate: Optional[Union[str, URIorCURIE]] = None + element: Optional[Union[str, URIorCURIE]] = None + element_term: Optional[str] = None + source: Optional[Union[str, URIorCURIE]] = None + pipeline: Optional[ + Union[ + Union[str, LexicalTransformationPipelineName], + List[Union[str, LexicalTransformationPipelineName]], + ] + ] = empty_list() + synonymized: Optional[Union[bool, Bool]] = None + + def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): + if self.predicate is not None and not isinstance(self.predicate, URIorCURIE): + self.predicate = URIorCURIE(self.predicate) + + if self.element is not None and not isinstance(self.element, URIorCURIE): + self.element = URIorCURIE(self.element) + + if self.element_term is not None and not isinstance(self.element_term, str): + self.element_term = str(self.element_term) + + if self.source is not None and not isinstance(self.source, URIorCURIE): + self.source = URIorCURIE(self.source) + + if not isinstance(self.pipeline, list): + self.pipeline = [self.pipeline] if self.pipeline is not None else [] + self.pipeline = [ + ( + v + if isinstance(v, LexicalTransformationPipelineName) + else LexicalTransformationPipelineName(v) + ) + for v in self.pipeline + ] + + if self.synonymized is not None and not isinstance(self.synonymized, Bool): + self.synonymized = Bool(self.synonymized) + + super().__post_init__(**kwargs) + + +class Activity(YAMLRoot): + """ + Generic grouping for any lexical operation + """ + + _inherited_slots: ClassVar[List[str]] = [] + + 
class_class_uri: ClassVar[URIRef] = PROV["Activity"] + class_class_curie: ClassVar[str] = "prov:Activity" + class_name: ClassVar[str] = "Activity" + class_model_uri: ClassVar[URIRef] = SYNONYMIZER.Activity + + +@dataclass +class LexicalTransformationPipeline(Activity): + """ + A collection of atomic lexical transformations that are applied in serial fashion + """ + + _inherited_slots: ClassVar[List[str]] = [] + + class_class_uri: ClassVar[URIRef] = ONTOLEXINDEX["LexicalTransformationPipeline"] + class_class_curie: ClassVar[str] = "ontolexindex:LexicalTransformationPipeline" + class_name: ClassVar[str] = "LexicalTransformationPipeline" + class_model_uri: ClassVar[URIRef] = SYNONYMIZER.LexicalTransformationPipeline + + name: Union[str, LexicalTransformationPipelineName] = None + transformations: Optional[ + Union[Union[dict, "LexicalTransformation"], List[Union[dict, "LexicalTransformation"]]] + ] = empty_list() + + def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): + if self._is_empty(self.name): + self.MissingRequiredField("name") + if not isinstance(self.name, LexicalTransformationPipelineName): + self.name = LexicalTransformationPipelineName(self.name) + + if not isinstance(self.transformations, list): + self.transformations = ( + [self.transformations] if self.transformations is not None else [] + ) + self.transformations = [ + v if isinstance(v, LexicalTransformation) else LexicalTransformation(**as_dict(v)) + for v in self.transformations + ] + + super().__post_init__(**kwargs) + + +@dataclass +class LexicalTransformation(Activity): + """ + An atomic lexical transformation applied on a term (string) yielding a transformed string + """ + + _inherited_slots: ClassVar[List[str]] = [] + + class_class_uri: ClassVar[URIRef] = ONTOLEXINDEX["LexicalTransformation"] + class_class_curie: ClassVar[str] = "ontolexindex:LexicalTransformation" + class_name: ClassVar[str] = "LexicalTransformation" + class_model_uri: ClassVar[URIRef] = 
SYNONYMIZER.LexicalTransformation + + type: Optional[Union[str, "TransformationType"]] = None + params: Optional[Union[Union[dict, "Any"], List[Union[dict, "Any"]]]] = empty_list() + + def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): + if self.type is not None and not isinstance(self.type, TransformationType): + self.type = TransformationType(self.type) + + super().__post_init__(**kwargs) + + +Any = Any + + +# Enumerations +class TransformationType(EnumDefinitionImpl): + """ + A controlled datamodels of the types of transformation that can be applied to + """ + + Stemming = PermissibleValue( + text="Stemming", + description="Removal of the last few characters of a word to yield a stem term for each word in the term", + ) + Lemmatization = PermissibleValue( + text="Lemmatization", + description="Contextual reduction of a word to its base form for each word in the term", + ) + WordOrderNormalization = PermissibleValue( + text="WordOrderNormalization", + description="reorder words in the term to a standard order such that comparisons are order-independent", + ) + Depluralization = PermissibleValue( + text="Depluralization", + description="Transform plural form to singular form for each word in a term", + ) + CaseNormalization = PermissibleValue( + text="CaseNormalization", + description="Transform term to a standard case, typically lowercase", + ) + WhitespaceNormalization = PermissibleValue( + text="WhitespaceNormalization", + description="Trim whitespace, condense whitespace runs, and transform all non-space whitespace to spaces", + ) + TermExpanson = PermissibleValue( + text="TermExpanson", description="Expand terms using a dictionary" + ) + Synonymization = PermissibleValue( + text="Synonymization", description="Applying synonymizer rules from matcher_rules.yaml" + ) + + _defn = EnumDefinition( + name="TransformationType", + description="A controlled datamodels of the types of transformation that can be applied to", + ) + + +# Slots +class slots: 
+ pass + + +slots.ruleSet__rules = Slot( + uri=SYNONYMIZER.rules, + name="ruleSet__rules", + curie=SYNONYMIZER.curie("rules"), + model_uri=SYNONYMIZER.ruleSet__rules, + domain=None, + range=Optional[Union[Union[dict, Synonymizer], List[Union[dict, Synonymizer]]]], +) + +slots.ruleSet__prefix = Slot( + uri=SYNONYMIZER.prefix, + name="ruleSet__prefix", + curie=SYNONYMIZER.curie("prefix"), + model_uri=SYNONYMIZER.ruleSet__prefix, + domain=None, + range=Optional[str], +) + +slots.synonymizer__description = Slot( + uri=SYNONYMIZER.description, + name="synonymizer__description", + curie=SYNONYMIZER.curie("description"), + model_uri=SYNONYMIZER.synonymizer__description, + domain=None, + range=Optional[str], +) + +slots.synonymizer__match = Slot( + uri=SYNONYMIZER.match, + name="synonymizer__match", + curie=SYNONYMIZER.curie("match"), + model_uri=SYNONYMIZER.synonymizer__match, + domain=None, + range=Optional[Union[str, RegularExpressionString]], +) + +slots.synonymizer__match_scope = Slot( + uri=SYNONYMIZER.match_scope, + name="synonymizer__match_scope", + curie=SYNONYMIZER.curie("match_scope"), + model_uri=SYNONYMIZER.synonymizer__match_scope, + domain=None, + range=Optional[str], +) + +slots.synonymizer__replacement = Slot( + uri=SYNONYMIZER.replacement, + name="synonymizer__replacement", + curie=SYNONYMIZER.curie("replacement"), + model_uri=SYNONYMIZER.synonymizer__replacement, + domain=None, + range=Optional[Union[str, RegularExpressionString]], +) + +slots.synonymizer__qualifier = Slot( + uri=SYNONYMIZER.qualifier, + name="synonymizer__qualifier", + curie=SYNONYMIZER.curie("qualifier"), + model_uri=SYNONYMIZER.synonymizer__qualifier, + domain=None, + range=Optional[str], +) + +slots.synonymizer__prefix = Slot( + uri=SYNONYMIZER.prefix, + name="synonymizer__prefix", + curie=SYNONYMIZER.curie("prefix"), + model_uri=SYNONYMIZER.synonymizer__prefix, + domain=None, + range=Optional[str], +) + +slots.synonymizer__in_place = Slot( + uri=SYNONYMIZER.in_place, + 
name="synonymizer__in_place", + curie=SYNONYMIZER.curie("in_place"), + model_uri=SYNONYMIZER.synonymizer__in_place, + domain=None, + range=Optional[Union[bool, Bool]], +) + +slots.synonymizer__tests = Slot( + uri=SYNONYMIZER.tests, + name="synonymizer__tests", + curie=SYNONYMIZER.curie("tests"), + model_uri=SYNONYMIZER.synonymizer__tests, + domain=None, + range=Optional[Union[Union[dict, Test], List[Union[dict, Test]]]], +) + +slots.test__input = Slot( + uri=SYNONYMIZER.input, + name="test__input", + curie=SYNONYMIZER.curie("input"), + model_uri=SYNONYMIZER.test__input, + domain=None, + range=Optional[str], +) + +slots.test__output = Slot( + uri=SYNONYMIZER.output, + name="test__output", + curie=SYNONYMIZER.curie("output"), + model_uri=SYNONYMIZER.test__output, + domain=None, + range=Optional[str], +) + +slots.test__prefix = Slot( + uri=SYNONYMIZER.prefix, + name="test__prefix", + curie=SYNONYMIZER.curie("prefix"), + model_uri=SYNONYMIZER.test__prefix, + domain=None, + range=Optional[str], +) + +slots.lexicalIndex__groupings = Slot( + uri=ONTOLEXINDEX.groupings, + name="lexicalIndex__groupings", + curie=ONTOLEXINDEX.curie("groupings"), + model_uri=SYNONYMIZER.lexicalIndex__groupings, + domain=None, + range=Optional[ + Union[ + Dict[Union[str, LexicalGroupingTerm], Union[dict, LexicalGrouping]], + List[Union[dict, LexicalGrouping]], + ] + ], +) + +slots.lexicalIndex__pipelines = Slot( + uri=ONTOLEXINDEX.pipelines, + name="lexicalIndex__pipelines", + curie=ONTOLEXINDEX.curie("pipelines"), + model_uri=SYNONYMIZER.lexicalIndex__pipelines, + domain=None, + range=Optional[ + Union[ + Dict[ + Union[str, LexicalTransformationPipelineName], + Union[dict, LexicalTransformationPipeline], + ], + List[Union[dict, LexicalTransformationPipeline]], + ] + ], +) + +slots.lexicalGrouping__term = Slot( + uri=ONTOLEXINDEX.term, + name="lexicalGrouping__term", + curie=ONTOLEXINDEX.curie("term"), + model_uri=SYNONYMIZER.lexicalGrouping__term, + domain=None, + range=URIRef, +) + 
+slots.lexicalGrouping__relationships = Slot( + uri=ONTOLEXINDEX.relationships, + name="lexicalGrouping__relationships", + curie=ONTOLEXINDEX.curie("relationships"), + model_uri=SYNONYMIZER.lexicalGrouping__relationships, + domain=None, + range=Optional[Union[Union[dict, RelationshipToTerm], List[Union[dict, RelationshipToTerm]]]], +) + +slots.relationshipToTerm__predicate = Slot( + uri=ONTOLEXINDEX.predicate, + name="relationshipToTerm__predicate", + curie=ONTOLEXINDEX.curie("predicate"), + model_uri=SYNONYMIZER.relationshipToTerm__predicate, + domain=None, + range=Optional[Union[str, URIorCURIE]], +) + +slots.relationshipToTerm__element = Slot( + uri=ONTOLEXINDEX.element, + name="relationshipToTerm__element", + curie=ONTOLEXINDEX.curie("element"), + model_uri=SYNONYMIZER.relationshipToTerm__element, + domain=None, + range=Optional[Union[str, URIorCURIE]], +) + +slots.relationshipToTerm__element_term = Slot( + uri=ONTOLEXINDEX.element_term, + name="relationshipToTerm__element_term", + curie=ONTOLEXINDEX.curie("element_term"), + model_uri=SYNONYMIZER.relationshipToTerm__element_term, + domain=None, + range=Optional[str], +) + +slots.relationshipToTerm__source = Slot( + uri=ONTOLEXINDEX.source, + name="relationshipToTerm__source", + curie=ONTOLEXINDEX.curie("source"), + model_uri=SYNONYMIZER.relationshipToTerm__source, + domain=None, + range=Optional[Union[str, URIorCURIE]], +) + +slots.relationshipToTerm__pipeline = Slot( + uri=ONTOLEXINDEX.pipeline, + name="relationshipToTerm__pipeline", + curie=ONTOLEXINDEX.curie("pipeline"), + model_uri=SYNONYMIZER.relationshipToTerm__pipeline, + domain=None, + range=Optional[ + Union[ + Union[str, LexicalTransformationPipelineName], + List[Union[str, LexicalTransformationPipelineName]], + ] + ], +) + +slots.relationshipToTerm__synonymized = Slot( + uri=ONTOLEXINDEX.synonymized, + name="relationshipToTerm__synonymized", + curie=ONTOLEXINDEX.curie("synonymized"), + model_uri=SYNONYMIZER.relationshipToTerm__synonymized, + 
domain=None, + range=Optional[Union[bool, Bool]], +) + +slots.lexicalTransformationPipeline__name = Slot( + uri=ONTOLEXINDEX.name, + name="lexicalTransformationPipeline__name", + curie=ONTOLEXINDEX.curie("name"), + model_uri=SYNONYMIZER.lexicalTransformationPipeline__name, + domain=None, + range=URIRef, +) + +slots.lexicalTransformationPipeline__transformations = Slot( + uri=ONTOLEXINDEX.transformations, + name="lexicalTransformationPipeline__transformations", + curie=ONTOLEXINDEX.curie("transformations"), + model_uri=SYNONYMIZER.lexicalTransformationPipeline__transformations, + domain=None, + range=Optional[ + Union[Union[dict, LexicalTransformation], List[Union[dict, LexicalTransformation]]] + ], +) + +slots.lexicalTransformation__type = Slot( + uri=ONTOLEXINDEX.type, + name="lexicalTransformation__type", + curie=ONTOLEXINDEX.curie("type"), + model_uri=SYNONYMIZER.lexicalTransformation__type, + domain=None, + range=Optional[Union[str, "TransformationType"]], +) + +slots.lexicalTransformation__params = Slot( + uri=ONTOLEXINDEX.params, + name="lexicalTransformation__params", + curie=ONTOLEXINDEX.curie("params"), + model_uri=SYNONYMIZER.lexicalTransformation__params, + domain=None, + range=Optional[Union[Union[dict, Any], List[Union[dict, Any]]]], +) diff --git a/src/oaklib/datamodels/synonymizer_datamodel.yaml b/src/oaklib/datamodels/synonymizer_datamodel.yaml new file mode 100644 index 000000000..4a53f2a64 --- /dev/null +++ b/src/oaklib/datamodels/synonymizer_datamodel.yaml @@ -0,0 +1,94 @@ +id: https://w3id.org/oak/synonymizer-datamodel +title: Synonymizer Datamodel +name: synonymizer_datamodel +description: >- + A datamodel for specifying synonymization rules +license: https://creativecommons.org/publicdomain/zero/1.0/ + +prefixes: + linkml: https://w3id.org/linkml/ + synonymizer: https://w3id.org/oak/synonymizer-datamodel/ + skos: http://www.w3.org/2004/02/skos/core# + pav: http://purl.org/pav/ + schema: http://schema.org/ + sh: https://w3id.org/shacl/ + prov: 
http://www.w3.org/ns/prov# + +default_prefix: synonymizer +default_range: string + +default_curi_maps: + - semweb_context + +emit_prefixes: + - linkml + - rdf + - rdfs + - xsd + - owl + +imports: + - linkml:types + - lexical_index + + +types: + RegularExpressionString: + typeof: string + + +#================================== +# Classes # +#================================== +classes: + + RuleSet: + description: A set of rules for generating synonyms or alternate lexical elements. + attributes: + rules: + description: A list of rules for generating synonyms or alternate lexical elements. + range: Synonymizer + multivalued: true + prefix: + description: The prefix that qualifies for the rule. + range: string + + Synonymizer: + description: Specification of a rule for generating a synonym or alternate lexical element. + attributes: + description: + description: Description of the rule. + range: string + match: + description: Reg-ex rule to match substrings in labels. + range: RegularExpressionString + match_scope: + description: Synonym scope of the reg-ex rule, e.g. exact, narrow + range: string + replacement: + description: Reg-ex rule to replace substrings in labels + range: RegularExpressionString + qualifier: + description: Type of match for the new synonym generated. + range: string + prefix: + description: The rule applies to nodes of a specific prefix. + range: string + in_place: + description: Whether the rule is applied in place or not. + range: boolean + tests: + description: Unit tests for each rule. + range: Test + multivalued: true + + Test: + description: A unit test for a rule, specifies an intended output for an input + attributes: + input: + description: Input string for the rule. + output: + description: Output based on the rule. + prefix: + description: The prefix that qualifies for the rule.
+ \ No newline at end of file diff --git a/src/oaklib/datamodels/vocabulary.py b/src/oaklib/datamodels/vocabulary.py index 72ffbc5c0..8dec4aa3f 100644 --- a/src/oaklib/datamodels/vocabulary.py +++ b/src/oaklib/datamodels/vocabulary.py @@ -41,6 +41,7 @@ } SYNONYM_PRED_TO_SCOPE_MAP = {v: k for k, v in SCOPE_TO_SYNONYM_PRED_MAP.items()} + DEPRECATED_PREDICATE = omd.slots.deprecated.curie TERM_REPLACED_BY = omd.slots.term_replaced_by.curie CONSIDER_REPLACEMENT = omd.slots.consider.curie @@ -176,6 +177,12 @@ NODE_DELETION = "NodeDeletion" NODE_TEXT_DEFINITION_CHANGE = "NodeTextDefinitionChange" +EXTENDED_SCOPE_TO_SYNONYM_PRED_MAP = { + "LABEL": LABEL_PREDICATE, + "DEFINITION": HAS_DEFINITION_CURIE, + **SYNONYM_PRED_TO_SCOPE_MAP, +} + class SEMAPV(Enum): """SEMAPV Enum containing different mapping_justification.""" diff --git a/src/oaklib/utilities/kgcl_utilities.py b/src/oaklib/utilities/kgcl_utilities.py index 9ac74ad89..bbb4f916a 100644 --- a/src/oaklib/utilities/kgcl_utilities.py +++ b/src/oaklib/utilities/kgcl_utilities.py @@ -103,6 +103,8 @@ def tidy_change_object(change: kgcl.Change): Sometimes the main kgcl parser will leave quotes in place, URIs quoted, etc. As these are fixed in the main KCGL repo we can remove these here. + See ``_ for more information. 
+ :param change: :return: """ diff --git a/src/oaklib/utilities/lexical/lexical_indexer.py b/src/oaklib/utilities/lexical/lexical_indexer.py index c535a94af..96ce7cbaf 100644 --- a/src/oaklib/utilities/lexical/lexical_indexer.py +++ b/src/oaklib/utilities/lexical/lexical_indexer.py @@ -34,8 +34,8 @@ from oaklib.datamodels.mapping_rules_datamodel import ( MappingRuleCollection, Precondition, - Synonymizer, ) +from oaklib.datamodels.synonymizer_datamodel import Synonymizer from oaklib.datamodels.vocabulary import ( IDENTIFIER_PREDICATE, SEMAPV, @@ -47,6 +47,7 @@ from oaklib.interfaces import BasicOntologyInterface from oaklib.types import CURIE, PRED_CURIE from oaklib.utilities.basic_utils import pairs_as_dict +from oaklib.utilities.lexical.synonymizer import apply_synonymizer LEXICAL_INDEX_FORMATS = ["yaml", "json"] DEFAULT_QUALIFIER = "exact" @@ -470,7 +471,7 @@ def precondition_holds(precondition: Precondition, mapping: Mapping) -> bool: def apply_transformation( term: str, transformation: LexicalTransformation -) -> Union[str, List[Tuple[bool, str, str]]]: +) -> Union[str, Tuple[bool, str, str]]: """ Apply an individual transformation on a term @@ -497,35 +498,6 @@ def apply_transformation( ) -def apply_synonymizer(term: str, rules: List[Synonymizer]) -> Tuple[bool, str, str]: - """ - Apply synonymizer rules declared in the given match-rules.yaml file. - - The basic concept is looking for regex in labels and replacing the ones that match - with the string passed in 'match.replacement'. Also set qualifier ('match.qualifier') - as to whether the replacement is an 'exact', 'broad', 'narrow', or 'related' synonym. - - Note: This function "yields" all intermediate results (for each rule applied) - as opposed to a final result. The reason being we only want to return a "True" - synonymized result. If the term is not synonymized, then the result will be just - the term and a default qualifier. 
In the case of multiple synonyms, the actual result - will be the latest synonymized result.In other words, all the rules have been - implemented on the term to finally produce the result. - - :param term: Original label. - :param rules: Synonymizer rules from match-rules.yaml file. - :yield: A Tuple stating [if the label changed, new label, qualifier] - """ - for rule in rules: - tmp_term_2 = term - term = re.sub(rule.match, rule.replacement, term) - - if tmp_term_2 != term: - yield True, term.strip(), rule.qualifier - else: - yield False, term.strip(), rule.qualifier - - def save_mapping_rules(mapping_rules: MappingRuleCollection, path: str): """ Saves a YAML using standard mapping of datanodel to YAML diff --git a/src/oaklib/utilities/lexical/synonymizer.py b/src/oaklib/utilities/lexical/synonymizer.py new file mode 100644 index 000000000..72e3553e6 --- /dev/null +++ b/src/oaklib/utilities/lexical/synonymizer.py @@ -0,0 +1,145 @@ +import re +from typing import Iterable, Iterator, List, Optional, Tuple + +from kgcl_schema.datamodel import kgcl + +from oaklib.datamodels.synonymizer_datamodel import RuleSet, Synonymizer +from oaklib.datamodels.vocabulary import ( + EXTENDED_SCOPE_TO_SYNONYM_PRED_MAP, +) +from oaklib.interfaces import BasicOntologyInterface +from oaklib.types import CURIE + + +def apply_synonymizer( + term: str, rules: List[Synonymizer], scope_predicate: Optional[CURIE] = None +) -> Iterator[Tuple[bool, str, str]]: + """ + Apply synonymizer rules declared in the given match-rules.yaml file. + + The basic concept is looking for regex in labels and replacing the ones that match + with the string passed in 'match.replacement'. Also set qualifier ('match.qualifier') + as to whether the replacement is an 'exact', 'broad', 'narrow', or 'related' synonym. + + Note: This function yields all intermediate results (for each rule applied) + as opposed to a final result. The reason being we only want to return a "True" + synonymized result. 
If the term is not synonymized, then the result will be just + the term and a default qualifier. In the case of multiple synonyms, the actual result + will be the latest synonymized result. In other words, all the rules have been + implemented on the term to finally produce the result. + + :param term: Original label. + :param rules: Synonymizer rules from match-rules.yaml file. + :yield: A Tuple stating [if the label changed, new label, qualifier] + """ + for rule in rules: + if not scope_matches(rule, scope_predicate): + continue + tmp_term_2 = term + term = re.sub(rule.match, rule.replacement, term) + + if tmp_term_2 != term: + yield True, term.strip(), rule.qualifier + else: + yield False, term.strip(), rule.qualifier + + +def apply_synonymizer_to_terms( + adapter: BasicOntologyInterface, + terms: Iterable[CURIE], + ruleset: RuleSet, + include_all=False, +) -> Iterator[kgcl.NewSynonym]: + """ + Apply synonymizer rules to a list of terms. + + :param adapter: + :param terms: + :param ruleset: + :param include_all: + :return: + """ + n = 0 + for curie in terms: + tvs = list(adapter.entity_alias_map(curie).items()) + if include_all: + defn = adapter.definition(curie) + if defn: + tvs.append(("definition", [defn])) + for scope_pred, aliases in tvs: + if aliases is not None: + for alias in aliases: + if alias: + for rule in ruleset.rules: + for replaced, new_alias, qualifier in apply_synonymizer( + alias, [rule], scope_pred + ): + if replaced: + if qualifier is None or qualifier == "": + qualifier = "exact" + n += 1 + change_id = f"kgcl_change_id_{n}" + if qualifier == "label": + change = kgcl.NodeRename( + id=change_id, + about_node=curie, + old_value=alias, + new_value=new_alias, + ) + elif qualifier == "definition": + change = kgcl.NodeTextDefinitionChange( + id=change_id, + about_node=curie, + old_value=alias, + new_value=new_alias, + ) + else: + if rule.in_place: + change = kgcl.SynonymReplacement( + id=change_id, + about_node=curie, + old_value=alias, +
new_value=new_alias, + ) + else: + change = kgcl.NewSynonym( + id=change_id, + about_node=curie, + old_value=alias, + new_value=new_alias, + qualifier=qualifier, + ) + yield change + + +def scope_matches(rule: Synonymizer, scope_predicate: Optional[CURIE]) -> bool: + """ + Check if the rule scope matches the scope_predicate. + + >>> scope_matches(Synonymizer(match_scope="EXACT"), "oio:hasExactSynonym") + True + >>> scope_matches(Synonymizer(match_scope="EXACT"), "oio:hasRelatedSynonym") + False + >>> scope_matches(Synonymizer(match_scope="*"), "oio:hasRelatedSynonym") + True + >>> scope_matches(Synonymizer(), "oio:hasExactSynonym") + True + + :param rule: Synonymizer rule. + :param scope_predicate: Scope predicate. + :return: True if the rule scope matches the scope_predicate. + """ + if scope_predicate is None: + return True + if rule.match_scope is None: + return True + if rule.match_scope == "*" or rule.match_scope == "": + return True + rule_match_scope = rule.match_scope.upper() + if rule_match_scope == scope_predicate.upper(): + return True + if rule_match_scope in EXTENDED_SCOPE_TO_SYNONYM_PRED_MAP: + rule_match_scope_predicate = EXTENDED_SCOPE_TO_SYNONYM_PRED_MAP[rule_match_scope] + if rule_match_scope_predicate == scope_predicate: + return True + return False diff --git a/tests/input/cli-synonymizer-rules.yaml b/tests/input/cli-synonymizer-rules.yaml new file mode 100644 index 000000000..f5c4093f6 --- /dev/null +++ b/tests/input/cli-synonymizer-rules.yaml @@ -0,0 +1,13 @@ +rules: + - description: Remove parentheses bound info from the label. + match: "\\([^)]*\\)" + match_scope: "*" + replacement: "" + + - description: Remove box brackets bound info from the label. 
+ match: "\\[[^)]*\\]" + match_scope: "*" + replacement: "" + + + diff --git a/tests/input/matcher_rules.yaml b/tests/input/matcher_rules.yaml index 9ff4cb030..c47ce14bc 100644 --- a/tests/input/matcher_rules.yaml +++ b/tests/input/matcher_rules.yaml @@ -63,13 +63,13 @@ rules: weight: 2.0 - synonymizer: - the_rule: Remove parentheses bound info from the label. + description: Remove parentheses bound info from the label. match: "\\([^)]*\\)" match_scope: "*" replacement: "" - synonymizer: - the_rule: Remove box brackets bound info from the label. + description: Remove box brackets bound info from the label. match: "\\[[^)]*\\]" match_scope: "*" replacement: "" diff --git a/tests/test_cli.py b/tests/test_cli.py index dd31013e9..8a7eb1b81 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -62,6 +62,7 @@ TEST_SSSOM_MAPPING = INPUT_DIR / "unreciprocated-mapping-test.sssom.tsv" TEST_SYNONYMIZER_OBO = "simpleobo:" + str(INPUT_DIR / "synonym-test.obo") RULES_FILE = INPUT_DIR / "matcher_rules.yaml" +SYNONYMIZER_RULES_FILE = INPUT_DIR / "cli-synonymizer-rules.yaml" def _outpath(test: str, fmt: str = "tmp") -> str: @@ -1396,7 +1397,7 @@ def test_generate_synonyms_and_apply(self): TEST_SYNONYMIZER_OBO, "generate-synonyms", "-R", - RULES_FILE, + SYNONYMIZER_RULES_FILE, "--patch", patch_file, "--apply-patch", @@ -1423,7 +1424,7 @@ def test_generate_synonyms_no_apply(self): TEST_SYNONYMIZER_OBO, "generate-synonyms", "-R", - RULES_FILE, + SYNONYMIZER_RULES_FILE, "-o", patch_file, ".all", diff --git a/tests/test_utilities/test_lexical_index.py b/tests/test_utilities/test_lexical_index.py index 0184333d9..9882add13 100644 --- a/tests/test_utilities/test_lexical_index.py +++ b/tests/test_utilities/test_lexical_index.py @@ -5,7 +5,7 @@ LexicalTransformationPipeline, TransformationType, ) -from oaklib.datamodels.mapping_rules_datamodel import Synonymizer +from oaklib.datamodels.synonymizer_datamodel import Synonymizer from oaklib.implementations.pronto.pronto_implementation 
import ProntoImplementation from oaklib.implementations.simpleobo.simple_obo_implementation import ( SimpleOboImplementation, @@ -56,19 +56,19 @@ def test_pipelines(self): builder.build() syn_param = [ Synonymizer( - the_rule="Remove parentheses bound info from the label.", + description="Remove parentheses bound info from the label.", match=r"\([^)]*\)", # noqa W605 match_scope="*", replacement="", ), Synonymizer( - the_rule="Remove box brackets bound info from the label.", + description="Remove box brackets bound info from the label.", match=r"\[[^)]*\]", # noqa W605 match_scope="*", replacement="", ), Synonymizer( - the_rule="Broad match terms with the term 'other' in them.", + description="Broad match terms with the term 'other' in them.", match=r"(?i)^Other ", # noqa W605 match_scope="*", replacement="", @@ -159,7 +159,7 @@ def test_synonymizer_with_other(self): oi = SimpleOboImplementation(resource) syn_param = [ Synonymizer( - the_rule="Broad match terms with the term 'other' in them.", + description="Broad match terms with the term 'other' in them.", match="(?i)^Other ", # noqa W605 match_scope="*", replacement="", diff --git a/tests/test_utilities/test_synonymizer.py b/tests/test_utilities/test_synonymizer.py new file mode 100644 index 000000000..4f30c5fd5 --- /dev/null +++ b/tests/test_utilities/test_synonymizer.py @@ -0,0 +1,99 @@ +import pytest +from kgcl_schema.grammar.render_operations import render +from oaklib import get_adapter +from oaklib.datamodels.synonymizer_datamodel import RuleSet, Synonymizer +from oaklib.utilities.lexical.synonymizer import apply_synonymizer, apply_synonymizer_to_terms + +from tests import CYTOPLASM, INPUT_DIR, NUCLEAR_MEMBRANE, NUCLEUS + +TEST_SIMPLE_ONT = INPUT_DIR / "go-nucleus-simple.obo" + + +@pytest.mark.parametrize( + "rule,input,expected", + [ + ( + {"match": "world", "replacement": "universe"}, + "hello world", + [(True, "hello universe", None)], + ), + ( + {"match": "world", "replacement": "universe", 
"qualifier": "broad"}, + "hello world", + [(True, "hello universe", "broad")], + ), + ( + {"match": "world", "replacement": "universe"}, + "hello universe", + [(False, "hello universe", None)], + ), + ( + {"match": r"hello (\w+)", "replacement": r"\1, hello"}, + "hello world", + [(True, "world, hello", None)], + ), + ({"match": r"\bfoo\b", "replacement": "bar"}, "foo baz", [(True, "bar baz", None)]), + ({"match": r"\bfoo\b", "replacement": "bar"}, "foo-baz", [(True, "bar-baz", None)]), + ({"match": r"\bfoo\b", "replacement": "bar"}, "", [(True, "", None)]), + ({"match": r"\bfoo\b", "replacement": "bar"}, "baz foo", [(True, "baz bar", None)]), + ({"match": r"\bfoo\b", "replacement": "bar"}, "food baz", [(False, "food baz", None)]), + ], +) +def test_synonymizer(rule, input, expected): + s = Synonymizer(**rule) + results = list(apply_synonymizer(input, [s])) + assert results == expected + + +@pytest.mark.parametrize( + "ruleset,include_all,terms,expected", + [ + ( + Synonymizer(match=r"nuclear (\w+)", replacement=r"\1 of nucleus"), + False, + [NUCLEUS, NUCLEAR_MEMBRANE], + ["create exact synonym 'membrane of nucleus' for GO:0031965"], + ), + ( + Synonymizer(match=r"nuclear (\w+)", replacement=r"\1 of nucleus", match_scope="label"), + False, + [NUCLEUS, NUCLEAR_MEMBRANE], + ["create exact synonym 'membrane of nucleus' for GO:0031965"], + ), + ( + Synonymizer(match=r"nuclear (\w+)", replacement=r"\1 of nucleus", match_scope="exact"), + False, + [NUCLEUS, NUCLEAR_MEMBRANE], + [], + ), + ( + Synonymizer( + match=r"nucleus", + replacement="NUCLEUS", + match_scope="definition", + qualifier="definition", + ), + True, + [CYTOPLASM], + [ + ( + "change definition of GO:0005737 from All of the contents of a cell excluding " + "the plasma membrane and nucleus, but including other subcellular structures. " + "to All of the contents of a cell excluding the plasma membrane and " + "NUCLEUS, but including other subcellular structures." 
+ ) + ], + ), + ], +) +def test_syonymizer_on_terms(ruleset, include_all, terms, expected): + adapter = get_adapter(TEST_SIMPLE_ONT) + if isinstance(ruleset, Synonymizer): + ruleset = RuleSet(rules=[ruleset]) + changes = list(apply_synonymizer_to_terms(adapter, terms, ruleset, include_all=include_all)) + changes_strs = [] + for change in changes: + change_str = render(change) + print(change_str) + changes_strs.append(change_str) + assert set(changes_strs) == set(expected)