synonymizer refactor2 (#728)
* synonymizer refactoring.

Moving synonymizer from mapping rules to its own data model.
Moving logic from the CLI to utils.

* Command line

* Add missing
cmungall authored Mar 28, 2024
1 parent db4c2c8 commit 06f1220
Showing 15 changed files with 1,189 additions and 162 deletions.
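
The refactor summarized in the commit message can be exercised directly from Python. The following is a minimal sketch (not part of the commit; `foo.obo` and `foo_rules.yaml` are hypothetical inputs, and `get_adapter` is the usual OAK entry point) of the new flow: load the rules into the synonymizer data model, then let the utility yield KGCL changes.

```python
import yaml

from oaklib import get_adapter
from oaklib.datamodels import synonymizer_datamodel
from oaklib.utilities.lexical import synonymizer

# Hypothetical inputs: any ontology OAK can open, plus a rules file that
# conforms to the synonymizer data model introduced by this commit.
impl = get_adapter("foo.obo")
with open("foo_rules.yaml") as f:
    ruleset = synonymizer_datamodel.RuleSet(**yaml.safe_load(f))

# apply_synonymizer_to_terms yields one KGCL change object per matching alias.
for change in synonymizer.apply_synonymizer_to_terms(impl, ["EX:0000001"], ruleset):
    print(change)
```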
5 changes: 5 additions & 0 deletions Makefile
@@ -22,6 +22,11 @@ src/oaklib/datamodels/%.py: src/oaklib/datamodels/%.yaml
# $(RUN) gen-pydantic $< > $@.tmp && mv $@.tmp $@
$(RUN) gen-python $< > $@.tmp && mv $@.tmp $@
$(RUN) tox -e lint

src/oaklib/datamodels/synonymizer.py: src/oaklib/datamodels/synonymizer.yaml
$(RUN) gen-pydantic $< > $@.tmp && mv $@.tmp $@


src/oaklib/datamodels/%.schema.json: src/oaklib/datamodels/%.yaml
$(RUN) gen-json-schema $< > $@.tmp && mv $@.tmp $@
src/oaklib/datamodels/%.owl.ttl: src/oaklib/datamodels/%.yaml
148 changes: 107 additions & 41 deletions src/oaklib/cli.py
@@ -52,9 +52,9 @@
import oaklib.datamodels.taxon_constraints as tcdm
from oaklib import datamodels
from oaklib.converters.logical_definition_flattener import LogicalDefinitionFlattener
from oaklib.datamodels import synonymizer_datamodel
from oaklib.datamodels.association import RollupGroup
from oaklib.datamodels.cross_ontology_diff import DiffCategory
from oaklib.datamodels.lexical_index import LexicalTransformation, TransformationType
from oaklib.datamodels.obograph import (
BasicPropertyValue,
Edge,
@@ -167,11 +167,9 @@
parse_kgcl_files,
write_kgcl,
)
from oaklib.utilities.lexical import patternizer
from oaklib.utilities.lexical import patternizer, synonymizer
from oaklib.utilities.lexical.lexical_indexer import (
DEFAULT_QUALIFIER,
add_labels_from_uris,
apply_transformation,
create_lexical_index,
lexical_index_to_sssom,
load_lexical_index,
@@ -6496,46 +6494,114 @@ def generate_synonyms(terms, rules_file, apply_patch, patch, patch_format, outpu
else:
writer = _get_writer(output_type, impl, StreamingKGCLWriter, kgcl)
writer.output = output
# TODO: Eventually get this from settings as above
ruleset = synonymizer_datamodel.RuleSet(**yaml.safe_load(open(rules_file)))
change_list = []
curie_iter = query_terms_iterator(terms, impl)
for change in synonymizer.apply_synonymizer_to_terms(impl, curie_iter, ruleset):
change_list.append(change)
writer.emit(change)

writer.finish()
if apply_patch and len(change_list) > 0:
if output:
impl.resource.slug = output
_apply_changes(impl, change_list)


@main.command()
@click.argument("terms", nargs=-1)
@click.option(
"--rules-file",
"-R",
help="path to rules file. Conforms to rules_datamodel.\
e.g. https://github.com/INCATools/ontology-access-kit/blob/main/tests/input/matcher_rules.yaml",
)
@click.option(
"--rules-expression",
"-Y",
multiple=True,
help="YAML encoding of a rules expression",
)
@click.option(
"--apply-patch/--no-apply-patch",
default=False,
show_default=True,
help="Apply KGCL syntax generated based on the synonymizer rules file.",
)
@click.option(
"--patch",
type=click.File(mode="w"),
default=sys.stdout,
help="Path to where patch file will be written.",
)
@click.option(
"--patch-format",
help="Output syntax for patches.",
)
@output_option
@output_type_option
def generate_lexical_replacements(
terms, rules_file, rules_expression, apply_patch, patch, patch_format, output, output_type
):
"""
Generate lexical replacements based on a set of synonymizer rules.
If the `--apply-patch` flag is set, the output will be an ontology file with the changes
applied. Pass the `--patch` argument to also get the patch file in KGCL format.
Example:
-------
runoak -i foo.obo generate-lexical-replacements -R foo_rules.yaml\
--patch patch.kgcl --apply-patch -o foo_syn.obo
If the `--apply-patch` flag is NOT set then the main output will be KGCL commands
Example:
-------
runoak -i foo.obo generate-lexical-replacements -R foo_rules.yaml -o changes.kgcl
You can also pass the expressions directly as YAML
Example:
-------
runoak -i foo.obo generate-lexical-replacements \
-Y '{match: "nuclear (\\w+)", replacement: "\\1 nucleus"}' .all
see https://github.com/INCATools/kgcl.
Note: this command is very similar to generate-synonyms, but the main use case here
is replacing terms, and applying rules to other elements such as definitions
"""
impl = settings.impl
if apply_patch:
writer = _get_writer(patch_format, impl, StreamingKGCLWriter, kgcl)
writer.output = patch
else:
writer = _get_writer(output_type, impl, StreamingKGCLWriter, kgcl)
writer.output = output
if rules_file:
ruleset = load_mapping_rules(rules_file)
ruleset = synonymizer_datamodel.RuleSet(**yaml.safe_load(open(rules_file)))
elif rules_expression:
ruleset = synonymizer_datamodel.RuleSet()
for rule_expression in rules_expression:
rule = synonymizer_datamodel.Synonymizer(**yaml.safe_load(rule_expression))
ruleset.rules.append(rule)
else:
ruleset = None
if not isinstance(impl, OboGraphInterface):
raise NotImplementedError
syn_rules = [x.synonymizer for x in ruleset.rules if x.synonymizer]
terms_to_synonymize = {}
raise ValueError("Must specify either --rules-file or --rules-expression")
change_list = []
for curie in query_terms_iterator(terms, impl):
# for rule in syn_rules:
for _, aliases in impl.entity_alias_map(curie).items():
matches = []
if aliases is not None:
# matches.extend([x for x in aliases if re.search(eval(rule.match), x) is not None])
for alias in aliases:
if alias:
synonymized, new_alias, qualifier = apply_transformation(
alias,
LexicalTransformation(
TransformationType.Synonymization, params=syn_rules
),
)
if synonymized:
matches.append(new_alias)

if len(matches) > 0:
if qualifier is None or qualifier == "":
qualifier = DEFAULT_QUALIFIER
terms_to_synonymize[curie] = matches
change = kgcl.NewSynonym(
id="kgcl_change_id_" + str(len(terms_to_synonymize)),
about_node=curie,
old_value=alias,
new_value=new_alias,
qualifier=qualifier,
)
change_list.append(change)
writer.emit(change)
curie_iter = query_terms_iterator(terms, impl)
for change in synonymizer.apply_synonymizer_to_terms(
impl, curie_iter, ruleset, include_all=True
):
change_list.append(change)
writer.emit(change)

writer.finish()
if apply_patch and len(change_list) > 0:
if output:
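
For reference, the inline `-Y` branch above builds the same data-model objects as the rules-file branch. A standalone sketch mirroring that code path (the regex is the one from the docstring example; everything else is illustrative):

```python
import yaml

from oaklib.datamodels import synonymizer_datamodel

# Each -Y expression is YAML for a single Synonymizer rule; rules are
# collected into a RuleSet, exactly as in the CLI code above.
ruleset = synonymizer_datamodel.RuleSet()
expression = r'{match: "nuclear (\\w+)", replacement: "\\1 nucleus"}'
rule = synonymizer_datamodel.Synonymizer(**yaml.safe_load(expression))
ruleset.rules.append(rule)
print(rule.match, "->", rule.replacement)  # nuclear (\w+) -> \1 nucleus
```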
82 changes: 4 additions & 78 deletions src/oaklib/datamodels/mapping_rules_datamodel.py
@@ -7,39 +7,33 @@
# license: https://creativecommons.org/publicdomain/zero/1.0/

import dataclasses
import re
import sys
from dataclasses import dataclass
from typing import Any, ClassVar, Dict, List, Optional, Union

from jsonasobj2 import JsonObj, as_dict
from jsonasobj2 import as_dict
from linkml_runtime.linkml_model.meta import (
EnumDefinition,
PermissibleValue,
PvFormulaOptions,
)
from linkml_runtime.linkml_model.types import Boolean, Float, String, Uriorcurie
from linkml_runtime.utils.curienamespace import CurieNamespace
from linkml_runtime.utils.dataclass_extensions_376 import (
dataclasses_init_fn_with_kwargs,
)
from linkml_runtime.utils.enumerations import EnumDefinitionImpl
from linkml_runtime.utils.formatutils import camelcase, sfx, underscore
from linkml_runtime.utils.metamodelcore import (
Bool,
URIorCURIE,
bnode,
empty_dict,
empty_list,
)
from linkml_runtime.utils.slot import Slot
from linkml_runtime.utils.yamlutils import (
YAMLRoot,
extended_float,
extended_int,
extended_str,
)
from rdflib import Namespace, URIRef
from rdflib import URIRef

from oaklib.datamodels.synonymizer_datamodel import Synonymizer, Test

metamodel_version = "1.7.0"
version = None
@@ -253,74 +247,6 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
super().__post_init__(**kwargs)


@dataclass
class Synonymizer(YAMLRoot):
_inherited_slots: ClassVar[List[str]] = []

class_class_uri: ClassVar[URIRef] = MAPPINGRULES.Synonymizer
class_class_curie: ClassVar[str] = "mappingrules:Synonymizer"
class_name: ClassVar[str] = "Synonymizer"
class_model_uri: ClassVar[URIRef] = MAPPINGRULES.Synonymizer

the_rule: Optional[str] = None
match: Optional[str] = None
match_scope: Optional[str] = None
replacement: Optional[str] = None
qualifier: Optional[str] = None
prefix: Optional[str] = None
tests: Optional[Union[dict, "Test"]] = None

def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
if self.the_rule is not None and not isinstance(self.the_rule, str):
self.the_rule = str(self.the_rule)

if self.match is not None and not isinstance(self.match, str):
self.match = str(self.match)

if self.match_scope is not None and not isinstance(self.match_scope, str):
self.match_scope = str(self.match_scope)

if self.replacement is not None and not isinstance(self.replacement, str):
self.replacement = str(self.replacement)

if self.qualifier is not None and not isinstance(self.qualifier, str):
self.qualifier = str(self.qualifier)

if self.prefix is not None and not isinstance(self.prefix, str):
self.prefix = str(self.prefix)

if self.tests is not None and not isinstance(self.tests, Test):
self.tests = Test(**as_dict(self.tests))

super().__post_init__(**kwargs)


@dataclass
class Test(YAMLRoot):
_inherited_slots: ClassVar[List[str]] = []

class_class_uri: ClassVar[URIRef] = MAPPINGRULES.Test
class_class_curie: ClassVar[str] = "mappingrules:Test"
class_name: ClassVar[str] = "Test"
class_model_uri: ClassVar[URIRef] = MAPPINGRULES.Test

input: Optional[str] = None
output: Optional[str] = None
prefix: Optional[str] = None

def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
if self.input is not None and not isinstance(self.input, str):
self.input = str(self.input)

if self.output is not None and not isinstance(self.output, str):
self.output = str(self.output)

if self.prefix is not None and not isinstance(self.prefix, str):
self.prefix = str(self.prefix)

super().__post_init__(**kwargs)


@dataclass
class LexicalIndex(YAMLRoot):
"""
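
Note that the module now imports `Synonymizer` and `Test` from `synonymizer_datamodel` (see the added import above), so code that previously imported these classes from `mapping_rules_datamodel` should still resolve to the same classes. A quick sanity check, assuming a build with this commit applied:

```python
from oaklib.datamodels.mapping_rules_datamodel import Synonymizer as OldPath
from oaklib.datamodels.synonymizer_datamodel import Synonymizer as NewPath

# Both import paths should now name the single class defined in
# synonymizer_datamodel; the old path is just a re-export.
assert OldPath is NewPath
```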
6 changes: 3 additions & 3 deletions src/oaklib/datamodels/mapping_rules_datamodel.yaml
@@ -2,7 +2,7 @@ id: https://w3id.org/oak/mapping-rules-datamodel
title: Mapping Rules Datamodel
name: mapping-rules-datamodel
description: >-
A datamodel for specifying lexical mapping rules.
A datamodel for specifying lexical mapping rules
license: https://creativecommons.org/publicdomain/zero/1.0/

prefixes:
@@ -30,6 +30,7 @@ emit_prefixes:
imports:
- linkml:types
- lexical_index
- synonymizer_datamodel



@@ -86,7 +87,6 @@ classes:
predicate_id_one_of:
multivalued: true


Postcondition:
attributes:
predicate_id:
@@ -101,7 +101,7 @@

Synonymizer:
attributes:
the_rule:
description:
description: Description of the rule.
range: string
match:
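
The hunk above renames the rule's `the_rule` slot to `description`. Purely as an illustration (field names taken from this diff; the values are invented), a rule in the renamed model might look like this:

```python
from oaklib.datamodels import synonymizer_datamodel

# Hypothetical rule: rewrite "nuclear X" aliases to "X nucleus", with the
# renamed description field documenting the intent.
rule = synonymizer_datamodel.Synonymizer(
    description="swap 'nuclear <part>' to '<part> nucleus'",
    match=r"nuclear (\w+)",
    replacement=r"\1 nucleus",
)
```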