From d549219dc8b929b503cc79e8be2a0a2a12cc7625 Mon Sep 17 00:00:00 2001 From: Gregor Middell Date: Fri, 24 Jan 2025 16:28:54 +0100 Subject: [PATCH] feat: Add spaCy integration as single-threaded pipeline component --- dwdsmor/automaton.py | 2 +- dwdsmor/spacy.py | 76 ++ dwdsmor/{tag.py => tag/__init__.py} | 0 dwdsmor/tag/hdt.py | 105 +++ pyproject.toml | 4 +- test/__snapshots__/test_spacy.ambr | 1067 +++++++++++++++++++++++++++ test/test_spacy.py | 36 + 7 files changed, 1287 insertions(+), 3 deletions(-) create mode 100644 dwdsmor/spacy.py rename dwdsmor/{tag.py => tag/__init__.py} (100%) create mode 100644 dwdsmor/tag/hdt.py create mode 100644 test/__snapshots__/test_spacy.ambr create mode 100644 test/test_spacy.py diff --git a/dwdsmor/automaton.py b/dwdsmor/automaton.py index 8c89a02..5720b8e 100644 --- a/dwdsmor/automaton.py +++ b/dwdsmor/automaton.py @@ -209,7 +209,7 @@ def __init__(self, automata, automaton_type="lemma"): def __call__(self, word, **criteria): traversals = tuple(self.analyzer.analyze(word)) - criteria_stack = list(criteria.items()) + criteria_stack = list((k, v) for k, v in criteria.items() if v) criteria_stack.reverse() while criteria_stack: if len(traversals) == 1: diff --git a/dwdsmor/spacy.py b/dwdsmor/spacy.py new file mode 100644 index 0000000..08d18cf --- /dev/null +++ b/dwdsmor/spacy.py @@ -0,0 +1,76 @@ +from collections import OrderedDict +from functools import cache +from typing import Iterable + +from spacy.language import Language +from spacy.tokens.token import Token + +import dwdsmor.tag.hdt as hdt + +from . import lemmatizer +from .automaton import Lemmatizer + +Token.set_extension("dwdsmor_lemma", default=None) + + +def criterion(k, v, mapping): + return (k, mapping.get(v, {v}) if v else None) + + +@cache +def criteria(pos, number, gender, case, person, tense, degree, mood, nonfinite): + return OrderedDict( + ( + criterion("pos", pos, hdt.pos_map), + criterion("number", number, hdt.number_map), + criterion("gender", gender, hdt.gender_map), + criterion("case", case, hdt.case_map), + criterion("person", person, hdt.person_map), + criterion("tense", tense, hdt.tense_map), + criterion("degree", degree, hdt.degree_map), + criterion("mood", mood, hdt.mood_map), + criterion("nonfinite", nonfinite, hdt.nonfinite_map), + ) + ) + + +def morph(token_morph, k): + v = ",".join(token_morph.get(k)) + return v if v else None + + +def lemmatize_token(lemmatizer: Lemmatizer, token: Token): + token_morph = token.morph + token_criteria = criteria( + token.tag_, + morph(token_morph, "Number"), + morph(token_morph, "Gender"), + morph(token_morph, "Case"), + morph(token_morph, "Person"), + morph(token_morph, "Tense"), + morph(token_morph, "Degree"), + morph(token_morph, "Mood"), + morph(token_morph, "VerbForm"), + ) + token._.dwdsmor_lemma = lemmatizer(token.text, **token_criteria) + return token + + +def lemmatize(lemmatizer: Lemmatizer, tokens: Iterable[Token]): + for token in tokens: + lemmatize_token(lemmatizer, token) + return tokens + + +class Component: + def __init__(self, automata_location=None): + self.lemmatizer = lemmatizer(automata_location) + + def __call__(self, doc): + lemmatize(self.lemmatizer, doc) + return doc + + +@Language.factory("dwdsmor", default_config={"automata_location": None}) +def create_component(nlp: Language, name: str, automata_location: str | None): + return Component(automata_location) diff --git a/dwdsmor/tag.py b/dwdsmor/tag/__init__.py similarity index 100% rename from dwdsmor/tag.py rename to dwdsmor/tag/__init__.py diff --git a/dwdsmor/tag/hdt.py b/dwdsmor/tag/hdt.py new file mode 100644 index 0000000..6103223 --- /dev/null +++ b/dwdsmor/tag/hdt.py @@ -0,0 +1,105 @@ +pos_map = { + "$(": {"+PUNCT"}, + "$,": {"+PUNCT"}, + "$.": {"+PUNCT"}, + "ADJA": {"+ADJ", "+CARD", "+INDEF", "+ORD"}, + "ADJD": {"+ADJ"}, + "ADV": {"+ADV"}, + "APPO": {"+POSTP"}, + "APPR": {"+PREP"}, + "APPR_ART": {"+PREPART"}, + "APZR": {"+POSTP", "+PREP"}, + "ART": {"+ART"}, + "CARD": {"+CARD"}, + "FM": {"+FM"}, # ? + "ITJ": {"+INTJ"}, + "KOKOM": {"+CONJ"}, + "KON": {"+CONJ"}, + "KOUI": {"+CONJ"}, + "KOUS": {"+CONJ"}, + "NE": {"+NN", "+NPROP"}, + "NN": {"+NN", "+NPROP"}, + "PDAT": {"+DEM"}, + "PDS": {"+DEM"}, + "PIAT": {"+INDEF"}, + "PIDAT": {"+INDEF"}, + "PIS": {"+INDEF"}, + "PPER": {"+PPRO"}, + "PPOSAT": {"+POSS"}, + "PPOSS": {"+POSS"}, + "PRELAT": {"+REL"}, + "PRELS": {"+REL"}, + "PRF": {"+PPRO"}, + "PROAV": {"+ADV", "+PROADV"}, + "PTKA": {"+PTCL"}, + "PTKANT": {"+INTJ", "+PTCL"}, + "PTKNEG": {"+PTCL"}, + "PTKVZ": {"+ADV", "+PREP", "+VPART"}, + "PTKZU": {"+PTCL"}, + "PWAT": {"+WPRO"}, + "PWAV": {"+ADV"}, + "PWS": {"+WPRO"}, + "TRUNC": {"+TRUNC"}, # ? + "VAFIN": {"+V"}, + "VAIMP": {"+V"}, + "VAINF": {"+V"}, + "VAPP": {"+V"}, + "VMFIN": {"+V"}, + "VMINF": {"+V"}, + "VMPP": {"+V"}, + "VVFIN": {"+V"}, + "VVIMP": {"+V"}, + "VVINF": {"+V"}, + "VVIZU": {"+V"}, + "VVPP": {"+V"}, + "XY": {"+XY"}, # ? +} + +number_map = { + "Sing": {"Sg"}, + "Plur": {"Pl"}, +} + + +gender_map = { + "Masc,Neut": {"Masc", "Neut"}, + "Neut": {"Neut"}, + "Fem": {"Fem"}, + "Masc": {"Masc"}, +} + +case_map = { + "Nom": {"Nom"}, + "Gen": {"Gen"}, + "Dat": {"Dat"}, + "Acc": {"Acc"}, +} + +person_map = { + "1": {"1"}, + "2": {"2"}, + "3": {"3"}, +} + +tense_map = { + "Past": {"Past"}, + "Pres": {"Pres"}, +} + + +degree_map = { + "Cmp": {"Comp"}, + "Sup": {"Sup"}, + "Pos": {"Pos"}, +} + +mood_map = { + "Ind": {"Ind"}, + "Imp": {"Imp"}, +} + +# VerbForm +nonfinite_map = { + "Part": {"Part"}, + "Inf": {"Inf"}, +} diff --git a/pyproject.toml b/pyproject.toml index 1789476..ac7335f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,8 +39,8 @@ dev = [ "pytest", "syrupy", "tqdm", - "Jinja2" -] + "Jinja2", + "de_hdt_lg @ https://huggingface.co/zentrum-lexikographie/de_hdt_lg/resolve/main/de_hdt_lg-any-py3-none-any.whl#sha256=44bd0b0299865341ee1756efd60670fa148dbfd2a14d0c1d5ab99c61af08236a"] [project.scripts] dwdsmor = "dwdsmor.cli:main" diff --git a/test/__snapshots__/test_spacy.ambr b/test/__snapshots__/test_spacy.ambr new file mode 100644 index 0000000..5b39374 --- /dev/null +++ b/test/__snapshots__/test_spacy.ambr @@ -0,0 +1,1067 @@ +# serializer version: 1 +# name: test_lemmatisation + tuple( + tuple( + 'Sehr', + 'ADV', + 'Sehr', + 'sehr', + ), + tuple( + 'gute', + 'ADJA', + 'gut', + 'gut', + ), + tuple( + 'Beratung', + 'NN', + 'Beratung', + 'Beratung', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'schnelle', + 'ADJA', + 'schnell', + 'schnell', + ), + tuple( + 'Behebung', + 'NN', + 'Behebung', + 'Behebung', + ), + tuple( + 'der', + 'ART', + 'der', + 'die', + ), + tuple( + 'Probleme', + 'NN', + 'Problem', + 'Problem', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'so', + 'ADV', + 'so', + 'so', + ), + tuple( + 'stelle', + 'VVFIN', + 'stellen', + 'stellen', + ), + tuple( + 'ich', + 'PPER', + 'ich', + 'ich', + ), + tuple( + 'mir', + 'PRF', + 'ich', + 'ich', + ), + tuple( + 'Kundenservice', + 'NN', + 'Service', + 'Kundenservice', + ), + tuple( + 'vor', + 'PTKVZ', + 'vor', + 'vor', + ), + tuple( + '.', + '$.', + '.', + '.', + ), + tuple( + 'Die', + 'ART', + 'der', + 'die', + ), + tuple( + 'Kosten', + 'NN', + 'Kosten', + 'Kosten', + ), + tuple( + 'sind', + 'VAFIN', + 'sein', + 'sein', + ), + tuple( + 'definitiv', + 'ADJD', + 'definitiv', + 'definitiv', + ), + tuple( + 'auch', + 'ADV', + 'auch', + 'auch', + ), + tuple( + 'im', + 'APPR_ART', + 'in der', + 'in', + ), + tuple( + 'Rahmen', + 'NN', + 'Rahmen', + 'Rahmen', + ), + tuple( + '.', + '$.', + '.', + '.', + ), + tuple( + 'Nette', + 'ADJA', + 'Nette', + 'nett', + ), + tuple( + 'Gespräche', + 'NN', + 'Gespräch', + 'Gespräch', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'klasse', + 'ADJA', + 'klasse', + 'klasse', + ), + tuple( + 'Ergebnis', + 'NN', + 'Ergebnis', + 'Ergebnis', + ), + tuple( + 'Ich', + 'PPER', + 'ich', + 'ich', + ), + tuple( + 'bin', + 'VAFIN', + 'sein', + 'sein', + ), + tuple( + 'seit', + 'APPR', + 'seit', + 'seit', + ), + tuple( + 'längerer', + 'ADJA', + 'lang', + 'lang', + ), + tuple( + 'Zeit', + 'NN', + 'Zeit', + 'Zeit', + ), + tuple( + 'zur', + 'APPR_ART', + 'zu der', + 'zu', + ), + tuple( + 'Behandlung', + 'NN', + 'Behandlung', + 'Behandlung', + ), + tuple( + 'verschiedenster', + 'ADJA', + 'verschieden', + 'verschieden', + ), + tuple( + '"', + '$(', + '"', + '"', + ), + tuple( + 'Leiden', + 'NN', + 'Leiden', + 'Leiden', + ), + tuple( + '"', + '$(', + '"', + '"', + ), + tuple( + 'in', + 'APPR', + 'in', + 'in', + ), + tuple( + 'der', + 'ART', + 'der', + 'die', + ), + tuple( + 'Physiotherapieraxis', + 'NN', + 'Physiotherapieraxis', + None, + ), + tuple( + '"', + '$(', + '"', + '"', + ), + tuple( + 'Gaby', + 'NE', + 'Gaby', + None, + ), + tuple( + 'Montag', + 'NE', + 'Montag', + 'Montag', + ), + tuple( + '"', + '$(', + '"', + '"', + ), + tuple( + 'im', + 'APPR_ART', + 'in der', + 'in', + ), + tuple( + 'Vital', + 'NN', + 'Vital', + 'vital', + ), + tuple( + 'Center', + 'NN', + 'Center', + 'Center', + ), + tuple( + 'und', + 'KON', + 'und', + 'und', + ), + tuple( + 'kann', + 'VMFIN', + 'können', + 'können', + ), + tuple( + 'ausschließlich', + 'ADV', + 'ausschließlich', + 'ausschließlich', + ), + tuple( + 'Positives', + 'NN', + 'Positiv', + 'positiv', + ), + tuple( + 'berichten', + 'VVINF', + 'berichten', + 'berichten', + ), + tuple( + '!', + '$.', + '!', + '!', + ), + tuple( + 'Ob', + 'KOUS', + 'Ob', + 'ob', + ), + tuple( + 'bei', + 'APPR', + 'bei', + 'bei', + ), + tuple( + 'der', + 'ART', + 'der', + 'die', + ), + tuple( + 'Terminvergabe', + 'NN', + 'Terminvergabe', + 'Terminvergabe', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'den', + 'ART', + 'der', + 'die', + ), + tuple( + 'Behandlungsräumen', + 'NN', + 'Behandlungsräumen', + 'Behandlungsraum', + ), + tuple( + 'oder', + 'KON', + 'oder', + 'oder', + ), + tuple( + 'den', + 'ART', + 'der', + 'die', + ), + tuple( + 'individuell', + 'ADJD', + 'individuell', + 'individuell', + ), + tuple( + 'zugeschnittenen', + 'ADJA', + 'zuschneiden', + 'zugeschnitten', + ), + tuple( + 'Trainingsplänen', + 'NN', + 'Trainingsplan', + 'Trainingsplan', + ), + tuple( + 'sind', + 'VAFIN', + 'sein', + 'sein', + ), + tuple( + 'alle', + 'PIDAT', + 'alle', + 'alle', + ), + tuple( + 'Mitarbeiter', + 'NN', + 'Mitarbeiter', + 'Mitarbeiter', + ), + tuple( + 'äußerst', + 'ADV', + 'äußerst', + 'äußern', + ), + tuple( + 'kompetent', + 'ADJD', + 'kompetent', + 'kompetent', + ), + tuple( + 'und', + 'KON', + 'und', + 'und', + ), + tuple( + 'flexibel', + 'ADJD', + 'flexibel', + 'flexibel', + ), + tuple( + '.', + '$.', + '.', + '.', + ), + tuple( + 'Sauberkeit', + 'NN', + 'Sauberkeit', + 'Sauberkeit', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'Ordnung', + 'NN', + 'Ordnung', + 'Ordnung', + ), + tuple( + 'und', + 'KON', + 'und', + 'und', + ), + tuple( + 'Freundlichkeit', + 'NN', + 'Freundlichkeit', + 'Freundlichkeit', + ), + tuple( + 'brauche', + 'VVFIN', + 'brauchen', + 'brauchen', + ), + tuple( + 'ich', + 'PPER', + 'ich', + 'ich', + ), + tuple( + 'hier', + 'ADV', + 'hier', + 'hier', + ), + tuple( + 'nicht', + 'PTKNEG', + 'nicht', + 'nicht', + ), + tuple( + 'zu', + 'PTKZU', + 'zu', + 'zu', + ), + tuple( + 'erwähnen', + 'VVINF', + 'erwähnen', + 'erwähnen', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'denn', + 'KON', + 'denn', + 'denn', + ), + tuple( + 'das', + 'PDS', + 'der', + 'die', + ), + tuple( + 'gehört', + 'VVFIN', + 'gehören', + 'gehören', + ), + tuple( + 'für', + 'APPR', + 'für', + 'für', + ), + tuple( + 'mich', + 'PPER', + 'ich', + 'ich', + ), + tuple( + 'zum', + 'APPR_ART', + 'zu der', + 'zu', + ), + tuple( + 'Standard', + 'NN', + 'Standard', + 'Standard', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'der', + 'PRELS', + 'der', + 'die', + ), + tuple( + 'aber', + 'ADV', + 'aber', + 'aber', + ), + tuple( + 'auch', + 'ADV', + 'auch', + 'auch', + ), + tuple( + 'noch', + 'ADV', + 'noch', + 'noch', + ), + tuple( + 'übertroffen', + 'VVPP', + 'übertreffen', + 'übertreffen', + ), + tuple( + 'wird', + 'VAFIN', + 'werden', + 'werden', + ), + tuple( + '.', + '$.', + '.', + '.', + ), + tuple( + 'Physiotherapie', + 'NE', + 'Physiotherapie', + 'Physiotherapie', + ), + tuple( + 'ist', + 'VAFIN', + 'sein', + 'sein', + ), + tuple( + 'zwar', + 'ADV', + 'zwar', + 'zwar', + ), + tuple( + 'oftmals', + 'ADV', + 'oftmals', + 'oftmals', + ), + tuple( + 'auch', + 'ADV', + 'auch', + 'auch', + ), + tuple( + 'anstrengend', + 'ADJD', + 'anstrengen', + 'anstrengend', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'aber', + 'KON', + 'aber', + 'aber', + ), + tuple( + 'in', + 'APPR', + 'in', + 'in', + ), + tuple( + 'dieser', + 'PDAT', + 'dieser', + 'diese', + ), + tuple( + 'Umgebeung', + 'NN', + 'Umgebeung', + None, + ), + tuple( + 'freut', + 'VVFIN', + 'freuen', + 'freuen', + ), + tuple( + 'man', + 'PIS', + 'man', + 'man', + ), + tuple( + 'sich', + 'PRF', + 'sich', + 'sich', + ), + tuple( + 'auf', + 'APPR', + 'auf', + 'auf', + ), + tuple( + 'jede', + 'PIDAT', + 'jeder', + 'jede', + ), + tuple( + 'Minute', + 'NN', + 'Minute', + 'Minute', + ), + tuple( + 'Behandlung', + 'NN', + 'Behandlung', + 'Behandlung', + ), + tuple( + '.', + '$.', + '.', + '.', + ), + tuple( + 'Das', + 'ART', + 'der', + 'die', + ), + tuple( + 'nächste', + 'ADJA', + 'NULL', + 'nahe', + ), + tuple( + 'mal', + 'ADV', + 'mal', + 'mal', + ), + tuple( + 'rief', + 'VVFIN', + 'rufen', + 'rufen', + ), + tuple( + 'ich', + 'PPER', + 'ich', + 'ich', + ), + tuple( + 'extra', + 'ADV', + 'extra', + 'extra', + ), + tuple( + 'vorher', + 'ADV', + 'vorher', + 'vorher', + ), + tuple( + 'an', + 'PTKVZ', + 'an', + 'an', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'um', + 'KOUI', + 'um', + 'um', + ), + tuple( + 'einen', + 'ART', + 'ein', + 'eine', + ), + tuple( + 'Termin', + 'NN', + 'Termin', + 'Termin', + ), + tuple( + 'zu', + 'PTKZU', + 'zu', + 'zu', + ), + tuple( + 'vereinbaren', + 'VVINF', + 'vereinbaren', + 'vereinbaren', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'damit', + 'KOUS', + 'damit', + 'damit', + ), + tuple( + 'der', + 'ART', + 'der', + 'die', + ), + tuple( + 'Konditor', + 'NN', + 'Konditor', + 'Konditor', + ), + tuple( + 'auch', + 'ADV', + 'auch', + 'auch', + ), + tuple( + 'Zeit', + 'NN', + 'Zeit', + 'Zeit', + ), + tuple( + 'für', + 'APPR', + 'für', + 'für', + ), + tuple( + 'uns', + 'PPER', + 'wir', + 'uns', + ), + tuple( + 'hätte', + 'VAFIN', + 'haben', + 'haben', + ), + tuple( + '.', + '$.', + '.', + '.', + ), + tuple( + 'Eine', + 'ART', + 'ein', + 'eine', + ), + tuple( + 'Stunde', + 'NN', + 'Stunde', + 'Stunde', + ), + tuple( + 'später', + 'ADJD', + 'spät', + 'spät', + ), + tuple( + 'gab', + 'VVFIN', + 'geben', + 'geben', + ), + tuple( + 'man', + 'PIS', + 'man', + 'man', + ), + tuple( + 'uns', + 'PPER', + 'wir', + 'uns', + ), + tuple( + 'dann', + 'ADV', + 'dann', + 'dann', + ), + tuple( + 'endlich', + 'ADJD', + 'endlich', + 'endlich', + ), + tuple( + 'einen', + 'ART', + 'ein', + 'eine', + ), + tuple( + 'Tisch', + 'NN', + 'Tisch', + 'Tisch', + ), + tuple( + ',', + '$,', + ',', + ',', + ), + tuple( + 'der', + 'PRELS', + 'der', + 'die', + ), + tuple( + 'allerdings', + 'ADV', + 'allerdings', + 'allerdings', + ), + tuple( + 'noch', + 'ADV', + 'noch', + 'noch', + ), + tuple( + 'nicht', + 'PTKNEG', + 'nicht', + 'nicht', + ), + tuple( + 'einmal', + 'ADV', + 'einmal', + 'einmal', + ), + tuple( + 'abgeräumt', + 'VVPP', + 'abräumen', + 'abräumen', + ), + tuple( + 'war', + 'VAFIN', + 'sein', + 'sein', + ), + tuple( + '.', + '$.', + '.', + '.', + ), + tuple( + 'Die', + 'ART', + 'der', + 'die', + ), + tuple( + 'Bedienung', + 'NN', + 'Bedienung', + 'Bedienung', + ), + tuple( + 'verschwand', + 'VVFIN', + 'verschwinden', + 'verschwinden', + ), + tuple( + 'sofort', + 'ADV', + 'sofort', + 'sofort', + ), + tuple( + 'wieder', + 'ADV', + 'wieder', + 'wieder', + ), + tuple( + 'und', + 'KON', + 'und', + 'und', + ), + tuple( + 'kam', + 'VVFIN', + 'kommen', + 'kommen', + ), + tuple( + 'auch', + 'ADV', + 'auch', + 'auch', + ), + tuple( + 'erstmal', + 'ADV', + 'erstmal', + 'erstmal', + ), + tuple( + 'nicht', + 'PTKNEG', + 'nicht', + 'nicht', + ), + tuple( + 'mehr', + 'ADV', + 'mehr', + 'sehr', + ), + tuple( + '.', + '$.', + '.', + '.', + ), + ) +# --- diff --git a/test/test_spacy.py b/test/test_spacy.py new file mode 100644 index 0000000..19c7b47 --- /dev/null +++ b/test/test_spacy.py @@ -0,0 +1,36 @@ +import spacy +from datasets import load_dataset +from pytest import fixture + +import dwdsmor +import dwdsmor.spacy + + +@fixture(scope="module") +def nlp(): + nlp = spacy.load("de_hdt_lg") + nlp.add_pipe("dwdsmor") + return nlp + + +@fixture(scope="module") +def lemmatizer(): + return dwdsmor.lemmatizer() + + +@fixture(scope="module") +def sentences(): + ds = load_dataset( + "universal_dependencies", + "de_gsd", + split="train", + trust_remote_code=True, + ) + return tuple(s["text"] for s in ds.select(range(100))) + + +def test_lemmatisation(nlp, lemmatizer, sentences, snapshot): + sentences = sentences[:10] + docs = nlp.pipe(sentences) + tokens = ((t.text, t.tag_, t.lemma_, t._.dwdsmor_lemma) for d in docs for t in d) + assert tuple(tokens) == snapshot