
Commit

feat: Add spaCy integration as single-threaded pipeline component
gremid committed Jan 24, 2025
1 parent 30b2aec commit d549219
Showing 7 changed files with 1,287 additions and 3 deletions.
2 changes: 1 addition & 1 deletion dwdsmor/automaton.py
@@ -209,7 +209,7 @@ def __init__(self, automata, automaton_type="lemma"):

    def __call__(self, word, **criteria):
        traversals = tuple(self.analyzer.analyze(word))
-       criteria_stack = list(criteria.items())
+       criteria_stack = list((k, v) for k, v in criteria.items() if v)
        criteria_stack.reverse()
        while criteria_stack:
            if len(traversals) == 1:
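The one-line change above makes the lemmatizer drop criteria whose value is unset before disambiguation, so callers (such as the spaCy component added below) can pass every criterion slot unconditionally. A minimal sketch of the resulting call pattern, assuming dwdsmor.lemmatizer(None) yields a Lemmatizer with the default automata and that criterion values follow the tag sets in dwdsmor/tag/hdt.py:

    import dwdsmor

    # Hypothetical: build a lemmatizer the way dwdsmor.spacy.Component does below.
    analyze = dwdsmor.lemmatizer(None)

    # tense=None is now filtered out instead of being pushed onto the criteria stack.
    print(analyze("ging", pos={"+V"}, tense=None))
    print(analyze("ging", pos={"+V"}, tense={"Past"}))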
76 changes: 76 additions & 0 deletions dwdsmor/spacy.py
@@ -0,0 +1,76 @@
from collections import OrderedDict
from functools import cache
from typing import Iterable

from spacy.language import Language
from spacy.tokens.token import Token

import dwdsmor.tag.hdt as hdt

from . import lemmatizer
from .automaton import Lemmatizer

Token.set_extension("dwdsmor_lemma", default=None)


def criterion(k, v, mapping):
    return (k, mapping.get(v, {v}) if v else None)


@cache
def criteria(pos, number, gender, case, person, tense, degree, mood, nonfinite):
    return OrderedDict(
        (
            criterion("pos", pos, hdt.pos_map),
            criterion("number", number, hdt.number_map),
            criterion("gender", gender, hdt.gender_map),
            criterion("case", case, hdt.case_map),
            criterion("person", person, hdt.person_map),
            criterion("tense", tense, hdt.tense_map),
            criterion("degree", degree, hdt.degree_map),
            criterion("mood", mood, hdt.mood_map),
            criterion("nonfinite", nonfinite, hdt.nonfinite_map),
        )
    )


def morph(token_morph, k):
    v = ",".join(token_morph.get(k))
    return v if v else None


def lemmatize_token(lemmatizer: Lemmatizer, token: Token):
    token_morph = token.morph
    token_criteria = criteria(
        token.tag_,
        morph(token_morph, "Number"),
        morph(token_morph, "Gender"),
        morph(token_morph, "Case"),
        morph(token_morph, "Person"),
        morph(token_morph, "Tense"),
        morph(token_morph, "Degree"),
        morph(token_morph, "Mood"),
        morph(token_morph, "VerbForm"),
    )
    token._.dwdsmor_lemma = lemmatizer(token.text, **token_criteria)
    return token


def lemmatize(lemmatizer: Lemmatizer, tokens: Iterable[Token]):
    for token in tokens:
        lemmatize_token(lemmatizer, token)
    return tokens


class Component:
    def __init__(self, automata_location=None):
        self.lemmatizer = lemmatizer(automata_location)

    def __call__(self, doc):
        lemmatize(self.lemmatizer, doc)
        return doc


@Language.factory("dwdsmor", default_config={"automata_location": None})
def create_component(nlp: Language, name: str, automata_location: str | None):
    return Component(automata_location)
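Registering the "dwdsmor" factory makes the lemmatizer available as a regular spaCy pipeline component that writes its result to the custom token attribute token._.dwdsmor_lemma. A hedged usage sketch: the de_hdt_lg pipeline is the dev dependency added in pyproject.toml below, and the explicit import of dwdsmor.spacy assumes the factory is not registered through a package entry point:

    import spacy

    import dwdsmor.spacy  # noqa: F401 -- registers the "dwdsmor" factory

    nlp = spacy.load("de_hdt_lg")
    nlp.add_pipe("dwdsmor")  # automata_location=None falls back to the default automata

    doc = nlp("Die Kinder gingen nach Hause.")
    for token in doc:
        print(token.text, token.tag_, token._.dwdsmor_lemma)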
File renamed without changes.
105 changes: 105 additions & 0 deletions dwdsmor/tag/hdt.py
@@ -0,0 +1,105 @@
pos_map = {
    "$(": {"+PUNCT"},
    "$,": {"+PUNCT"},
    "$.": {"+PUNCT"},
    "ADJA": {"+ADJ", "+CARD", "+INDEF", "+ORD"},
    "ADJD": {"+ADJ"},
    "ADV": {"+ADV"},
    "APPO": {"+POSTP"},
    "APPR": {"+PREP"},
    "APPR_ART": {"+PREPART"},
    "APZR": {"+POSTP", "+PREP"},
    "ART": {"+ART"},
    "CARD": {"+CARD"},
    "FM": {"+FM"},  # ?
    "ITJ": {"+INTJ"},
    "KOKOM": {"+CONJ"},
    "KON": {"+CONJ"},
    "KOUI": {"+CONJ"},
    "KOUS": {"+CONJ"},
    "NE": {"+NN", "+NPROP"},
    "NN": {"+NN", "+NPROP"},
    "PDAT": {"+DEM"},
    "PDS": {"+DEM"},
    "PIAT": {"+INDEF"},
    "PIDAT": {"+INDEF"},
    "PIS": {"+INDEF"},
    "PPER": {"+PPRO"},
    "PPOSAT": {"+POSS"},
    "PPOSS": {"+POSS"},
    "PRELAT": {"+REL"},
    "PRELS": {"+REL"},
    "PRF": {"+PPRO"},
    "PROAV": {"+ADV", "+PROADV"},
    "PTKA": {"+PTCL"},
    "PTKANT": {"+INTJ", "+PTCL"},
    "PTKNEG": {"+PTCL"},
    "PTKVZ": {"+ADV", "+PREP", "+VPART"},
    "PTKZU": {"+PTCL"},
    "PWAT": {"+WPRO"},
    "PWAV": {"+ADV"},
    "PWS": {"+WPRO"},
    "TRUNC": {"+TRUNC"},  # ?
    "VAFIN": {"+V"},
    "VAIMP": {"+V"},
    "VAINF": {"+V"},
    "VAPP": {"+V"},
    "VMFIN": {"+V"},
    "VMINF": {"+V"},
    "VMPP": {"+V"},
    "VVFIN": {"+V"},
    "VVIMP": {"+V"},
    "VVINF": {"+V"},
    "VVIZU": {"+V"},
    "VVPP": {"+V"},
    "XY": {"+XY"},  # ?
}

number_map = {
    "Sing": {"Sg"},
    "Plur": {"Pl"},
}


gender_map = {
    "Masc,Neut": {"Masc", "Neut"},
    "Neut": {"Neut"},
    "Fem": {"Fem"},
    "Masc": {"Masc"},
}

case_map = {
    "Nom": {"Nom"},
    "Gen": {"Gen"},
    "Dat": {"Dat"},
    "Acc": {"Acc"},
}

person_map = {
    "1": {"1"},
    "2": {"2"},
    "3": {"3"},
}

tense_map = {
    "Past": {"Past"},
    "Pres": {"Pres"},
}


degree_map = {
    "Cmp": {"Comp"},
    "Sup": {"Sup"},
    "Pos": {"Pos"},
}

mood_map = {
    "Ind": {"Ind"},
    "Imp": {"Imp"},
}

# VerbForm
nonfinite_map = {
    "Part": {"Part"},
    "Inf": {"Inf"},
}
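These maps translate HDT part-of-speech tags and morphological feature values into the DWDSMOR tag sets that dwdsmor.spacy.criteria() hands to the lemmatizer. A small illustration using only values from the maps above:

    import dwdsmor.tag.hdt as hdt

    print(hdt.pos_map["VVFIN"])    # {"+V"}
    print(hdt.tense_map["Past"])   # {"Past"}
    print(hdt.number_map["Plur"])  # {"Pl"}

    # A finite full verb tagged VVFIN with Tense=Past and Number=Plur is thus
    # looked up with pos={"+V"}, tense={"Past"}, number={"Pl"}.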
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -39,8 +39,8 @@ dev = [
"pytest",
"syrupy",
"tqdm",
"Jinja2"
]
"Jinja2",
"de_hdt_lg @ https://huggingface.co/zentrum-lexikographie/de_hdt_lg/resolve/main/de_hdt_lg-any-py3-none-any.whl#sha256=44bd0b0299865341ee1756efd60670fa148dbfd2a14d0c1d5ab99c61af08236a"]

[project.scripts]
dwdsmor = "dwdsmor.cli:main"
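The new dev dependency pins the de_hdt_lg model wheel from Hugging Face so the spaCy integration can be exercised during development. A hypothetical sanity check, assuming the dev extras have been installed (e.g. pip install -e ".[dev]"):

    import spacy

    # The wheel is pinned by URL and sha256 in pyproject.toml above.
    nlp = spacy.load("de_hdt_lg")
    print(nlp.pipe_names)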
