Skip to content

Commit

Permalink
Added BIN_Db.lookup_forms()
Browse files Browse the repository at this point in the history
  • Loading branch information
vthorsteinsson committed Jan 29, 2021
1 parent 0f5483a commit c000107
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 8 deletions.
21 changes: 13 additions & 8 deletions src/reynir/bincompress.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ def lookup_case(
*,
singular: bool = False,
indefinite: bool = False,
all_forms: bool = False,
cat: Optional[str] = None,
stem: Optional[str] = None,
utg: Any = NoUtg,
Expand All @@ -375,6 +376,8 @@ def lookup_case(
# singular=False does not force the result to be plural; it
# simply means that no forcing to singular occurs.
# The same applies to indefinite=True and False, mutatis mutandis.
# However, if all_forms=True, both singular and plural, as well as
# definite and indefinite forms, are always returned.

result: Set[MeaningTuple] = set()
case_latin = case.encode("latin-1")
Expand All @@ -395,10 +398,10 @@ def simplify_beyging(beyging: str) -> str:
# being specified.
for s in ("NF", "ÞF", "ÞGF", "EF", "2", "3"):
beyging = beyging.replace(s, "")
if singular:
if singular or all_forms:
for s in ("ET", "FT"):
beyging = beyging.replace(s, "")
if indefinite:
if indefinite or all_forms:
beyging = beyging.replace("gr", "")
# For adjectives, we neutralize weak and strong
# declension ('VB', 'SB'), but keep the degree (F, M, E)
Expand All @@ -413,12 +416,14 @@ def beyging_func(beyging: str) -> bool:
# for from self.lookup(), so we need to be careful to
# filter again on the case
return False
if singular and ("ET" not in beyging):
return False
if indefinite and any(b in beyging for b in ("gr", "FVB", "EVB")):
# For indefinite forms, we don't want the attached definite
# article ('gr') or weak declensions of adjectives
return False
if not all_forms:
if singular and ("ET" not in beyging):
# Only return singular forms
return False
if indefinite and any(b in beyging for b in ("gr", "FVB", "EVB")):
# For indefinite forms, we don't want the attached definite
# article ('gr') or weak declensions of adjectives
return False
if beyging_filter is not None and not beyging_filter(beyging):
# The user-defined filter fails: return False
return False
Expand Down
11 changes: 11 additions & 0 deletions src/reynir/bindb.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,17 @@ def match(m: BIN_Meaning) -> bool:

return final_w, [m for m in meanings if match(m)]

def lookup_forms(self, lemma: str, cat: str, case: str) -> List[BIN_Meaning]:
    """ Return the inflection forms of a lemma in a given case.
        The lemma is looked up in the compressed BÍN data with
        all_forms=True, so the query is not narrowed to singular
        or to indefinite forms. The case is converted to upper case
        before the lookup, and the lemma doubles as the stem that
        results must match. Each resulting tuple is wrapped in a
        BIN_Meaning; the order of the returned list is unspecified,
        since the underlying result is a set. """
    compressed = self._compressed_bin
    assert compressed is not None
    meaning_tuples = compressed.lookup_case(
        lemma,
        case.upper(),
        stem=lemma,
        cat=cat,
        all_forms=True,
    )
    return [BIN_Meaning._make(mt) for mt in meaning_tuples]

@lru_cache(maxsize=CACHE_SIZE)
def lookup_name_gender(self, name: str) -> str:
""" Given a person name, lookup its gender """
Expand Down
3 changes: 3 additions & 0 deletions src/reynir/bintokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,8 @@ def annotate(
w_new, m = db.lookup_word(
"".join(parts), at_sentence_start, auto_uppercase
)
else:
w_new = "" # Included to silence warning about unbound variable
if m:
# Found without hyphens: use that word form
m = [
Expand Down Expand Up @@ -1489,6 +1491,7 @@ def process(self, token_stream: TokenIterator) -> TokenIterator:
# Phrases we're considering
state: StateDict = defaultdict(list)
pdict = self._pdict # The phrase dictionary
token: Optional[Tok]

try:

Expand Down
44 changes: 44 additions & 0 deletions test/test_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,49 @@ def test_casting():
assert db.cast_to_genitive("Kópavogur", meaning_filter_func=f) == "Kópavogs"


def test_forms():
    """ Check BIN_Db.lookup_forms() for arguments that should yield
        no forms, and for the forms of 'köttur' in each of four cases """
    from reynir.bindb import BIN_Db

    db = BIN_Db()

    # Category arguments for which no forms are expected
    for lemma, cat in (("köttur", "kvk"), ("köttur", "kzk")):
        assert len(db.lookup_forms(lemma, cat, "nf")) == 0

    # A case argument that may trigger an AssertionError in the lookup;
    # whether it raises or not, no forms should come back
    forms = []
    try:
        forms = db.lookup_forms("köttur", "kk", "zf")
    except AssertionError:
        pass
    assert len(forms) == 0

    # Lemma arguments for which no forms are expected
    for lemma in ("kötur", "kettirnir"):
        assert len(db.lookup_forms(lemma, "kk", "nf")) == 0

    # Word forms that must be present for each case of 'köttur'
    expected_by_case = (
        ("nf", ("köttur", "kettir", "kötturinn", "kettirnir")),
        ("þf", ("kött", "ketti", "köttinn", "kettina")),
        ("þgf", ("ketti", "köttum", "kettinum", "köttunum")),
        ("ef", ("kattar", "kattarins", "katta", "kattanna")),
    )
    for case, expected_forms in expected_by_case:
        found = {m.ordmynd for m in db.lookup_forms("köttur", "kk", case)}
        for word_form in expected_forms:
            assert word_form in found


def test_addresses():
from reynir import NounPhrase
np = NounPhrase("Laugavegi 20b")
Expand Down Expand Up @@ -451,5 +494,6 @@ def test_addresses():
test_cases(r)
test_noun_phrases(r)
test_casting()
test_forms()
test_addresses()
r.__class__.cleanup()

0 comments on commit c000107

Please sign in to comment.