From 2f9faf95bb7a54a3d0f48a91867c9efe77676234 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Mon, 9 Dec 2024 12:49:24 +0200 Subject: [PATCH] [en] Form descriptions: check if a 'form' has too many words Fixes #983: "present participle Martinize" was interpreted as tags ["present"] and the form "participle Martinize". This was because the code at: ```python if ( i > 1 and len(parts[i - 1]) >= 4 and distw(titleparts, parts[i - 1]) <= 0.4 ): alt_related = related alt_tagsets = tagsets continue ``` would see that "participle" is very close (exactly 0.4) in distw() Levenshtein distance to "Martinize" and accept that it must be part of the titleparts list, like a word in a phrase. This kludge just adds a test: if titleparts is shorter than the rest of `parts`, the split is not accepted. THIS WILL FAIL IN CERTAIN SCENARIOS, like when an 'inflected' form gains more words! --- src/wiktextract/extractor/en/form_descriptions.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/wiktextract/extractor/en/form_descriptions.py b/src/wiktextract/extractor/en/form_descriptions.py index e39575928..b31cf10a5 100644 --- a/src/wiktextract/extractor/en/form_descriptions.py +++ b/src/wiktextract/extractor/en/form_descriptions.py @@ -2440,13 +2440,25 @@ def strokes_repl(m: re.Match) -> str: or any("error-unknown-tag" in x for x in tagsets) ): if alt_related is not None: + # We already had a good division, so let's stop. break + # Bad division, try deeper continue if ( i > 1 and len(parts[i - 1]) >= 4 and distw(titleparts, parts[i - 1]) <= 0.4 + # Fixes wiktextract #983, where "participle" + # was too close to "Martinize" and so this accepted + # ["participle", "Martinize"] as matching; this + # kludge prevents this from happening if titleparts + # is shorter than what would be 'related'. + # This breaks if we want to detect stuff that + # actually gets an extra space-separated word when + # 'inflected'. 
+ and len(titleparts) >= len(parts[i - 1:]) ): + # print(f"Reached; {parts=}, {parts[i-1]=}") alt_related = related alt_tagsets = tagsets continue @@ -2491,6 +2503,7 @@ def strokes_repl(m: re.Match) -> str: tagsets = alt_tagsets # print("FORM END: tagsets={} related={}".format(tagsets, related)) + # print("==================") if not tagsets: continue