Skip to content

Commit

Permalink
Merge pull request #942 from tatuylonen/martinize
Browse files Browse the repository at this point in the history
[en] Form descriptions: check if a 'form' has too many words
  • Loading branch information
kristian-clausal authored Dec 9, 2024
2 parents f943188 + 2f9faf9 commit ea2043b
Showing 1 changed file with 13 additions and 0 deletions.
13 changes: 13 additions & 0 deletions src/wiktextract/extractor/en/form_descriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2440,13 +2440,25 @@ def strokes_repl(m: re.Match) -> str:
or any("error-unknown-tag" in x for x in tagsets)
):
if alt_related is not None:
# We already had a good division, so let's stop.
break
# Bad division, try deeper
continue
if (
i > 1
and len(parts[i - 1]) >= 4
and distw(titleparts, parts[i - 1]) <= 0.4
# Fixes wiktextract #983, where "participle" was
# within the distw threshold of "Martinize", so
# ["participle", "Martinize"] was accepted as a
# match. Kludge: only accept the division when
# titleparts has at least as many elements as the
# candidate 'related' words (parts[i - 1:]).
# Caveat: this rejects forms that legitimately gain
# an extra space-separated word when 'inflected'.
and len(titleparts) >= len(parts[i - 1:])
):
# print(f"Reached; {parts=}, {parts[i-1]=}")
alt_related = related
alt_tagsets = tagsets
continue
Expand Down Expand Up @@ -2491,6 +2503,7 @@ def strokes_repl(m: re.Match) -> str:
tagsets = alt_tagsets

# print("FORM END: tagsets={} related={}".format(tagsets, related))
# print("==================")
if not tagsets:
continue

Expand Down

0 comments on commit ea2043b

Please sign in to comment.