Skip to content

Commit

Permalink
Fixes coverage test
Browse files (browse the repository at this point in the history)
  • Loading branch information
gremid committed Nov 26, 2024
1 parent faded6e commit f57d2cc
Showing 1 changed file with 5 additions and 5 deletions.
10 changes: 5 additions & 5 deletions dwdsmor/build/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,22 +115,22 @@ def compute_coverage(automata, limit=None, show_progress=False):
mismatches = defaultdict(Counter)
for token in tokens:
form, lemma, xpos = token
- pos_candidates = {f"+{xpos}"}.union(dwdsmor_pos_tags.get(xpos, {}))
- is_match = lemmatizer(form, pos_candidates) == lemma
+ pos_candidates = {f"+{xpos}"}.union(dwdsmor_pos_tags.get(xpos, set()))
+ is_match = lemmatizer(form, pos_candidates) is not None
if not is_match and lemmatizer(lemma) is not None:
# skip tokens where we can analyze the given lemma but not the form:
# compounds are lemmatized to their basic words in German-UD/HDT
continue
registry = matches if is_match else mismatches
- registry[xpos][lemma] += 1
+ registry[xpos][form] += 1

coverage = []

total_tokens = 0
total_types = 0
total_type_matches = 0
total_token_matches = 0
- for registry in [matches, mismatches]:
+ for registry in (matches, mismatches):
for _pos_tag, types in registry.items():
for _type, token_count in types.items():
total_types += 1
Expand All @@ -141,7 +141,7 @@ def compute_coverage(automata, limit=None, show_progress=False):
tag_types = 0
tag_token_matches = 0
tag_type_matches = 0
- for registry, is_match in [(matches, True), (mismatches, False)]:
+ for registry, is_match in ((matches, True), (mismatches, False)):
for _type, token_count in registry.get(pos_tag, {}).items():
tag_types += 1
tag_tokens += token_count
Expand Down

0 comments on commit f57d2cc

Please sign in to comment.