Utils: Fix matching of inflected forms of tokens/n-grams with tags
BLKSerene committed Jan 3, 2025
1 parent e7c5a79 commit 60b0c8e
Showing 4 changed files with 156 additions and 41 deletions.
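With "Match inflected forms" enabled, lemma comparison previously used bare lemma strings, so any part-of-speech tags carried by the tokens or the search terms were dropped from the comparison. This commit writes the lemma back onto a copy of each token and compares the tag-carrying display text instead, i.e. lemma plus tag. The sketch below is a minimal, self-contained illustration of the behavior the new tests assert; `Token` and `match_inflected` are stand-ins for this example only, not Wordless's actual API.

```python
import re
from dataclasses import dataclass

# Minimal stand-ins for illustration only -- not Wordless's actual classes/API
@dataclass(frozen = True)
class Token:
    text: str
    tag: str = ''
    lemma: str = ''

    def display_text(self):
        # Tag-carrying form, e.g. text 'took' + tag '_NN' -> 'took_NN'
        return self.text + self.tag

def match_inflected(search_term, tokens, match_whole_words = False):
    # Compare lemma *plus* tag, which is what this commit fixes
    target = re.escape(search_term.lemma + search_term.tag)
    # Whole-word matching requires the full string to match; otherwise a
    # substring match is enough
    re_match = re.fullmatch if match_whole_words else re.search

    return [
        token.display_text()
        for token in tokens
        if re_match(target, token.lemma + token.tag)
    ]

tokens = [
    Token('take', '', 'take'),
    Token('takes', '_NN', 'take'),
    Token('took', '_NN', 'take'),
    Token('taken', '_NNP', 'take'),
    Token('test_NN', '', 'test'),
]
# Searching for 'took_NN' with inflected-form matching: the search term's
# lemma is 'take' and its tag is '_NN'
search_term = Token('took', '_NN', 'take')

print(match_inflected(search_term, tokens))
# ['takes_NN', 'took_NN', 'taken_NNP']
print(match_inflected(search_term, tokens, match_whole_words = True))
# ['takes_NN', 'took_NN']
```

Without whole-word matching the comparison is a substring search, which is why `take_NN` also picks up `taken_NNP` (`take_NN` occurs inside `take_NNP`); with whole-word matching only `takes_NN` and `took_NN` remain.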
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -31,6 +31,7 @@
### 📌 Bugfixes
- File Area: Fix Open Files - Encoding
- File Area: Fix Open Files - Opening Non-text Files - Do not show this again
- Utils: Fix matching of inflected forms of tokens/n-grams with tags
- Utils: Fix Wordless's Japanese kanji tokenizer
- Work Area: Fix Wordlist Generator - Filter results - Number of syllables
- Work Area: Fix Wordlist Generator - Syllabification
124 changes: 106 additions & 18 deletions tests/tests_nlp/test_matching.py
@@ -19,7 +19,7 @@
import re

from tests import wl_test_init
from wordless.wl_nlp import wl_lemmatization, wl_matching, wl_texts
from wordless.wl_nlp import wl_matching, wl_texts

main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast')

@@ -250,33 +250,70 @@ def test_match_tokens():
compare_tokens_matched(wl_matching.match_tokens(
main,
search_terms = ['tAke'],
tokens = wl_texts.to_tokens(['take', 'TAKE', 'Take', 'tAke', 'TaKE', 'test'], lang = 'eng_us'),
tokens = wl_texts.to_tokens(
['take', 'TAKE', 'Take', 'tAke', 'TaKE', 'test'],
lang = 'eng_us'
),
lang = 'eng_us',
settings = init_search_settings(match_case = True)
), ['tAke'])

compare_tokens_matched(wl_matching.match_tokens(
main,
search_terms = ['take'],
tokens = wl_texts.to_tokens(['take', 'takes', 'took', 'taken', 'taking', 'test'], lang = 'eng_us'),
tokens = wl_texts.to_tokens(
['take', 'takes', 'took', 'taken', 'taking', 'test'],
lang = 'eng_us'
),
lang = 'eng_us',
settings = init_search_settings(match_whole_words = True)
), ['take'])

tokens = wl_texts.to_tokens(['take', 'takes', 'took', 'taken', 'taking', 'test'], lang = 'eng_us')
tokens = wl_lemmatization.wl_lemmatize(main, tokens, lang = 'eng_us')
compare_tokens_matched(wl_matching.match_tokens(
main,
search_terms = ['takes'],
tokens = tokens,
search_terms = ['took'],
tokens = wl_texts.to_tokens(
['take', 'takes', 'took', 'taken', 'taking', 'test'],
lemmas = ['take', 'take', 'take', 'take', 'take', 'test'],
lang = 'eng_us'
),
lang = 'eng_us',
settings = init_search_settings(match_inflected_forms = True)
), ['take', 'takes', 'took', 'taken', 'taking'])

compare_tokens_matched(wl_matching.match_tokens(
main,
search_terms = ['took_NN'],
tokens = wl_texts.to_tokens(
['take', 'takes', 'took', 'taken', 'test_NN'],
lang = 'eng_us',
tags = ['', '_NN', '_NN', '_NNP', ''],
lemmas = ['take', 'take', 'take', 'take', 'test']
),
lang = 'eng_us',
settings = init_search_settings(match_inflected_forms = True)
), ['take', 'takes', 'took', 'taken'])
), ['takes_NN', 'took_NN', 'taken_NNP'])

compare_tokens_matched(wl_matching.match_tokens(
main,
search_terms = ['took_NN'],
tokens = wl_texts.to_tokens(
['take', 'takes', 'took', 'taken', 'test_NN'],
lang = 'eng_us',
tags = ['', '_NN', '_NN', '_NNP', ''],
lemmas = ['take', 'take', 'take', 'take', 'test']
),
lang = 'eng_us',
settings = init_search_settings(match_whole_words = True, match_inflected_forms = True)
), ['takes_NN', 'took_NN'])

compare_tokens_matched(wl_matching.match_tokens(
main,
search_terms = ['take[sn]'],
tokens = wl_texts.to_tokens(['take', 'takes', 'took', 'taken', 'taking', 'test'], lang = 'eng_us'),
tokens = wl_texts.to_tokens(
['take', 'takes', 'took', 'taken', 'taking', 'test'],
lang = 'eng_us'
),
lang = 'eng_us',
settings = init_search_settings(use_regex = True)
), ['takes', 'taken'])
@@ -287,7 +324,7 @@ def test_match_tokens():
tokens = wl_texts.to_tokens(
['take', 'takes', 'took', 'test'],
lang = 'eng_us',
tags = ['', '_NN', '_NN', '_TAKES']
tags = ['', '_NN', '_NN', '_JJ']
),
lang = 'eng_us',
settings = init_search_settings(match_without_tags = True)
@@ -317,6 +354,18 @@ def test_match_tokens():
settings = init_search_settings(match_dependency_relations = True)
), ['aux', 'auxpass'])

compare_tokens_matched(wl_matching.match_tokens(
main,
search_terms = ['aux'],
tokens = wl_texts.to_tokens(
['take', 'takes', 'took', 'taken', 'test_NN'],
lang = 'eng_us',
dependency_relations = ['ROOT', 'nsubj', 'advmod', 'aux', 'auxpass']
),
lang = 'eng_us',
settings = init_search_settings(match_whole_words = True, match_dependency_relations = True)
), ['aux'])

def test_match_ngrams():
compare_ngrams_matched(wl_matching.match_ngrams(
main,
@@ -333,33 +382,70 @@ def test_match_ngrams():
compare_ngrams_matched(wl_matching.match_ngrams(
main,
search_terms = ['tAke WaLK'],
tokens = wl_texts.to_tokens(['take', 'tAke', 'WALK', 'WaLK', 'test'], lang = 'eng_us'),
tokens = wl_texts.to_tokens(
['take', 'tAke', 'WALK', 'WaLK', 'test'],
lang = 'eng_us'
),
lang = 'eng_us',
settings = init_search_settings(match_case = True)
), [('tAke', 'WaLK')])

compare_ngrams_matched(wl_matching.match_ngrams(
main,
search_terms = ['take walk'],
tokens = wl_texts.to_tokens(['take', 'takes', 'walk', 'walked', 'test'], lang = 'eng_us'),
tokens = wl_texts.to_tokens(
['take', 'takes', 'walk', 'walked', 'test'],
lang = 'eng_us'
),
lang = 'eng_us',
settings = init_search_settings(match_whole_words = True)
), [('take', 'walk')])

tokens = wl_texts.to_tokens(['take', 'takes', 'walk', 'walked', 'test'], lang = 'eng_us')
tokens = wl_lemmatization.wl_lemmatize(main, tokens, lang = 'eng_us')
compare_ngrams_matched(wl_matching.match_ngrams(
main,
search_terms = ['take walk'],
tokens = tokens,
tokens = wl_texts.to_tokens(
['take', 'takes', 'walk', 'walked', 'test'],
lang = 'eng_us',
lemmas = ['take', 'take', 'walk', 'walk', 'test'],
),
lang = 'eng_us',
settings = init_search_settings(match_inflected_forms = True)
), [('take', 'walk'), ('take', 'walked'), ('takes', 'walk'), ('takes', 'walked')])

compare_ngrams_matched(wl_matching.match_ngrams(
main,
search_terms = ['took_NN walked_NN'],
tokens = wl_texts.to_tokens(
['take', 'took', 'walk', 'walked', 'test'],
lang = 'eng_us',
tags = ['', '_NNP', '_NN', '_NNP', '_JJ'],
lemmas = ['take', 'take', 'walk', 'walk', 'test'],
),
lang = 'eng_us',
settings = init_search_settings(match_inflected_forms = True)
), [('took_NNP', 'walk_NN'), ('took_NNP', 'walked_NNP')])

compare_ngrams_matched(wl_matching.match_ngrams(
main,
search_terms = ['took_NN walked_NN'],
tokens = wl_texts.to_tokens(
['take', 'took', 'walk', 'walked', 'test'],
lang = 'eng_us',
tags = ['_NN', '_NNP', '_NN', '_NNP', '_JJ'],
lemmas = ['take', 'take', 'walk', 'walk', 'test'],
),
lang = 'eng_us',
settings = init_search_settings(match_whole_words = True, match_inflected_forms = True)
), [('take_NN', 'walk_NN')])

compare_ngrams_matched(wl_matching.match_ngrams(
main,
search_terms = ['took|taken walk(s|ing)'],
tokens = wl_texts.to_tokens(['took', 'taken', 'takes', 'walks', 'walking', 'walked', 'test'], lang = 'eng_us'),
tokens = wl_texts.to_tokens(
['took', 'taken', 'takes', 'walks', 'walking', 'walked', 'test'],
lang = 'eng_us'
),
lang = 'eng_us',
settings = init_search_settings(use_regex = True)
), [('took', 'walks'), ('took', 'walking'), ('taken', 'walks'), ('taken', 'walking')])
@@ -407,9 +493,11 @@ def test_match_search_terms_ngrams():
), [('take', 'walk')])

def init_context_settings(
incl = False, incl_multi_search_mode = False, incl_search_term = '', incl_search_terms = None,
incl = False,
incl_multi_search_mode = False, incl_search_term = '', incl_search_terms = None,
incl_context_window_left = -5, incl_context_window_right = 5,
excl = False, excl_multi_search_mode = False, excl_search_term = '', excl_search_terms = None,
excl = False,
excl_multi_search_mode = False, excl_search_term = '', excl_search_terms = None,
excl_context_window_left = -5, excl_context_window_right = 5,
):
incl_search_terms = incl_search_terms or []
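The new n-gram tests follow the same pattern, and they now pass `lemmas = [...]` directly to `wl_texts.to_tokens` instead of running `wl_lemmatization.wl_lemmatize`, so the expected matches no longer depend on a particular lemmatizer's output. Each slot of a tagged search term such as `took_NN walked_NN` is matched independently on lemma plus tag, and the asserted result is the product of the per-slot matches. A rough, self-contained sketch of that expectation (`match_ngram` is a hypothetical helper, not Wordless's `match_ngrams`):

```python
import itertools
import re

# Illustrative helper: each slot of the search term is matched on lemma + tag,
# and the matched n-grams are the product of the per-slot matches
def match_ngram(search_lemma_tags, tokens):
    matches_per_slot = [
        [
            text + tag
            for text, tag, lemma in tokens
            if re.search(re.escape(search_lemma_tag), lemma + tag)
        ]
        for search_lemma_tag in search_lemma_tags
    ]

    return list(itertools.product(*matches_per_slot))

# (text, tag, lemma) triples mirroring the test data above
tokens = [
    ('take', '', 'take'),
    ('took', '_NNP', 'take'),
    ('walk', '_NN', 'walk'),
    ('walked', '_NNP', 'walk'),
    ('test', '_JJ', 'test'),
]

# The search term 'took_NN walked_NN' lemmatizes to 'take_NN walk_NN'
print(match_ngram(['take_NN', 'walk_NN'], tokens))
# [('took_NNP', 'walk_NN'), ('took_NNP', 'walked_NNP')]
```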
66 changes: 46 additions & 20 deletions wordless/wl_nlp/wl_matching.py
@@ -195,19 +195,29 @@ def match_tokens(

# Match inflected forms of search terms and search results
if settings['match_inflected_forms']:
lemmas_search = wl_texts.get_token_properties(tokens_search, 'lemma', convert_none = True)
lemmas_matched = wl_texts.get_token_properties(
wl_lemmatization.wl_lemmatize(main, {*search_terms, *search_results}, lang),
'lemma',
convert_none = True
lemmas_search = tokens_search
lemmas_matched = list({*search_terms, *search_results})

# Match both lemmas and tags
wl_texts.set_token_texts(
lemmas_search,
wl_texts.get_token_properties(tokens_search, 'lemma', convert_none = True)
)
wl_texts.set_token_texts(
lemmas_matched,
wl_texts.get_token_properties(
wl_lemmatization.wl_lemmatize(main, lemmas_matched, lang),
'lemma',
convert_none = True
)
)

for lemma_matched in set(lemmas_matched):
# Always match literal strings
lemma_matched = re.escape(lemma_matched)
lemma_matched = wl_texts.set_token_text(lemma_matched, re.escape(lemma_matched))

for token, lemma_search in set(zip(tokens, lemmas_search)):
if re_match(lemma_matched, lemma_search, flags = re_flags):
if re_match(lemma_matched.display_text(), lemma_search.display_text(), flags = re_flags):
search_results.add(token)

return search_results
@@ -253,37 +263,53 @@ def match_ngrams(
tokens_matched[search_term_token].add(token)

if settings['match_inflected_forms']:
lemmas_search = wl_texts.get_token_properties(tokens_search, 'lemma', convert_none = True)
lemmas_search = tokens_search

# Match both lemmas and tags
wl_texts.set_token_texts(
lemmas_search,
wl_texts.get_token_properties(tokens_search, 'lemma', convert_none = True)
)

# Search for inflected forms of tokens in search results first
for search_term_token, search_term_tokens_matched in copy.deepcopy(tokens_matched).items():
lemmas_matched = wl_texts.get_token_properties(
wl_lemmatization.wl_lemmatize(main, search_term_tokens_matched, lang),
'lemma',
convert_none = True
lemmas_matched = list(search_term_tokens_matched)

wl_texts.set_token_texts(
lemmas_matched,
wl_texts.get_token_properties(
wl_lemmatization.wl_lemmatize(main, search_term_tokens_matched, lang),
'lemma',
convert_none = True
)
)

for token_matched, lemma_matched in zip(search_term_tokens_matched, lemmas_matched):
# Always match literal strings
lemma_matched = re.escape(lemma_matched)
lemma_matched = wl_texts.set_token_text(lemma_matched, re.escape(lemma_matched))

for token, lemma_search in set(zip(tokens, lemmas_search)):
if re_match(lemma_matched, lemma_search, flags = re_flags):
if re_match(lemma_matched.display_text(), lemma_search.display_text(), flags = re_flags):
tokens_matched[search_term_token].add(token)

lemmas_matched = wl_texts.get_token_properties(
wl_lemmatization.wl_lemmatize(main, search_term_tokens, lang),
'lemma',
convert_none = True
lemmas_matched = copy.deepcopy(search_term_tokens)

wl_texts.set_token_texts(
lemmas_matched,
wl_texts.get_token_properties(
wl_lemmatization.wl_lemmatize(main, search_term_tokens, lang),
'lemma',
convert_none = True
)
)

# Search for inflected forms of tokens in search terms
for token_matched, lemma_matched in zip(search_term_tokens, lemmas_matched):
# Always match literal strings
lemma_matched = re.escape(lemma_matched)
lemma_matched = wl_texts.set_token_text(lemma_matched, re.escape(lemma_matched))

for token, lemma_search in set(zip(tokens, lemmas_search)):
if re_match(lemma_matched, lemma_search, flags = re_flags):
if re_match(lemma_matched.display_text(), lemma_search.display_text(), flags = re_flags):
tokens_matched[token_matched].add(token)

for search_term in search_terms:
6 changes: 3 additions & 3 deletions wordless/wl_nlp/wl_texts.py
@@ -118,10 +118,10 @@ def to_tokens(
]

def display_texts_to_tokens(main, display_texts, lang = 'eng_us'):
re_tags = wl_matching.get_re_tags(main, tag_type = 'body')
re_tags = re.compile(wl_matching.get_re_tags(main, tag_type = 'body'))

tags = [''.join(re.findall(re_tags, display_text)) for display_text in display_texts]
texts = [re.sub(re_tags, '', display_text) for display_text in display_texts]
tags = [''.join(re_tags.findall(display_text)) for display_text in display_texts]
texts = [re_tags.sub('', display_text) for display_text in display_texts]

return to_tokens(texts, lang = lang, tags = tags)

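In `wl_texts.py`, `display_texts_to_tokens` splits a display form such as `took_NN` into its text and tag parts; the change here compiles the tag pattern once up front rather than passing the raw pattern string to `re.findall`/`re.sub` for every display text. A simplified sketch of the splitting logic, assuming a plain `_TAG` convention (Wordless builds the real pattern via `wl_matching.get_re_tags(main, tag_type = 'body')`, presumably from the user's tag settings):

```python
import re

# Assumption for this sketch: tags look like '_NN', '_NNP', etc.
re_tags = re.compile(r'_[A-Z]+')

def split_display_text(display_text):
    # 'took_NN' -> ('took', '_NN'); untagged text passes through unchanged
    tags = ''.join(re_tags.findall(display_text))
    text = re_tags.sub('', display_text)

    return text, tags

print(split_display_text('took_NN'))  # ('took', '_NN')
print(split_display_text('walk'))     # ('walk', '')
```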
