From 2d6662c6b35dc3b6edfb4072bce6dbf9788e18d3 Mon Sep 17 00:00:00 2001
From: BLKSerene
Date: Thu, 2 Jan 2025 18:49:45 +0800
Subject: [PATCH] Work Area: Fix Wordlist Generator - Syllabification

---
 CHANGELOG.md                           |  1 +
 wordless/wl_nlp/wl_syl_tokenization.py | 59 +++++++++++++-------------
 2 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ee32bbc05..0cc37f115 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -32,6 +32,7 @@
 - File Area: Fix Open Files - Opening Non-text Files - Do not show this again
 - Utils: Fix Wordless's Japanese kanji tokenizer
 - Work Area: Fix Wordlist Generator - Filter results - Number of syllables
+- Work Area: Fix Wordlist Generator - Syllabification
 - Work Area: Fix Work Area - Filter results - File to filter

 ### ❌ Removals

diff --git a/wordless/wl_nlp/wl_syl_tokenization.py b/wordless/wl_nlp/wl_syl_tokenization.py
index bdc55485d..e61e975bc 100644
--- a/wordless/wl_nlp/wl_syl_tokenization.py
+++ b/wordless/wl_nlp/wl_syl_tokenization.py
@@ -59,7 +59,6 @@ def wl_syl_tokenize(main, inputs, lang, syl_tokenizer = 'default', force = False
         syls_tokens = [
             tuple(wl_texts.clean_texts(syls))
             for syls in syls_tokens
-            if any(syls)
         ]

         if isinstance(inputs, str):
@@ -83,31 +82,33 @@ def wl_syl_tokenize(main, inputs, lang, syl_tokenizer = 'default', force = False
     return inputs

 def wl_syl_tokenize_tokens(main, tokens, lang, syl_tokenizer):
-    syls_tokens = []
-
-    for token in tokens:
-        # NLTK
-        if syl_tokenizer == 'nltk_legality':
-            nltk_syl_tokenizer_legality = main.__dict__['nltk_syl_tokenizer_legality']
-
-            syls_tokens.append(nltk_syl_tokenizer_legality.tokenize(token))
-        elif syl_tokenizer == 'nltk_sonority_sequencing':
-            nltk_syl_tokenizer_sonority_sequencing = main.__dict__['nltk_syl_tokenizer_sonority_sequencing']
-
-            syls_tokens.append(nltk_syl_tokenizer_sonority_sequencing.tokenize(token))
-        # Pyphen
-        elif syl_tokenizer.startswith('pyphen_'):
-            pyphen_syl_tokenizer = main.__dict__[f'pyphen_syl_tokenizer_{lang}']
-            syls = re.split(r'\-+', pyphen_syl_tokenizer.inserted(token))
-
-            if any(syls):
-                syls_tokens.append(syls)
-            else:
-                syls_tokens.append([token])
-        # Thai
-        elif syl_tokenizer == 'pythainlp_han_solo':
-            syls_tokens.append(pythainlp.tokenize.syllable_tokenize(token, engine = 'han_solo'))
-        elif syl_tokenizer == 'pythainlp_syl_dict':
-            syls_tokens.append(pythainlp.tokenize.syllable_tokenize(token, engine = 'dict'))
-
-    return syls_tokens
+    tokens_syls = {}
+
+    # Syllabify types only as context information is not needed
+    for token in set(tokens):
+        if token:
+            # NLTK
+            if syl_tokenizer == 'nltk_legality':
+                nltk_syl_tokenizer_legality = main.__dict__['nltk_syl_tokenizer_legality']
+
+                tokens_syls[token] = nltk_syl_tokenizer_legality.tokenize(token)
+            elif syl_tokenizer == 'nltk_sonority_sequencing':
+                nltk_syl_tokenizer_sonority_sequencing = main.__dict__['nltk_syl_tokenizer_sonority_sequencing']
+
+                tokens_syls[token] = nltk_syl_tokenizer_sonority_sequencing.tokenize(token)
+            # Pyphen
+            elif syl_tokenizer.startswith('pyphen_'):
+                pyphen_syl_tokenizer = main.__dict__[f'pyphen_syl_tokenizer_{lang}']
+                syls = re.split(r'\-+', pyphen_syl_tokenizer.inserted(token))
+
+                if any(syls):
+                    tokens_syls[token] = syls
+                else:
+                    tokens_syls[token] = [token]
+            # Thai
+            elif syl_tokenizer == 'pythainlp_han_solo':
+                tokens_syls[token] = pythainlp.tokenize.syllable_tokenize(token, engine = 'han_solo')
+            elif syl_tokenizer == 'pythainlp_syl_dict':
+                tokens_syls[token] = pythainlp.tokenize.syllable_tokenize(token, engine = 'dict')
+
+    return [tokens_syls[token] if token else [] for token in tokens]
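
Note (not part of the patch): the rewritten wl_syl_tokenize_tokens syllabifies each unique token type once and then maps the results back onto the original token sequence, with empty tokens yielding empty syllable lists. Below is a minimal, self-contained sketch of this syllabify-by-type pattern; the function name syllabify_tokens and the injected syllabify callable are hypothetical stand-ins for the real NLTK/Pyphen/PyThaiNLP backends selected in the patch.

def syllabify_tokens(tokens, syllabify):
    # Syllabify unique types only, since syllabification needs no sentence context;
    # repeated tokens are processed once instead of once per occurrence.
    types_syls = {token: syllabify(token) for token in set(tokens) if token}

    # Map results back onto the original sequence; empty tokens map to empty lists.
    return [types_syls[token] if token else [] for token in tokens]

# Example usage with a trivial stand-in syllabifier that returns the token unchanged:
print(syllabify_tokens(['linguistics', '', 'linguistics'], lambda token: [token]))
# [['linguistics'], [], ['linguistics']]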