Work Area: Fix Wordlist Generator - Syllabification
BLKSerene committed Jan 2, 2025
1 parent 98d849a commit 2d6662c
Showing 2 changed files with 31 additions and 29 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -32,6 +32,7 @@
 - File Area: Fix Open Files - Opening Non-text Files - Do not show this again
 - Utils: Fix Wordless's Japanese kanji tokenizer
 - Work Area: Fix Wordlist Generator - Filter results - Number of syllables
+- Work Area: Fix Wordlist Generator - Syllabification
 - Work Area: Fix Work Area - Filter results - File to filter
 
 ### ❌ Removals
59 changes: 30 additions & 29 deletions wordless/wl_nlp/wl_syl_tokenization.py
@@ -59,7 +59,6 @@ def wl_syl_tokenize(main, inputs, lang, syl_tokenizer = 'default', force = False
             syls_tokens = [
                 tuple(wl_texts.clean_texts(syls))
                 for syls in syls_tokens
-                if any(syls)
             ]
 
         if isinstance(inputs, str):
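
In this hunk, the "if any(syls)" filter in the cleanup comprehension presumably dropped any token whose syllabification came back empty, so the cleaned list could end up shorter than, and misaligned with, the token sequence it was built from; removing the filter keeps the two sequences one-to-one. A minimal sketch of the misalignment, with illustrative stand-in data rather than real pipeline output:

    tokens = ['saw', '']
    syls_tokens = [('saw',), ()]  # one (possibly empty) syllable tuple per token

    # With the filter, the empty entry vanishes and alignment with tokens breaks
    filtered = [syls for syls in syls_tokens if any(syls)]  # [('saw',)]
    # Without it, the cleaned list keeps one entry per token
    cleaned = list(syls_tokens)  # [('saw',), ()]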
@@ -83,31 +82,33 @@
     return inputs
 
 def wl_syl_tokenize_tokens(main, tokens, lang, syl_tokenizer):
-    syls_tokens = []
-
-    for token in tokens:
-        # NLTK
-        if syl_tokenizer == 'nltk_legality':
-            nltk_syl_tokenizer_legality = main.__dict__['nltk_syl_tokenizer_legality']
-
-            syls_tokens.append(nltk_syl_tokenizer_legality.tokenize(token))
-        elif syl_tokenizer == 'nltk_sonority_sequencing':
-            nltk_syl_tokenizer_sonority_sequencing = main.__dict__['nltk_syl_tokenizer_sonority_sequencing']
-
-            syls_tokens.append(nltk_syl_tokenizer_sonority_sequencing.tokenize(token))
-        # Pyphen
-        elif syl_tokenizer.startswith('pyphen_'):
-            pyphen_syl_tokenizer = main.__dict__[f'pyphen_syl_tokenizer_{lang}']
-            syls = re.split(r'\-+', pyphen_syl_tokenizer.inserted(token))
-
-            if any(syls):
-                syls_tokens.append(syls)
-            else:
-                syls_tokens.append([token])
-        # Thai
-        elif syl_tokenizer == 'pythainlp_han_solo':
-            syls_tokens.append(pythainlp.tokenize.syllable_tokenize(token, engine = 'han_solo'))
-        elif syl_tokenizer == 'pythainlp_syl_dict':
-            syls_tokens.append(pythainlp.tokenize.syllable_tokenize(token, engine = 'dict'))
-
-    return syls_tokens
+    tokens_syls = {}
+
+    # Syllabify types only as context information is not needed
+    for token in set(tokens):
+        if token:
+            # NLTK
+            if syl_tokenizer == 'nltk_legality':
+                nltk_syl_tokenizer_legality = main.__dict__['nltk_syl_tokenizer_legality']
+
+                tokens_syls[token] = nltk_syl_tokenizer_legality.tokenize(token)
+            elif syl_tokenizer == 'nltk_sonority_sequencing':
+                nltk_syl_tokenizer_sonority_sequencing = main.__dict__['nltk_syl_tokenizer_sonority_sequencing']
+
+                tokens_syls[token] = nltk_syl_tokenizer_sonority_sequencing.tokenize(token)
+            # Pyphen
+            elif syl_tokenizer.startswith('pyphen_'):
+                pyphen_syl_tokenizer = main.__dict__[f'pyphen_syl_tokenizer_{lang}']
+                syls = re.split(r'\-+', pyphen_syl_tokenizer.inserted(token))
+
+                if any(syls):
+                    tokens_syls[token] = syls
+                else:
+                    tokens_syls[token] = [token]
+            # Thai
+            elif syl_tokenizer == 'pythainlp_han_solo':
+                tokens_syls[token] = pythainlp.tokenize.syllable_tokenize(token, engine = 'han_solo')
+            elif syl_tokenizer == 'pythainlp_syl_dict':
+                tokens_syls[token] = pythainlp.tokenize.syllable_tokenize(token, engine = 'dict')
+
+    return [tokens_syls[token] if token else [] for token in tokens]
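
The rewritten function syllabifies each distinct non-empty token (each type) exactly once, then maps the cached results back onto the original token sequence, with empty tokens mapping to an empty list, so the output always stays the same length as the input. A minimal sketch of the same per-type caching pattern, using a hypothetical syllabify callable in place of the tokenizer-specific branches above:

    def syl_tokenize_types(tokens, syllabify):
        # Syllabify each distinct non-empty token (type) exactly once
        types_syls = {token: syllabify(token) for token in set(tokens) if token}

        # Map cached results back onto the token sequence; empty tokens yield []
        return [types_syls[token] if token else [] for token in tokens]

    # e.g. syl_tokenize_types(['banana', '', 'banana'], lambda token: [token])
    # returns [['banana'], [], ['banana']]; 'banana' is syllabified only once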
