Work Area: Fix Wordlist Generator - Syllabification
BLKSerene committed Jan 2, 2025
1 parent 98d849a commit 2d6662c
Showing 2 changed files with 31 additions and 29 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -32,6 +32,7 @@
 - File Area: Fix Open Files - Opening Non-text Files - Do not show this again
 - Utils: Fix Wordless's Japanese kanji tokenizer
 - Work Area: Fix Wordlist Generator - Filter results - Number of syllables
+- Work Area: Fix Wordlist Generator - Syllabification
 - Work Area: Fix Work Area - Filter results - File to filter
 
 ### ❌ Removals
59 changes: 30 additions & 29 deletions wordless/wl_nlp/wl_syl_tokenization.py
@@ -59,7 +59,6 @@ def wl_syl_tokenize(main, inputs, lang, syl_tokenizer = 'default', force = False
             syls_tokens = [
                 tuple(wl_texts.clean_texts(syls))
                 for syls in syls_tokens
-                if any(syls)
             ]
 
         if isinstance(inputs, str):
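
In this hunk, the "if any(syls)" filter in the cleanup comprehension presumably dropped any token whose syllabification came back empty, so the cleaned list could end up shorter than, and misaligned with, the token sequence it was built from; removing the filter keeps the two sequences one-to-one. A minimal sketch of the misalignment, with illustrative stand-in data rather than real pipeline output:

    tokens = ['saw', '']
    syls_tokens = [('saw',), ()]  # one (possibly empty) syllable tuple per token

    # With the filter, the empty entry vanishes and alignment with tokens breaks
    filtered = [syls for syls in syls_tokens if any(syls)]  # [('saw',)]
    # Without it, the cleaned list keeps one entry per token
    cleaned = list(syls_tokens)  # [('saw',), ()]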
@@ -83,31 +82,33 @@
     return inputs
 
 def wl_syl_tokenize_tokens(main, tokens, lang, syl_tokenizer):
-    syls_tokens = []
-
-    for token in tokens:
-        # NLTK
-        if syl_tokenizer == 'nltk_legality':
-            nltk_syl_tokenizer_legality = main.__dict__['nltk_syl_tokenizer_legality']
-
-            syls_tokens.append(nltk_syl_tokenizer_legality.tokenize(token))
-        elif syl_tokenizer == 'nltk_sonority_sequencing':
-            nltk_syl_tokenizer_sonority_sequencing = main.__dict__['nltk_syl_tokenizer_sonority_sequencing']
-
-            syls_tokens.append(nltk_syl_tokenizer_sonority_sequencing.tokenize(token))
-        # Pyphen
-        elif syl_tokenizer.startswith('pyphen_'):
-            pyphen_syl_tokenizer = main.__dict__[f'pyphen_syl_tokenizer_{lang}']
-            syls = re.split(r'\-+', pyphen_syl_tokenizer.inserted(token))
-
-            if any(syls):
-                syls_tokens.append(syls)
-            else:
-                syls_tokens.append([token])
-        # Thai
-        elif syl_tokenizer == 'pythainlp_han_solo':
-            syls_tokens.append(pythainlp.tokenize.syllable_tokenize(token, engine = 'han_solo'))
-        elif syl_tokenizer == 'pythainlp_syl_dict':
-            syls_tokens.append(pythainlp.tokenize.syllable_tokenize(token, engine = 'dict'))
-
-    return syls_tokens
+    tokens_syls = {}
+
+    # Syllabify types only as context information is not needed
+    for token in set(tokens):
+        if token:
+            # NLTK
+            if syl_tokenizer == 'nltk_legality':
+                nltk_syl_tokenizer_legality = main.__dict__['nltk_syl_tokenizer_legality']
+
+                tokens_syls[token] = nltk_syl_tokenizer_legality.tokenize(token)
+            elif syl_tokenizer == 'nltk_sonority_sequencing':
+                nltk_syl_tokenizer_sonority_sequencing = main.__dict__['nltk_syl_tokenizer_sonority_sequencing']
+
+                tokens_syls[token] = nltk_syl_tokenizer_sonority_sequencing.tokenize(token)
+            # Pyphen
+            elif syl_tokenizer.startswith('pyphen_'):
+                pyphen_syl_tokenizer = main.__dict__[f'pyphen_syl_tokenizer_{lang}']
+                syls = re.split(r'\-+', pyphen_syl_tokenizer.inserted(token))
+
+                if any(syls):
+                    tokens_syls[token] = syls
+                else:
+                    tokens_syls[token] = [token]
+            # Thai
+            elif syl_tokenizer == 'pythainlp_han_solo':
+                tokens_syls[token] = pythainlp.tokenize.syllable_tokenize(token, engine = 'han_solo')
+            elif syl_tokenizer == 'pythainlp_syl_dict':
+                tokens_syls[token] = pythainlp.tokenize.syllable_tokenize(token, engine = 'dict')
+
+    return [tokens_syls[token] if token else [] for token in tokens]
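
The rewritten function syllabifies each distinct non-empty token (each type) exactly once, then maps the cached results back onto the original token sequence, with empty tokens mapping to an empty list, so the output always stays the same length as the input. A minimal sketch of the same per-type caching pattern, using a hypothetical syllabify callable in place of the tokenizer-specific branches above:

    def syl_tokenize_types(tokens, syllabify):
        # Syllabify each distinct non-empty token (type) exactly once
        types_syls = {token: syllabify(token) for token in set(tokens) if token}

        # Map cached results back onto the token sequence; empty tokens yield []
        return [types_syls[token] if token else [] for token in tokens]

    # e.g. syl_tokenize_types(['banana', '', 'banana'], lambda token: [token])
    # returns [['banana'], [], ['banana']]; 'banana' is syllabified only once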
