Utils: Fix Wordless's Japanese kanji tokenizer
BLKSerene committed Oct 29, 2024
1 parent 56f91ca commit 3860f8b
Showing 4 changed files with 44 additions and 62 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -18,6 +18,11 @@

 <div align="center"><h1>📄 Changelog</h1></div>
 
+## [3.6.0](https://github.com/BLKSerene/Wordless/releases/tag/3.6.0) - ??/??/2024
+### 📌 Bugfixes
+- File Area: Fix Open Files - Encoding
+- Utils: Fix Wordless's Japanese kanji tokenizer
+
 ## [3.5.0](https://github.com/BLKSerene/Wordless/releases/tag/3.5.0) - 07/01/2024
 ### 🎉 New Features
 - File Area: Add support for .lrc and .pptx files
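
Both fixes below follow the same pattern: instead of walking each sentence character by character and flushing a temporary buffer whenever the script changes, the tokenizer now tags every character with a script code ('h' for han/kanji, 'k' for kana, 'o' for everything else) and then slices maximal same-script runs off the front of the sentence with a regex, handing each run to the appropriate tokenizer. Below is a minimal, self-contained sketch of that run-splitting idea; the character checks are simplified stand-ins for Wordless's wl_checks_tokens.is_han() and wl_checks_tokens.is_kana().

```python
import re

def char_script(char):
    # Simplified script detection, for illustration only; Wordless uses
    # wl_checks_tokens.is_han() / wl_checks_tokens.is_kana() internally
    if '\u4E00' <= char <= '\u9FFF':
        return 'h'  # han / kanji
    elif '\u3040' <= char <= '\u30FF':
        return 'k'  # hiragana / katakana
    else:
        return 'o'  # any other script

def split_by_script(sentence):
    # Tag every character, then peel maximal same-script runs off the front
    char_scripts = ''.join(char_script(char) for char in sentence)
    runs = []

    while sentence:
        len_token = len(re.search(r'^h+|^k+|^o+', char_scripts).group())
        runs.append((char_scripts[0], sentence[:len_token]))

        char_scripts = char_scripts[len_token:]
        sentence = sentence[len_token:]

    return runs

print(split_by_script('これは日本語です。'))
# [('k', 'これは'), ('h', '日本語'), ('k', 'です'), ('o', '。')]
```

In the rewritten Japanese branch, 'h' runs are emitted one kanji per token, 'k' runs are passed to wl_word_tokenize_flat() with lang = 'jpn', and everything else goes through lang = 'other'; the Chinese character tokenizer does the same with just 'h' and 'o'.
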
2 changes: 1 addition & 1 deletion tests/tests_file_area/test_file_area_file_types.py
@@ -217,7 +217,7 @@ def update_gui_file_types(err_msg, new_files):
     print(file_text_tgt.lang)
     print(tokens_tgt)
 
-    # Avoid loading the French model
+    # Avoid loading spaCy's French model
     assert file_text_tgt.lang == 'eng_gb'
     assert tokens_tgt == [[[['Bonjour', 'tout', 'le', 'monde', '!']]]]
     assert tags_tgt == [None] * 5
2 changes: 2 additions & 0 deletions tests/tests_nlp/test_word_tokenization.py
@@ -25,6 +25,8 @@
 _, is_macos, _ = wl_misc.check_os()
 
 main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast')
+# Avoid loading spaCy's Japanese model when testing the Japanese kanji tokenizer
+main.settings_default['word_tokenization']['word_tokenizer_settings']['jpn'] = 'sudachipy_jpn_split_mode_a'
 
 test_word_tokenizers = []
 test_word_tokenizers_local = []
97 changes: 36 additions & 61 deletions wordless/wl_nlp/wl_word_tokenization.py
@@ -16,6 +16,8 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 # ----------------------------------------------------------------------
 
+import re
+
 import khmernltk
 import laonlp
 import pythainlp
@@ -114,29 +116,29 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'):
                     tokens_multilevel[-1].append(main.pkuseg_word_tokenizer.cut(sentence))
             elif word_tokenizer == 'wordless_zho_char':
                 for sentence in sentences:
+                    char_scripts = ''
                     tokens = []
-                    token_temp = ''
 
                     for char in sentence:
                         if wl_checks_tokens.is_han(char):
-                            if token_temp:
-                                tokens.extend(wl_word_tokenize_flat(
-                                    main, token_temp,
-                                    lang = 'other'
-                                ))
-
-                                token_temp = ''
-
-                            tokens.append(char)
-                        # Other languages
-                        elif not wl_checks_tokens.is_han(char):
-                            token_temp += char
-
-                    if token_temp:
-                        tokens.extend(wl_word_tokenize_flat(
-                            main, token_temp,
-                            lang = 'other'
-                        ))
+                            char_scripts += 'h'
+                        else:
+                            char_scripts += 'o'
+
+                    while sentence:
+                        len_token = len(re.search(r'^h+|^o+', char_scripts).group())
+                        token = sentence[:len_token]
+
+                        if char_scripts.startswith('h'):
+                            tokens.extend(token)
+                        else:
+                            tokens.extend(wl_word_tokenize_flat(
+                                main, token,
+                                lang = 'other'
+                            ))
+
+                        char_scripts = char_scripts[len_token:]
+                        sentence = sentence[len_token:]
 
                     tokens_multilevel[-1].append(tokens)
             # Japanese
@@ -160,65 +162,38 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'):
                 sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, line, lang = lang)
 
                 for sentence in sentences:
+                    char_scripts = ''
                     tokens = []
-                    token_temp = ''
-                    char_last_script = 'kanji'
 
                     for char in sentence:
                         if wl_checks_tokens.is_han(char):
-                            if token_temp:
-                                if char_last_script == 'kana':
-                                    tokens.extend(wl_word_tokenize_flat(
-                                        main, token_temp,
-                                        lang = 'jpn'
-                                    ))
-                                elif char_last_script == 'other':
-                                    tokens.extend(wl_word_tokenize_flat(
-                                        main, token_temp,
-                                        lang = 'other'
-                                    ))
-
-                                token_temp = ''
-
-                            tokens.append(char)
-                            char_last_script = 'kanji'
+                            char_scripts += 'h'
                         # Kana
                         elif wl_checks_tokens.is_kana(char):
-                            if token_temp and char_last_script == 'other':
-                                tokens.extend(wl_word_tokenize_flat(
-                                    main, token_temp,
-                                    lang = 'other'
-                                ))
-
-                                token_temp = ''
-
-                            token_temp += char
-                            char_last_script = 'kana'
+                            char_scripts += 'k'
                         # Other languages
                         else:
-                            if token_temp and char_last_script == 'kana':
-                                tokens.extend(wl_word_tokenize_flat(
-                                    main, token_temp,
-                                    lang = 'jpn'
-                                ))
-
-                                token_temp = ''
-
-                            token_temp += char
-                            char_last_script = 'other'
-
-                    if token_temp:
-                        if char_last_script == 'kana':
-                            tokens.extend(wl_word_tokenize_flat(
-                                main, token_temp,
-                                lang = 'jpn'
-                            ))
-                        elif char_last_script == 'other':
-                            tokens.extend(wl_word_tokenize_flat(
-                                main, token_temp,
-                                lang = 'other'
-                            ))
+                            char_scripts += 'o'
+
+                    while sentence:
+                        len_token = len(re.search(r'^h+|^k+|^o+', char_scripts).group())
+                        token = sentence[:len_token]
+
+                        if char_scripts.startswith('h'):
+                            tokens.extend(token)
+                        elif char_scripts.startswith('k'):
+                            tokens.extend(wl_word_tokenize_flat(
+                                main, token,
+                                lang = 'jpn',
+                                word_tokenizer = main.settings_default['word_tokenization']['word_tokenizer_settings']['jpn']
+                            ))
+                        else:
+                            tokens.extend(wl_word_tokenize_flat(
+                                main, token,
+                                lang = 'other'
+                            ))
+
+                        char_scripts = char_scripts[len_token:]
+                        sentence = sentence[len_token:]
 
                     tokens_multilevel[-1].append(tokens)
             # Khmer
             elif word_tokenizer == 'khmer_nltk_khm':

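Taken together, the rewritten wordless_jpn_kanji branch emits each kanji as its own token, delegates kana runs to a Japanese word tokenizer, and sends all remaining runs to the generic 'other' tokenizer. Note that kana runs are now tokenized with the tokenizer recorded in main.settings_default for Japanese instead of the previous implicit default, which is why the test above pins that setting to SudachiPy so spaCy's Japanese model never gets loaded. A rough sketch of the overall flow, reusing split_by_script() from the earlier snippet; the two tokenizer arguments are hypothetical placeholders, not Wordless's API.

```python
def tokenize_jpn_kanji(sentence, jpn_tokenizer, other_tokenizer):
    # Sketch only: split_by_script() is defined in the earlier snippet, and the
    # tokenizer callables stand in for wl_word_tokenize_flat(..., lang = 'jpn')
    # and wl_word_tokenize_flat(..., lang = 'other')
    tokens = []

    for script, run in split_by_script(sentence):
        if script == 'h':
            tokens.extend(run)                   # one token per kanji
        elif script == 'k':
            tokens.extend(jpn_tokenizer(run))    # kana runs
        else:
            tokens.extend(other_tokenizer(run))  # punctuation, Latin text, etc.

    return tokens
```

For a sentence such as '数学を勉強します。', the run splitting alone yields ['数学', 'を', '勉強', 'します', '。']; how the kana runs are further divided then depends entirely on the configured Japanese tokenizer.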