From 3860f8ba1f52d47d261865776f6039c8324f799c Mon Sep 17 00:00:00 2001
From: BLKSerene
Date: Tue, 29 Oct 2024 16:48:51 +0800
Subject: [PATCH] Utils: Fix Wordless's Japanese kanji tokenizer

---
 CHANGELOG.md                              |  5 +
 .../test_file_area_file_types.py          |  2 +-
 tests/tests_nlp/test_word_tokenization.py |  2 +
 wordless/wl_nlp/wl_word_tokenization.py   | 97 +++++++------------
 4 files changed, 44 insertions(+), 62 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0b205450d..9f9faf424 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,11 @@
 
 📄 Changelog
 
+## [3.6.0](https://github.com/BLKSerene/Wordless/releases/tag/3.6.0) - ??/??/2024
+### 📌 Bugfixes
+- File Area: Fix Open Files - Encoding
+- Utils: Fix Wordless's Japanese kanji tokenizer
+
 ## [3.5.0](https://github.com/BLKSerene/Wordless/releases/tag/3.5.0) - 07/01/2024
 ### 🎉 New Features
 - File Area: Add support for .lrc and .pptx files
diff --git a/tests/tests_file_area/test_file_area_file_types.py b/tests/tests_file_area/test_file_area_file_types.py
index 741a300d3..312327c7a 100644
--- a/tests/tests_file_area/test_file_area_file_types.py
+++ b/tests/tests_file_area/test_file_area_file_types.py
@@ -217,7 +217,7 @@ def update_gui_file_types(err_msg, new_files):
     print(file_text_tgt.lang)
     print(tokens_tgt)
 
-    # Avoid loading the French model
+    # Avoid loading spaCy's French model
     assert file_text_tgt.lang == 'eng_gb'
     assert tokens_tgt == [[[['Bonjour', 'tout', 'le', 'monde', '!']]]]
     assert tags_tgt == [None] * 5
diff --git a/tests/tests_nlp/test_word_tokenization.py b/tests/tests_nlp/test_word_tokenization.py
index 3dcaa1e93..bf199bb9c 100644
--- a/tests/tests_nlp/test_word_tokenization.py
+++ b/tests/tests_nlp/test_word_tokenization.py
@@ -25,6 +25,8 @@
 _, is_macos, _ = wl_misc.check_os()
 
 main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast')
+# Avoid loading spaCy's Japanese model when testing the Japanese kanji tokenizer
+main.settings_default['word_tokenization']['word_tokenizer_settings']['jpn'] = 'sudachipy_jpn_split_mode_a'
 
 test_word_tokenizers = []
 test_word_tokenizers_local = []
diff --git a/wordless/wl_nlp/wl_word_tokenization.py b/wordless/wl_nlp/wl_word_tokenization.py
index ba7025ae5..5caf340fd 100644
--- a/wordless/wl_nlp/wl_word_tokenization.py
+++ b/wordless/wl_nlp/wl_word_tokenization.py
@@ -16,6 +16,8 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 # ----------------------------------------------------------------------
 
+import re
+
 import khmernltk
 import laonlp
 import pythainlp
@@ -114,29 +116,29 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'):
                     tokens_multilevel[-1].append(main.pkuseg_word_tokenizer.cut(sentence))
             elif word_tokenizer == 'wordless_zho_char':
                 for sentence in sentences:
+                    char_scripts = ''
                     tokens = []
-                    token_temp = ''
 
                     for char in sentence:
                         if wl_checks_tokens.is_han(char):
-                            if token_temp:
-                                tokens.extend(wl_word_tokenize_flat(
-                                    main, token_temp,
-                                    lang = 'other'
-                                ))
+                            char_scripts += 'h'
+                        else:
+                            char_scripts += 'o'
 
-                                token_temp = ''
+                    while sentence:
+                        len_token = len(re.search(r'^h+|^o+', char_scripts).group())
+                        token = sentence[:len_token]
 
-                            tokens.append(char)
-                        # Other languages
-                        elif not wl_checks_tokens.is_han(char):
-                            token_temp += char
+                        if char_scripts.startswith('h'):
+                            tokens.extend(token)
+                        else:
+                            tokens.extend(wl_word_tokenize_flat(
+                                main, token,
+                                lang = 'other'
+                            ))
 
-                    if token_temp:
-                        tokens.extend(wl_word_tokenize_flat(
-                            main, token_temp,
-                            lang = 'other'
-                        ))
+                        char_scripts = char_scripts[len_token:]
+                        sentence = sentence[len_token:]
 
                     tokens_multilevel[-1].append(tokens)
             # Japanese
@@ -160,65 +162,38 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'):
                 sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, line, lang = lang)
 
                 for sentence in sentences:
+                    char_scripts = ''
                     tokens = []
-                    token_temp = ''
-                    char_last_script = 'kanji'
 
                     for char in sentence:
                         if wl_checks_tokens.is_han(char):
-                            if token_temp:
-                                if char_last_script == 'kana':
-                                    tokens.extend(wl_word_tokenize_flat(
-                                        main, token_temp,
-                                        lang = 'jpn'
-                                    ))
-                                elif char_last_script == 'other':
-                                    tokens.extend(wl_word_tokenize_flat(
-                                        main, token_temp,
-                                        lang = 'other'
-                                    ))
-
-                                token_temp = ''
-
-                            tokens.append(char)
-                            char_last_script = 'kanji'
-                        # Kana
+                            char_scripts += 'h'
                         elif wl_checks_tokens.is_kana(char):
-                            if token_temp and char_last_script == 'other':
-                                tokens.extend(wl_word_tokenize_flat(
-                                    main, token_temp,
-                                    lang = 'other'
-                                ))
-
-                                token_temp = ''
-
-                            token_temp += char
-                            char_last_script = 'kana'
-                        # Other languages
+                            char_scripts += 'k'
                         else:
-                            if token_temp and char_last_script == 'kana':
-                                tokens.extend(wl_word_tokenize_flat(
-                                    main, token_temp,
-                                    lang = 'jpn'
-                                ))
-
-                                token_temp = ''
+                            char_scripts += 'o'
 
-                            token_temp += char
-                            char_last_script = 'other'
+                    while sentence:
+                        len_token = len(re.search(r'^h+|^k+|^o+', char_scripts).group())
+                        token = sentence[:len_token]
 
-                    if token_temp:
-                        if char_last_script == 'kana':
+                        if char_scripts.startswith('h'):
+                            tokens.extend(token)
+                        elif char_scripts.startswith('k'):
                             tokens.extend(wl_word_tokenize_flat(
-                                main, token_temp,
-                                lang = 'jpn'
+                                main, token,
+                                lang = 'jpn',
+                                word_tokenizer = main.settings_default['word_tokenization']['word_tokenizer_settings']['jpn']
                             ))
-                        elif char_last_script == 'other':
+                        else:
                             tokens.extend(wl_word_tokenize_flat(
-                                main, token_temp,
+                                main, token,
                                 lang = 'other'
                             ))
 
+                        char_scripts = char_scripts[len_token:]
+                        sentence = sentence[len_token:]
+
                     tokens_multilevel[-1].append(tokens)
             # Khmer
             elif word_tokenizer == 'khmer_nltk_khm':
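
For reviewers, the standalone sketch below illustrates the script-run splitting that both rewritten branches now share: each character is first mapped to a script code ('h' for han/kanji, 'k' for kana, 'o' for anything else), and the sentence is then consumed run by run via re.search(r'^h+|^k+|^o+', ...). This is only an illustration, not the patched code itself: the is_han()/is_kana() range checks are simplified stand-ins for wl_checks_tokens.is_han()/is_kana(), and kana and non-Japanese runs are handled with placeholders instead of being dispatched to wl_word_tokenize_flat().

import re

def is_han(char):
    # Simplified stand-in for wl_checks_tokens.is_han(): basic CJK Unified Ideographs block only
    return '\u4e00' <= char <= '\u9fff'

def is_kana(char):
    # Simplified stand-in for wl_checks_tokens.is_kana(): hiragana and katakana blocks
    return '\u3040' <= char <= '\u30ff'

def tokenize_jpn_kanji_sketch(sentence):
    # 1. Record each character's script in a parallel string of script codes.
    char_scripts = ''.join(
        'h' if is_han(char) else 'k' if is_kana(char) else 'o'
        for char in sentence
    )
    tokens = []

    # 2. Repeatedly peel off the longest leading run of a single script.
    while sentence:
        len_token = len(re.search(r'^h+|^k+|^o+', char_scripts).group())
        token = sentence[:len_token]

        if char_scripts.startswith('h'):
            # Kanji run: one token per character
            tokens.extend(token)
        elif char_scripts.startswith('k'):
            # Kana run: kept whole here; the patch hands it to the Japanese word tokenizer
            tokens.append(token)
        else:
            # Other scripts: split on whitespace here; the patch hands it to a generic tokenizer
            tokens.extend(token.split())

        char_scripts = char_scripts[len_token:]
        sentence = sentence[len_token:]

    return tokens

print(tokenize_jpn_kanji_sketch('漢字とカタカナ and some English'))
# ['漢', '字', 'とカタカナ', 'and', 'some', 'English']

The 'wordless_zho_char' branch uses the same loop with only the 'h'/'o' classes, emitting each han character as its own token and sending every other run to the generic tokenizer.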