From 3860f8ba1f52d47d261865776f6039c8324f799c Mon Sep 17 00:00:00 2001
From: BLKSerene
Date: Tue, 29 Oct 2024 16:48:51 +0800
Subject: [PATCH] Utils: Fix Wordless's Japanese kanji tokenizer

---
 CHANGELOG.md                              |  5 +
 .../test_file_area_file_types.py          |  2 +-
 tests/tests_nlp/test_word_tokenization.py |  2 +
 wordless/wl_nlp/wl_word_tokenization.py   | 97 +++++++------------
 4 files changed, 44 insertions(+), 62 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0b205450d..9f9faf424 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,11 @@
 
 📄 Changelog
 
+## [3.6.0](https://github.com/BLKSerene/Wordless/releases/tag/3.6.0) - ??/??/2024
+### 📌 Bugfixes
+- File Area: Fix Open Files - Encoding
+- Utils: Fix Wordless's Japanese kanji tokenizer
+
 ## [3.5.0](https://github.com/BLKSerene/Wordless/releases/tag/3.5.0) - 07/01/2024
 ### 🎉 New Features
 - File Area: Add support for .lrc and .pptx files
diff --git a/tests/tests_file_area/test_file_area_file_types.py b/tests/tests_file_area/test_file_area_file_types.py
index 741a300d3..312327c7a 100644
--- a/tests/tests_file_area/test_file_area_file_types.py
+++ b/tests/tests_file_area/test_file_area_file_types.py
@@ -217,7 +217,7 @@ def update_gui_file_types(err_msg, new_files):
     print(file_text_tgt.lang)
     print(tokens_tgt)
 
-    # Avoid loading the French model
+    # Avoid loading spaCy's French model
     assert file_text_tgt.lang == 'eng_gb'
     assert tokens_tgt == [[[['Bonjour', 'tout', 'le', 'monde', '!']]]]
     assert tags_tgt == [None] * 5
diff --git a/tests/tests_nlp/test_word_tokenization.py b/tests/tests_nlp/test_word_tokenization.py
index 3dcaa1e93..bf199bb9c 100644
--- a/tests/tests_nlp/test_word_tokenization.py
+++ b/tests/tests_nlp/test_word_tokenization.py
@@ -25,6 +25,8 @@
 _, is_macos, _ = wl_misc.check_os()
 
 main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast')
+# Avoid loading spaCy's Japanese model when testing the Japanese kanji tokenizer
+main.settings_default['word_tokenization']['word_tokenizer_settings']['jpn'] = 'sudachipy_jpn_split_mode_a'
 
 test_word_tokenizers = []
 test_word_tokenizers_local = []
diff --git a/wordless/wl_nlp/wl_word_tokenization.py b/wordless/wl_nlp/wl_word_tokenization.py
index ba7025ae5..5caf340fd 100644
--- a/wordless/wl_nlp/wl_word_tokenization.py
+++ b/wordless/wl_nlp/wl_word_tokenization.py
@@ -16,6 +16,8 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 # ----------------------------------------------------------------------
 
+import re
+
 import khmernltk
 import laonlp
 import pythainlp
@@ -114,29 +116,29 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'):
                     tokens_multilevel[-1].append(main.pkuseg_word_tokenizer.cut(sentence))
             elif word_tokenizer == 'wordless_zho_char':
                 for sentence in sentences:
+                    char_scripts = ''
                     tokens = []
-                    token_temp = ''
 
                     for char in sentence:
                         if wl_checks_tokens.is_han(char):
-                            if token_temp:
-                                tokens.extend(wl_word_tokenize_flat(
-                                    main, token_temp,
-                                    lang = 'other'
-                                ))
+                            char_scripts += 'h'
+                        else:
+                            char_scripts += 'o'
 
-                                token_temp = ''
+                    while sentence:
+                        len_token = len(re.search(r'^h+|^o+', char_scripts).group())
+                        token = sentence[:len_token]
 
-                            tokens.append(char)
-                        # Other languages
-                        elif not wl_checks_tokens.is_han(char):
-                            token_temp += char
+                        if char_scripts.startswith('h'):
+                            tokens.extend(token)
+                        else:
+                            tokens.extend(wl_word_tokenize_flat(
+                                main, token,
+                                lang = 'other'
+                            ))
 
-                    if token_temp:
-                        tokens.extend(wl_word_tokenize_flat(
-                            main, token_temp,
-                            lang = 'other'
-                        ))
+                        char_scripts = char_scripts[len_token:]
+                        sentence = sentence[len_token:]
 
                     tokens_multilevel[-1].append(tokens)
             # Japanese
@@ -160,65 +162,38 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'):
                 sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, line, lang = lang)
 
                 for sentence in sentences:
+                    char_scripts = ''
                     tokens = []
-                    token_temp = ''
-                    char_last_script = 'kanji'
 
                     for char in sentence:
                         if wl_checks_tokens.is_han(char):
-                            if token_temp:
-                                if char_last_script == 'kana':
-                                    tokens.extend(wl_word_tokenize_flat(
-                                        main, token_temp,
-                                        lang = 'jpn'
-                                    ))
-                                elif char_last_script == 'other':
-                                    tokens.extend(wl_word_tokenize_flat(
-                                        main, token_temp,
-                                        lang = 'other'
-                                    ))
-
-                                token_temp = ''
-
-                            tokens.append(char)
-                            char_last_script = 'kanji'
-                        # Kana
+                            char_scripts += 'h'
                         elif wl_checks_tokens.is_kana(char):
-                            if token_temp and char_last_script == 'other':
-                                tokens.extend(wl_word_tokenize_flat(
-                                    main, token_temp,
-                                    lang = 'other'
-                                ))
-
-                                token_temp = ''
-
-                            token_temp += char
-                            char_last_script = 'kana'
-                        # Other languages
+                            char_scripts += 'k'
                         else:
-                            if token_temp and char_last_script == 'kana':
-                                tokens.extend(wl_word_tokenize_flat(
-                                    main, token_temp,
-                                    lang = 'jpn'
-                                ))
-
-                                token_temp = ''
+                            char_scripts += 'o'
 
-                            token_temp += char
-                            char_last_script = 'other'
+                    while sentence:
+                        len_token = len(re.search(r'^h+|^k+|^o+', char_scripts).group())
+                        token = sentence[:len_token]
 
-                    if token_temp:
-                        if char_last_script == 'kana':
+                        if char_scripts.startswith('h'):
+                            tokens.extend(token)
+                        elif char_scripts.startswith('k'):
                             tokens.extend(wl_word_tokenize_flat(
-                                main, token_temp,
-                                lang = 'jpn'
+                                main, token,
+                                lang = 'jpn',
+                                word_tokenizer = main.settings_default['word_tokenization']['word_tokenizer_settings']['jpn']
                             ))
-                        elif char_last_script == 'other':
+                        else:
                             tokens.extend(wl_word_tokenize_flat(
-                                main, token_temp,
+                                main, token,
                                 lang = 'other'
                             ))
 
+                        char_scripts = char_scripts[len_token:]
+                        sentence = sentence[len_token:]
+
                     tokens_multilevel[-1].append(tokens)
             # Khmer
             elif word_tokenizer == 'khmer_nltk_khm':
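
For reviewers, the standalone sketch below illustrates the script-run splitting that both rewritten branches now share: each character is first mapped to a script code ('h' for han/kanji, 'k' for kana, 'o' for anything else), and the sentence is then consumed run by run via re.search(r'^h+|^k+|^o+', ...). This is only an illustration, not the patched code itself: the is_han()/is_kana() range checks are simplified stand-ins for wl_checks_tokens.is_han()/is_kana(), and kana and non-Japanese runs are handled with placeholders instead of being dispatched to wl_word_tokenize_flat().

import re

def is_han(char):
    # Simplified stand-in for wl_checks_tokens.is_han(): basic CJK Unified Ideographs block only
    return '\u4e00' <= char <= '\u9fff'

def is_kana(char):
    # Simplified stand-in for wl_checks_tokens.is_kana(): hiragana and katakana blocks
    return '\u3040' <= char <= '\u30ff'

def tokenize_jpn_kanji_sketch(sentence):
    # 1. Record each character's script in a parallel string of script codes.
    char_scripts = ''.join(
        'h' if is_han(char) else 'k' if is_kana(char) else 'o'
        for char in sentence
    )
    tokens = []

    # 2. Repeatedly peel off the longest leading run of a single script.
    while sentence:
        len_token = len(re.search(r'^h+|^k+|^o+', char_scripts).group())
        token = sentence[:len_token]

        if char_scripts.startswith('h'):
            # Kanji run: one token per character
            tokens.extend(token)
        elif char_scripts.startswith('k'):
            # Kana run: kept whole here; the patch hands it to the Japanese word tokenizer
            tokens.append(token)
        else:
            # Other scripts: split on whitespace here; the patch hands it to a generic tokenizer
            tokens.extend(token.split())

        char_scripts = char_scripts[len_token:]
        sentence = sentence[len_token:]

    return tokens

print(tokenize_jpn_kanji_sketch('漢字とカタカナ and some English'))
# ['漢', '字', 'とカタカナ', 'and', 'some', 'English']

The 'wordless_zho_char' branch uses the same loop with only the 'h'/'o' classes, emitting each han character as its own token and sending every other run to the generic tokenizer.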