diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0b205450d..9f9faf424 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,11 @@
 
 📄 Changelog
 
+## [3.6.0](https://github.com/BLKSerene/Wordless/releases/tag/3.6.0) - ??/??/2024
+### 📌 Bugfixes
+- File Area: Fix Open Files - Encoding
+- Utils: Fix Wordless's Japanese kanji tokenizer
+
 ## [3.5.0](https://github.com/BLKSerene/Wordless/releases/tag/3.5.0) - 07/01/2024
 ### 🎉 New Features
 - File Area: Add support for .lrc and .pptx files
diff --git a/wordless/wl_nlp/wl_word_tokenization.py b/wordless/wl_nlp/wl_word_tokenization.py
index ba7025ae5..cbd80810d 100644
--- a/wordless/wl_nlp/wl_word_tokenization.py
+++ b/wordless/wl_nlp/wl_word_tokenization.py
@@ -16,6 +16,8 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.
 # ----------------------------------------------------------------------
 
+import re
+
 import khmernltk
 import laonlp
 import pythainlp
@@ -114,29 +116,29 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'):
                     tokens_multilevel[-1].append(main.pkuseg_word_tokenizer.cut(sentence))
             elif word_tokenizer == 'wordless_zho_char':
                 for sentence in sentences:
+                    char_scripts = ''
                     tokens = []
-                    token_temp = ''
 
                     for char in sentence:
                         if wl_checks_tokens.is_han(char):
-                            if token_temp:
-                                tokens.extend(wl_word_tokenize_flat(
-                                    main, token_temp,
-                                    lang = 'other'
-                                ))
+                            char_scripts += 'h'
+                        else:
+                            char_scripts += 'o'
 
-                                token_temp = ''
+                    while sentence:
+                        len_token = len(re.search(r'^h+|^o+', char_scripts).group())
+                        token = sentence[:len_token]
 
-                            tokens.append(char)
-                        # Other languages
-                        elif not wl_checks_tokens.is_han(char):
-                            token_temp += char
+                        if char_scripts.startswith('h'):
+                            tokens.extend(token)
+                        else:
+                            tokens.extend(wl_word_tokenize_flat(
+                                main, token,
+                                lang = 'other'
+                            ))
 
-                    if token_temp:
-                        tokens.extend(wl_word_tokenize_flat(
-                            main, token_temp,
-                            lang = 'other'
-                        ))
+                        char_scripts = char_scripts[len_token:]
+                        sentence = sentence[len_token:]
 
                     tokens_multilevel[-1].append(tokens)
             # Japanese
@@ -160,65 +162,38 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'):
                 sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, line, lang = lang)
 
                 for sentence in sentences:
+                    char_scripts = ''
                     tokens = []
-                    token_temp = ''
-                    char_last_script = 'kanji'
 
                     for char in sentence:
                         if wl_checks_tokens.is_han(char):
-                            if token_temp:
-                                if char_last_script == 'kana':
-                                    tokens.extend(wl_word_tokenize_flat(
-                                        main, token_temp,
-                                        lang = 'jpn'
-                                    ))
-                                elif char_last_script == 'other':
-                                    tokens.extend(wl_word_tokenize_flat(
-                                        main, token_temp,
-                                        lang = 'other'
-                                    ))
-
-                                token_temp = ''
-
-                            tokens.append(char)
-                            char_last_script = 'kanji'
-                        # Kana
+                            char_scripts += 'h'
                         elif wl_checks_tokens.is_kana(char):
-                            if token_temp and char_last_script == 'other':
-                                tokens.extend(wl_word_tokenize_flat(
-                                    main, token_temp,
-                                    lang = 'other'
-                                ))
-
-                                token_temp = ''
-
-                            token_temp += char
-                            char_last_script = 'kana'
-                        # Other languages
+                            char_scripts += 'k'
                         else:
-                            if token_temp and char_last_script == 'kana':
-                                tokens.extend(wl_word_tokenize_flat(
-                                    main, token_temp,
-                                    lang = 'jpn'
-                                ))
-
-                                token_temp = ''
+                            char_scripts += 'o'
 
-                            token_temp += char
-                            char_last_script = 'other'
+                    while sentence:
+                        len_token = len(re.search(r'^h+|^k+|^o+', char_scripts).group())
+                        token = sentence[:len_token]
 
-                    if token_temp:
-                        if char_last_script == 'kana':
+                        if char_scripts.startswith('h'):
+                            tokens.extend(token)
+                        elif char_scripts.startswith('k'):
                             tokens.extend(wl_word_tokenize_flat(
-                                main, token_temp,
-                                lang = 'jpn'
+                                main, token,
+                                lang = 'jpn',
+                                word_tokenizer = 'spacy_jpn'
                             ))
-                        elif char_last_script == 'other':
+                        else:
                             tokens.extend(wl_word_tokenize_flat(
-                                main, token_temp,
+                                main, token,
                                 lang = 'other'
                             ))
 
+                        char_scripts = char_scripts[len_token:]
+                        sentence = sentence[len_token:]
+
                     tokens_multilevel[-1].append(tokens)
             # Khmer
             elif word_tokenizer == 'khmer_nltk_khm':
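Note on the approach: the patched tokenizer first maps every character of a sentence to a script tag ('h' for Han/kanji, 'k' for kana in the Japanese branch, 'o' for everything else) and then repeatedly slices off the leading run of identical tags with a regex, so each run is handled once instead of flushing a temporary buffer on every script change. Below is a minimal standalone sketch of that run-splitting step; the is_han()/is_kana() helpers are simplified stand-ins with narrower Unicode ranges than the wl_checks_tokens checks Wordless actually uses.

```python
import re

# Simplified stand-ins for wl_checks_tokens.is_han()/is_kana()
# (hypothetical, narrower ranges than Wordless's real checks)
def is_han(char):
    return '\u4E00' <= char <= '\u9FFF'

def is_kana(char):
    return '\u3040' <= char <= '\u30FF'

def split_by_script(sentence):
    # Tag every character with its script class
    char_scripts = ''.join(
        'h' if is_han(char) else 'k' if is_kana(char) else 'o'
        for char in sentence
    )

    segments = []

    # Peel off the leading run of identical tags and the matching
    # slice of the sentence until nothing is left
    while sentence:
        len_token = len(re.search(r'^h+|^k+|^o+', char_scripts).group())
        segments.append((char_scripts[0], sentence[:len_token]))

        char_scripts = char_scripts[len_token:]
        sentence = sentence[len_token:]

    return segments

print(split_by_script('私はpythonが好きです。'))
# [('h', '私'), ('k', 'は'), ('o', 'python'), ('k', 'が'),
#  ('h', '好'), ('k', 'きです'), ('o', '。')]
```

In the patch itself, 'h' runs are emitted character by character (one token per kanji), 'k' runs are re-tokenized as Japanese via wl_word_tokenize_flat(), and 'o' runs are tokenized with lang = 'other'.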