Commit
Utils: Fix Wordless's Japanese kanji tokenizer
BLKSerene committed Oct 29, 2024
1 parent 56f91ca commit 35577a4
Showing 2 changed files with 41 additions and 61 deletions.
CHANGELOG.md: 5 additions & 0 deletions
@@ -18,6 +18,11 @@
 
 <div align="center"><h1>📄 Changelog</h1></div>
 
+## [3.6.0](https://github.com/BLKSerene/Wordless/releases/tag/3.6.0) - ??/??/2024
+### 📌 Bugfixes
+- File Area: Fix Open Files - Encoding
+- Utils: Fix Wordless's Japanese kanji tokenizer
+
 ## [3.5.0](https://github.com/BLKSerene/Wordless/releases/tag/3.5.0) - 07/01/2024
 ### 🎉 New Features
 - File Area: Add support for .lrc and .pptx files
wordless/wl_nlp/wl_word_tokenization.py: 36 additions & 61 deletions
@@ -16,6 +16,8 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 # ----------------------------------------------------------------------
 
+import re
+
 import khmernltk
 import laonlp
 import pythainlp
@@ -114,29 +116,29 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'):
                 tokens_multilevel[-1].append(main.pkuseg_word_tokenizer.cut(sentence))
         elif word_tokenizer == 'wordless_zho_char':
             for sentence in sentences:
+                char_scripts = ''
                 tokens = []
-                token_temp = ''
 
                 for char in sentence:
                     if wl_checks_tokens.is_han(char):
-                        if token_temp:
-                            tokens.extend(wl_word_tokenize_flat(
-                                main, token_temp,
-                                lang = 'other'
-                            ))
+                        char_scripts += 'h'
+                    else:
+                        char_scripts += 'o'
 
-                            token_temp = ''
+                while sentence:
+                    len_token = len(re.search(r'^h+|^o+', char_scripts).group())
+                    token = sentence[:len_token]
 
-                        tokens.append(char)
-                    # Other languages
-                    elif not wl_checks_tokens.is_han(char):
-                        token_temp += char
+                    if char_scripts.startswith('h'):
+                        tokens.extend(token)
+                    else:
+                        tokens.extend(wl_word_tokenize_flat(
+                            main, token,
+                            lang = 'other'
+                        ))
 
-                if token_temp:
-                    tokens.extend(wl_word_tokenize_flat(
-                        main, token_temp,
-                        lang = 'other'
-                    ))
+                    char_scripts = char_scripts[len_token:]
+                    sentence = sentence[len_token:]
 
                 tokens_multilevel[-1].append(tokens)
         # Japanese
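
The new Chinese character tokenizer above replaces the per-character buffering in token_temp with a two-pass approach: first tag every character by script, then slice the sentence into runs of identical tags. Here is a minimal, self-contained sketch of that run-splitting idea; the is_han_char() range check is only a rough stand-in for Wordless's wl_checks_tokens.is_han(), and the sample sentence is made up for illustration.

```python
import re

def is_han_char(char):
    # Rough stand-in for wl_checks_tokens.is_han(); covers only the basic
    # CJK Unified Ideographs block, which is enough for this demo.
    return '\u4e00' <= char <= '\u9fff'

def split_script_runs(sentence):
    # Pass 1: tag every character ('h' for Han, 'o' for everything else).
    char_scripts = ''.join('h' if is_han_char(char) else 'o' for char in sentence)
    runs = []

    # Pass 2: repeatedly peel off the leading run of identical tags together
    # with the matching slice of the sentence.
    while sentence:
        len_token = len(re.search(r'^h+|^o+', char_scripts).group())

        runs.append((char_scripts[0], sentence[:len_token]))

        char_scripts = char_scripts[len_token:]
        sentence = sentence[len_token:]

    return runs

print(split_script_runs('汉字abc汉字'))
# [('h', '汉字'), ('o', 'abc'), ('h', '汉字')]
```

In the diff itself, Han runs are kept as single-character tokens via tokens.extend(token), while non-Han runs are handed to wl_word_tokenize_flat() with lang = 'other'.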
@@ -160,65 +162,38 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'):
                 sentences = wl_sentence_tokenization.wl_sentence_tokenize(main, line, lang = lang)
 
                 for sentence in sentences:
+                    char_scripts = ''
                     tokens = []
-                    token_temp = ''
-                    char_last_script = 'kanji'
 
                     for char in sentence:
                         if wl_checks_tokens.is_han(char):
-                            if token_temp:
-                                if char_last_script == 'kana':
-                                    tokens.extend(wl_word_tokenize_flat(
-                                        main, token_temp,
-                                        lang = 'jpn'
-                                    ))
-                                elif char_last_script == 'other':
-                                    tokens.extend(wl_word_tokenize_flat(
-                                        main, token_temp,
-                                        lang = 'other'
-                                    ))
-
-                                token_temp = ''
-
-                            tokens.append(char)
-                            char_last_script = 'kanji'
-                        # Kana
+                            char_scripts += 'h'
                         elif wl_checks_tokens.is_kana(char):
-                            if token_temp and char_last_script == 'other':
-                                tokens.extend(wl_word_tokenize_flat(
-                                    main, token_temp,
-                                    lang = 'other'
-                                ))
-
-                                token_temp = ''
-
-                            token_temp += char
-                            char_last_script = 'kana'
-                        # Other languages
+                            char_scripts += 'k'
                         else:
-                            if token_temp and char_last_script == 'kana':
-                                tokens.extend(wl_word_tokenize_flat(
-                                    main, token_temp,
-                                    lang = 'jpn'
-                                ))
-
-                                token_temp = ''
+                            char_scripts += 'o'
 
-                            token_temp += char
-                            char_last_script = 'other'
+                    while sentence:
+                        len_token = len(re.search(r'^h+|^k+|^o+', char_scripts).group())
+                        token = sentence[:len_token]
 
-                    if token_temp:
-                        if char_last_script == 'kana':
+                        if char_scripts.startswith('h'):
+                            tokens.extend(token)
+                        elif char_scripts.startswith('k'):
                             tokens.extend(wl_word_tokenize_flat(
-                                main, token_temp,
-                                lang = 'jpn'
+                                main, token,
+                                lang = 'jpn',
+                                word_tokenizer = 'spacy_jpn'
                             ))
-                        elif char_last_script == 'other':
+                        else:
                             tokens.extend(wl_word_tokenize_flat(
-                                main, token_temp,
+                                main, token,
                                 lang = 'other'
                             ))
 
+                        char_scripts = char_scripts[len_token:]
+                        sentence = sentence[len_token:]
+
                     tokens_multilevel[-1].append(tokens)
         # Khmer
         elif word_tokenizer == 'khmer_nltk_khm':
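
The Japanese kanji tokenizer applies the same run-splitting idea with a three-way tag (kanji, kana, other) and dispatches each run differently: kanji runs are split into single characters, kana runs go to a Japanese word tokenizer, and everything else goes to the generic tokenizer. The sketch below mirrors that dispatch under stated assumptions: the script() range checks only approximate wl_checks_tokens.is_han() and is_kana(), and tokenize_kana / tokenize_other are hypothetical stand-ins for the wl_word_tokenize_flat() calls in the real code.

```python
import re

def script(char):
    # Rough approximations of wl_checks_tokens.is_han() / is_kana(),
    # limited to the basic Unicode blocks for the purpose of this sketch.
    if '\u4e00' <= char <= '\u9fff':
        return 'h'  # kanji
    if '\u3040' <= char <= '\u30ff':
        return 'k'  # hiragana or katakana
    return 'o'      # other scripts

def tokenize_jpn_sketch(sentence, tokenize_kana, tokenize_other):
    char_scripts = ''.join(script(char) for char in sentence)
    tokens = []

    while sentence:
        # Length of the leading run of same-script characters
        len_token = len(re.search(r'^h+|^k+|^o+', char_scripts).group())
        token = sentence[:len_token]

        if char_scripts.startswith('h'):
            # Kanji runs are split into single characters
            tokens.extend(token)
        elif char_scripts.startswith('k'):
            tokens.extend(tokenize_kana(token))
        else:
            tokens.extend(tokenize_other(token))

        char_scripts = char_scripts[len_token:]
        sentence = sentence[len_token:]

    return tokens

# Trivial stand-in tokenizers, just for demonstration
print(tokenize_jpn_sketch(
    '漢字とカタカナ and English',
    tokenize_kana = lambda s: [s],
    tokenize_other = str.split
))
# ['漢', '字', 'とカタカナ', 'and', 'English']
```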
