diff --git a/ACKS.md b/ACKS.md
index 90265afcf..52aafd03d 100644
--- a/ACKS.md
+++ b/ACKS.md
@@ -30,7 +30,7 @@ As Wordless stands on the shoulders of giants, I hereby extend my sincere gratit
6|[Lingua](https://github.com/pemistahl/lingua-py)|2.0.2|Peter M. Stahl|[Apache-2.0](https://github.com/pemistahl/lingua-py/blob/main/LICENSE.txt)
7|[Matplotlib](https://matplotlib.org/)|3.9.0|Matplotlib Development Team|[Matplotlib](https://matplotlib.org/stable/users/project/license.html)
8|[NetworkX](https://networkx.org/)|3.3|NetworkX Developers, Aric Hagberg, Dan Schult, Pieter Swart|[BSD-3-Clause](https://github.com/networkx/networkx/blob/main/LICENSE.txt)
-9|[NLTK](https://www.nltk.org/)|3.8.1|Steven Bird, Edward Loper, Ewan Klein|[Apache-2.0](https://github.com/nltk/nltk/blob/develop/LICENSE.txt)
+9|[NLTK](https://www.nltk.org/)|3.9.1|Steven Bird, Edward Loper, Ewan Klein|[Apache-2.0](https://github.com/nltk/nltk/blob/develop/LICENSE.txt)
10|[NumPy](https://www.numpy.org/)|1.26.4|NumPy Developers|[BSD-3-Clause](https://github.com/numpy/numpy/blob/main/LICENSE.txt)
11|[opencc-python](https://github.com/yichen0831/opencc-python)|0.1.7|Carbo Kuo (郭家宝), Yicheng Huang|[Apache-2.0](https://github.com/yichen0831/opencc-python/blob/master/LICENSE.txt)
12|[openpyxl](https://foss.heptapod.net/openpyxl/openpyxl)|3.1.5|Eric Gazoni, Charlie Clark|[MIT](https://foss.heptapod.net/openpyxl/openpyxl/-/blob/branch/3.1/LICENCE.rst)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 56390865d..bc0be016c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -38,8 +38,10 @@
### ❌ Removals
- Measures: Remove effect size - Log-frequency biased MD / Mutual Dependency
+- Utils: Remove NLTK's Malayalam Punkt sentence tokenizer
### ⏫ Dependency Changes
+- Dependencies: Upgrade NLTK to 3.9.1
- Dependencies: Upgrade Pyphen to 0.17.0
- Dependencies: Upgrade python-mecab-ko to 1.3.7
- Dependencies: Upgrade python-pptx to 1.0.2
diff --git a/doc/trs/zho_cn/ACKS.md b/doc/trs/zho_cn/ACKS.md
index 75f08e535..45491f251 100644
--- a/doc/trs/zho_cn/ACKS.md
+++ b/doc/trs/zho_cn/ACKS.md
@@ -30,7 +30,7 @@
6|[Lingua](https://github.com/pemistahl/lingua-py)|2.0.2|Peter M. Stahl|[Apache-2.0](https://github.com/pemistahl/lingua-py/blob/main/LICENSE.txt)
7|[Matplotlib](https://matplotlib.org/)|3.9.0|Matplotlib 开发团队|[Matplotlib](https://matplotlib.org/stable/users/project/license.html)
8|[NetworkX](https://networkx.org/)|3.3|NetworkX 开发人员, Aric Hagberg, Dan Schult, Pieter Swart|[BSD-3-Clause](https://github.com/networkx/networkx/blob/main/LICENSE.txt)
-9|[NLTK](https://www.nltk.org/)|3.8.1|Steven Bird, Edward Loper, Ewan Klein|[Apache-2.0](https://github.com/nltk/nltk/blob/develop/LICENSE.txt)
+9|[NLTK](https://www.nltk.org/)|3.9.1|Steven Bird, Edward Loper, Ewan Klein|[Apache-2.0](https://github.com/nltk/nltk/blob/develop/LICENSE.txt)
10|[NumPy](https://www.numpy.org/)|1.26.4|NumPy 开发人员|[BSD-3-Clause](https://github.com/numpy/numpy/blob/main/LICENSE.txt)
11|[opencc-python](https://github.com/yichen0831/opencc-python)|0.1.7|郭家宝, Yicheng Huang|[Apache-2.0](https://github.com/yichen0831/opencc-python/blob/master/LICENSE.txt)
12|[openpyxl](https://foss.heptapod.net/openpyxl/openpyxl)|3.1.5|Eric Gazoni, Charlie Clark|[MIT](https://foss.heptapod.net/openpyxl/openpyxl/-/blob/branch/3.1/LICENCE.rst)
diff --git a/doc/trs/zho_tw/ACKS.md b/doc/trs/zho_tw/ACKS.md
index 1eb34d222..eaa7a3b62 100644
--- a/doc/trs/zho_tw/ACKS.md
+++ b/doc/trs/zho_tw/ACKS.md
@@ -30,7 +30,7 @@
6|[Lingua](https://github.com/pemistahl/lingua-py)|2.0.2|Peter M. Stahl|[Apache-2.0](https://github.com/pemistahl/lingua-py/blob/main/LICENSE.txt)
7|[Matplotlib](https://matplotlib.org/)|3.9.0|Matplotlib 開發團隊|[Matplotlib](https://matplotlib.org/stable/users/project/license.html)
8|[NetworkX](https://networkx.org/)|3.3|NetworkX 開發人員, Aric Hagberg, Dan Schult, Pieter Swart|[BSD-3-Clause](https://github.com/networkx/networkx/blob/main/LICENSE.txt)
-9|[NLTK](https://www.nltk.org/)|3.8.1|Steven Bird, Edward Loper, Ewan Klein|[Apache-2.0](https://github.com/nltk/nltk/blob/develop/LICENSE.txt)
+9|[NLTK](https://www.nltk.org/)|3.9.1|Steven Bird, Edward Loper, Ewan Klein|[Apache-2.0](https://github.com/nltk/nltk/blob/develop/LICENSE.txt)
10|[NumPy](https://www.numpy.org/)|1.26.4|NumPy 開發人員|[BSD-3-Clause](https://github.com/numpy/numpy/blob/main/LICENSE.txt)
11|[opencc-python](https://github.com/yichen0831/opencc-python)|0.1.7|郭家宝, Yicheng Huang|[Apache-2.0](https://github.com/yichen0831/opencc-python/blob/master/LICENSE.txt)
12|[openpyxl](https://foss.heptapod.net/openpyxl/openpyxl)|3.1.5|Eric Gazoni, Charlie Clark|[MIT](https://foss.heptapod.net/openpyxl/openpyxl/-/blob/branch/3.1/LICENCE.rst)
diff --git a/requirements/requirements_tests.txt b/requirements/requirements_tests.txt
index 6c29c9fbd..827c01ff5 100644
--- a/requirements/requirements_tests.txt
+++ b/requirements/requirements_tests.txt
@@ -23,7 +23,7 @@ charset-normalizer == 3.3.2
khmer-nltk == 1.6
laonlp == 1.2.0
lingua-language-detector == 2.0.2
-nltk == 3.8.1
+nltk == 3.9.1
pyphen == 0.17.0
pythainlp == 5.0.4
sacremoses == 0.1.1
diff --git a/tests/tests_nlp/test_word_tokenization.py b/tests/tests_nlp/test_word_tokenization.py
index bf199bb9c..8eb43ad99 100644
--- a/tests/tests_nlp/test_word_tokenization.py
+++ b/tests/tests_nlp/test_word_tokenization.py
@@ -330,7 +330,7 @@ def test_word_tokenize(lang, word_tokenizer):
case 'vie':
match word_tokenizer:
case 'nltk_tok_tok':
- assert tokens == ['Tiếng', 'Việt', ',', 'cũng', 'gọi', 'là', 'tiếng', 'Việt', 'Nam[', '9', ']', 'hay', 'Việt', 'ngữ', 'là', 'ngôn', 'ngữ', 'của', 'người', 'Việt', 'và', 'là', 'ngôn', 'ngữ', 'chính', 'thức', 'tại', 'Việt', 'Nam', '.']
+ assert tokens == ['Tiếng', 'Việt', ',', 'cũng', 'gọi', 'là', 'tiếng', 'Việt', 'Nam', '[', '9', ']', 'hay', 'Việt', 'ngữ', 'là', 'ngôn', 'ngữ', 'của', 'người', 'Việt', 'và', 'là', 'ngôn', 'ngữ', 'chính', 'thức', 'tại', 'Việt', 'Nam', '.']
case 'underthesea_vie':
assert tokens == ['Tiếng', 'Việt', ',', 'cũng', 'gọi là', 'tiếng', 'Việt Nam', '[', '9', ']', 'hay', 'Việt ngữ', 'là', 'ngôn ngữ', 'của', 'người', 'Việt', 'và', 'là', 'ngôn ngữ', 'chính thức', 'tại', 'Việt Nam', '.']
case _:
diff --git a/tests/tests_settings/test_settings_global.py b/tests/tests_settings/test_settings_global.py
index 456b35167..4fc803d80 100644
--- a/tests/tests_settings/test_settings_global.py
+++ b/tests/tests_settings/test_settings_global.py
@@ -20,6 +20,7 @@
import pkgutil
import re
+import nltk
import requests
import sacremoses
import simplemma
@@ -142,15 +143,65 @@ def check_settings_global(self):
stop_word_lists.append('custom')
# NLTK
+ langs_nltk_sentence_tokenizers_supported = []
+ langs_nltk_punkt_sentence_tokenizers = []
langs_nltk_word_tokenizers = []
+ for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab/')):
+ match lang:
+ case 'english':
+ langs_nltk_sentence_tokenizers_supported.extend(['en_gb', 'en_us'])
+ case 'german':
+ langs_nltk_sentence_tokenizers_supported.extend(['de_at', 'de_de', 'de_ch'])
+ case 'greek':
+ langs_nltk_sentence_tokenizers_supported.append('el')
+ case 'norwegian':
+ langs_nltk_sentence_tokenizers_supported.append('nb')
+ case 'portuguese':
+ langs_nltk_sentence_tokenizers_supported.extend(['pt_br', 'pt_pt'])
+ case 'README':
+ pass
+ case _:
+ langs_nltk_sentence_tokenizers_supported.append(wl_conversion.to_lang_code(
+ main,
+ lang.capitalize(),
+ iso_639_3 = False
+ ))
+
+ for lang_code, sentence_tokenizers in settings_sentence_tokenizers.items():
+ if (
+ lang_code != 'other'
+ and any((
+ sentence_tokenizer.startswith('nltk_punkt_')
+ for sentence_tokenizer in sentence_tokenizers
+ ))
+ ):
+ langs_nltk_punkt_sentence_tokenizers.append(lang_code)
+
+ self.check_missing_extra_langs(
+ langs_nltk_sentence_tokenizers_supported,
+ langs_nltk_punkt_sentence_tokenizers,
+ "NLTK's Punkt sentence tokenizer"
+ )
+
for lang_code, word_tokenizers in settings_word_tokenizers.items():
- if lang_code != 'other' and any(('nltk_nltk' in word_tokenizer for word_tokenizer in word_tokenizers)):
+ if (
+ lang_code != 'other'
+ and any((
+ 'nltk_nltk' in word_tokenizer
+ for word_tokenizer in word_tokenizers
+ ))
+ ):
langs_nltk_word_tokenizers.append(lang_code)
for lang_code in settings_word_tokenizers:
if lang_code != 'other':
- if lang_code not in ['amh', 'mya', 'lzh', 'zho_cn', 'zho_tw', 'jpn', 'khm', 'lao', 'tha', 'bod', 'vie']:
+ # Exclude languages without spaces between words
+ if lang_code not in [
+ 'amh', 'mya', 'lzh', 'zho_cn', 'zho_tw',
+ 'jpn', 'khm', 'lao', 'tha', 'bod',
+ 'vie'
+ ]:
if lang_code not in langs_nltk_word_tokenizers:
print(f'''Missing language code "{lang_code}" found for NLTK's tokenizers!''')
@@ -174,10 +225,20 @@ def check_settings_global(self):
langs_sacremoses_supported = add_lang_suffixes(langs_sacremoses_supported)
for lang_code, word_tokenizers in settings_word_tokenizers.items():
- if lang_code != 'other' and any(('sacremoses' in word_tokenizer for word_tokenizer in word_tokenizers)):
+ if (
+ lang_code != 'other'
+ and any((
+ 'sacremoses' in word_tokenizer
+ for word_tokenizer in word_tokenizers
+ ))
+ ):
langs_sacremoses_moses_tokenizer.append(lang_code)
- self.check_missing_extra_langs(langs_sacremoses_supported, langs_sacremoses_moses_tokenizer, "Sacremoses's Moses tokenizer")
+ self.check_missing_extra_langs(
+ langs_sacremoses_supported,
+ langs_sacremoses_moses_tokenizer,
+ "Sacremoses's Moses tokenizer"
+ )
# simplemma
langs_simplemma_supported = []
@@ -193,10 +254,17 @@ def check_settings_global(self):
langs_simplemma_supported = add_lang_suffixes(langs_simplemma_supported)
for lang_code, lemmatizers in settings_lemmatizers.items():
- if any((lemmatizer.startswith('simplemma_') for lemmatizer in lemmatizers)):
+ if any((
+ lemmatizer.startswith('simplemma_')
+ for lemmatizer in lemmatizers
+ )):
langs_simplemma_lemmatizers.append(lang_code)
- self.check_missing_extra_langs(langs_simplemma_supported, langs_simplemma_lemmatizers, "simplemma's lemmatizers")
+ self.check_missing_extra_langs(
+ langs_simplemma_supported,
+ langs_simplemma_lemmatizers,
+ "simplemma's lemmatizers"
+ )
# spaCy
langs_spacy_supported = []
@@ -228,7 +296,10 @@ def check_settings_global(self):
for lang_code, sentence_tokenizers in settings_sentence_tokenizers.items():
if (
lang_code not in ['khm', 'tha', 'bod', 'vie']
- and not any((sentence_tokenizer.startswith('spacy_') for sentence_tokenizer in sentence_tokenizers))
+ and not any((
+ sentence_tokenizer.startswith('spacy_')
+ for sentence_tokenizer in sentence_tokenizers
+ ))
):
lang_code_639_1 = wl_conversion.to_iso_639_1(main, lang_code)
@@ -237,13 +308,25 @@ def check_settings_global(self):
self.lang_utils_missing = True
for lang_code, word_tokenizers in settings_word_tokenizers.items():
- if lang_code != 'other' and any(('spacy' in word_tokenizer for word_tokenizer in word_tokenizers)):
+ if (
+ lang_code != 'other'
+ and any((
+ 'spacy' in word_tokenizer
+ for word_tokenizer in word_tokenizers
+ ))
+ ):
langs_spacy_word_tokenizers.append(lang_code)
self.check_missing_extra_langs(langs_spacy_supported, langs_spacy_word_tokenizers, "spaCy's word tokenizers")
for lang_code, lemmatizers in settings_lemmatizers.items():
- if lang_code != 'other' and any(('spacy' in lemmatizer for lemmatizer in lemmatizers)):
+ if (
+ lang_code != 'other'
+ and any((
+ 'spacy' in lemmatizer
+ for lemmatizer in lemmatizers
+ ))
+ ):
langs_spacy_lemmatizers.append(lang_code)
self.check_missing_extra_langs(langs_spacy_supported_lemmatizers, langs_spacy_lemmatizers, "spaCy's lemmatizers")
@@ -271,7 +354,10 @@ def check_settings_global(self):
break
- r = requests.get(f'https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_{ver_stanza}.json', timeout = 10)
+ r = requests.get(
+ f'https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_{ver_stanza}.json',
+ timeout = 10
+ )
for lang, lang_resources in r.json().items():
if lang != 'multilingual' and 'default_processors' in lang_resources:
@@ -331,7 +417,13 @@ def check_settings_global(self):
(settings_sentiment_analyzers, langs_stanza_sentiment_analyzers, langs_stanza_supported_sentiment_analyzers, 'sentiment analyzer')
]:
for lang_code, lang_utils in settings_lang_utils.items():
- if lang_code != 'other' and any(('stanza' in lang_util for lang_util in lang_utils)):
+ if (
+ lang_code != 'other'
+ and any((
+ 'stanza' in lang_util
+ for lang_util in lang_utils
+ ))
+ ):
langs.append(lang_code)
self.check_missing_extra_langs(langs_supported, langs, f"Stanza's {msg_lang_util}")
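For reference, the new check walks the per-language model directories that NLTK 3.9 installs under `tokenizers/punkt_tab/` (one directory per Punkt model, plus a `README`). A minimal standalone sketch of that lookup, assuming the `punkt_tab` data has already been downloaded:

```python
import os

import nltk

# NLTK 3.9 ships Punkt as per-language directories under tokenizers/punkt_tab/
# instead of the old punkt/*.pickle files.
nltk.download('punkt_tab')

# nltk.data.find() returns a filesystem path pointer that os.listdir() accepts.
langs = os.listdir(nltk.data.find('tokenizers/punkt_tab/'))
print(langs)  # 'english', 'german', 'greek', ... plus a 'README'; no Malayalam model in 3.9
```

Since punkt_tab does not include a Malayalam model, the Malayalam Punkt sentence tokenizer (nltk_punkt_mal) is removed along with the upgrade.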
diff --git a/tests/tests_utils/test_conversion.py b/tests/tests_utils/test_conversion.py
index 27766a814..5801f1057 100644
--- a/tests/tests_utils/test_conversion.py
+++ b/tests/tests_utils/test_conversion.py
@@ -40,15 +40,17 @@ def test_normalize_lang_code():
for lang_code in settings_langs.values():
assert wl_conversion.normalize_lang_code(lang_code[0].replace('_', '-').upper()) == lang_code[0]
-
def test_to_lang_code():
for lang_text, lang_code in settings_langs.items():
assert wl_conversion.to_lang_code(main, lang_text) == lang_code[0]
+ assert wl_conversion.to_lang_code(main, lang_text, iso_639_3 = False) == lang_code[1]
def test_to_lang_codes():
- lang_codes = wl_conversion.to_lang_codes(main, settings_langs.keys())
+ lang_codes_639_3 = wl_conversion.to_lang_codes(main, settings_langs.keys())
+ lang_codes_639_1 = wl_conversion.to_lang_codes(main, settings_langs.keys(), iso_639_3 = False)
- assert list(lang_codes) == [lang_vals[0] for lang_vals in settings_langs.values()]
+ assert list(lang_codes_639_3) == [lang_vals[0] for lang_vals in settings_langs.values()]
+ assert list(lang_codes_639_1) == [lang_vals[1] for lang_vals in settings_langs.values()]
def test_to_lang_text():
for lang_code in TO_LANG_TEXT.keys():
diff --git a/utils/wl_downloader_ci.py b/utils/wl_downloader_ci.py
index 536c63233..dde59372a 100644
--- a/utils/wl_downloader_ci.py
+++ b/utils/wl_downloader_ci.py
@@ -39,18 +39,12 @@ def run_cli(commands):
subprocess.run(['python', '-m'] + commands, check = True)
# Download NLTK data
-# Corpora
-nltk.download('omw-1.4')
+nltk.download('averaged_perceptron_tagger_eng')
+nltk.download('averaged_perceptron_tagger_rus')
+nltk.download('perluniprops')
+nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
-nltk.download('words')
-# Taggers
-nltk.download('averaged_perceptron_tagger')
-nltk.download('averaged_perceptron_tagger_ru')
-# Tokenizers
-nltk.download('punkt')
-# Misc
-nltk.download('perluniprops')
# Download models
spacy.cli.download('en_core_web_trf')
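The renamed NLTK data packages above can be smoke-tested through the standard entry points; a minimal sketch, assuming the downloads succeed:

```python
import nltk

# In NLTK 3.9, sent_tokenize()/word_tokenize() load their Punkt parameters from
# punkt_tab, and pos_tag() loads the English model from averaged_perceptron_tagger_eng.
print(nltk.sent_tokenize('NLTK has been upgraded. Punkt data now comes from punkt_tab.'))
print(nltk.pos_tag(nltk.word_tokenize('NLTK has been upgraded.')))
```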
diff --git a/wordless/wl_settings/wl_settings_default.py b/wordless/wl_settings/wl_settings_default.py
index 046d36cf7..1592bf93c 100644
--- a/wordless/wl_settings/wl_settings_default.py
+++ b/wordless/wl_settings/wl_settings_default.py
@@ -1350,7 +1350,7 @@ def init_settings_default(main):
'lij': 'stanza_lij',
'lit': 'spacy_dependency_parser_lit',
'mkd': 'spacy_dependency_parser_mkd',
- 'mal': 'nltk_punkt_mal',
+ 'mal': 'spacy_sentencizer',
'mlt': 'stanza_mlt',
'glv': 'stanza_glv',
'mar': 'stanza_mar',
diff --git a/wordless/wl_settings/wl_settings_global.py b/wordless/wl_settings/wl_settings_global.py
index 55be2f4f5..904104474 100644
--- a/wordless/wl_settings/wl_settings_global.py
+++ b/wordless/wl_settings/wl_settings_global.py
@@ -398,7 +398,6 @@ def init_settings_global():
_tr('wl_settings_global', 'NLTK - German Punkt sentence tokenizer'): 'nltk_punkt_deu',
_tr('wl_settings_global', 'NLTK - Greek Punkt sentence tokenizer'): 'nltk_punkt_ell',
_tr('wl_settings_global', 'NLTK - Italian Punkt sentence tokenizer'): 'nltk_punkt_ita',
- _tr('wl_settings_global', 'NLTK - Malayalam Punkt sentence tokenizer'): 'nltk_punkt_mal',
_tr('wl_settings_global', 'NLTK - Norwegian (Bokmål) Punkt sentence tokenizer'): 'nltk_punkt_nob',
_tr('wl_settings_global', 'NLTK - Polish Punkt sentence tokenizer'): 'nltk_punkt_pol',
_tr('wl_settings_global', 'NLTK - Portuguese Punkt sentence tokenizer'): 'nltk_punkt_por',
@@ -1639,10 +1638,7 @@ def init_settings_global():
'spacy_sentencizer'
],
- 'mal': [
- 'nltk_punkt_mal',
- 'spacy_sentencizer'
- ],
+ 'mal': ['spacy_sentencizer'],
'mlt': [
'spacy_sentencizer',
diff --git a/wordless/wl_utils/wl_conversion.py b/wordless/wl_utils/wl_conversion.py
index 10712587a..ef301b04d 100644
--- a/wordless/wl_utils/wl_conversion.py
+++ b/wordless/wl_utils/wl_conversion.py
@@ -26,11 +26,23 @@
def normalize_lang_code(lang_code):
return lang_code.replace('-', '_').lower()
-def to_lang_code(main, lang_text):
- return main.settings_global['langs'][lang_text][0]
-
-def to_lang_codes(main, lang_texts):
- return (main.settings_global['langs'][lang_text][0] for lang_text in lang_texts)
+def to_lang_code(main, lang_text, iso_639_3 = True):
+ if iso_639_3:
+ return main.settings_global['langs'][lang_text][0]
+ else:
+ return main.settings_global['langs'][lang_text][1]
+
+def to_lang_codes(main, lang_texts, iso_639_3 = True):
+ if iso_639_3:
+ return (
+ main.settings_global['langs'][lang_text][0]
+ for lang_text in lang_texts
+ )
+ else:
+ return (
+ main.settings_global['langs'][lang_text][1]
+ for lang_text in lang_texts
+ )
def to_lang_text(main, lang_code):
lang_code = normalize_lang_code(lang_code)
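For clarity, the `iso_639_3` switch added above simply selects between the two code columns stored per language in `settings_global['langs']`, as exercised by the updated conversion tests. A self-contained sketch with a purely illustrative entry:

```python
# Standalone sketch of the new behaviour; the entry below is illustrative only,
# the real mapping lives in main.settings_global['langs'].
langs = {'English (United States)': ('eng_us', 'en_us')}

def to_lang_code(lang_text, iso_639_3 = True):
    # Column 0 holds the ISO 639-3 code, column 1 the ISO 639-1 code.
    return langs[lang_text][0] if iso_639_3 else langs[lang_text][1]

assert to_lang_code('English (United States)') == 'eng_us'
assert to_lang_code('English (United States)', iso_639_3 = False) == 'en_us'
```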