diff --git a/ACKS.md b/ACKS.md
index 90265afcf..52aafd03d 100644
--- a/ACKS.md
+++ b/ACKS.md
@@ -30,7 +30,7 @@ As Wordless stands on the shoulders of giants, I hereby extend my sincere gratit
 6|[Lingua](https://github.com/pemistahl/lingua-py)|2.0.2|Peter M. Stahl|[Apache-2.0](https://github.com/pemistahl/lingua-py/blob/main/LICENSE.txt)
 7|[Matplotlib](https://matplotlib.org/)|3.9.0|Matplotlib Development Team|[Matplotlib](https://matplotlib.org/stable/users/project/license.html)
 8|[NetworkX](https://networkx.org/)|3.3|NetworkX Developers, Aric Hagberg, Dan Schult,&#13;Pieter Swart|[BSD-3-Clause](https://github.com/networkx/networkx/blob/main/LICENSE.txt)
-9|[NLTK](https://www.nltk.org/)|3.8.1|Steven Bird, Edward Loper, Ewan Klein|[Apache-2.0](https://github.com/nltk/nltk/blob/develop/LICENSE.txt)
+9|[NLTK](https://www.nltk.org/)|3.9.1|Steven Bird, Edward Loper, Ewan Klein|[Apache-2.0](https://github.com/nltk/nltk/blob/develop/LICENSE.txt)
 10|[NumPy](https://www.numpy.org/)|1.26.4|NumPy Developers|[BSD-3-Clause](https://github.com/numpy/numpy/blob/main/LICENSE.txt)
 11|[opencc-python](https://github.com/yichen0831/opencc-python)|0.1.7|Carbo Kuo (郭家宝), Yicheng Huang|[Apache-2.0](https://github.com/yichen0831/opencc-python/blob/master/LICENSE.txt)
 12|[openpyxl](https://foss.heptapod.net/openpyxl/openpyxl)|3.1.5|Eric Gazoni, Charlie Clark|[MIT](https://foss.heptapod.net/openpyxl/openpyxl/-/blob/branch/3.1/LICENCE.rst)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 56390865d..bc0be016c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -38,8 +38,10 @@
 
 ### ❌ Removals
 - Measures: Remove effect size - Log-frequency biased MD / Mutual Dependency
+- Utils: Remove NLTK's Malayalam Punkt sentence tokenizer
 
 ### ⏫ Dependency Changes
+- Dependencies: Upgrade NLTK to 3.9.1
 - Dependencies: Upgrade Pyphen to 0.17.0
 - Dependencies: Upgrade python-mecab-ko to 1.3.7
 - Dependencies: Upgrade python-pptx to 1.0.2
diff --git a/doc/trs/zho_cn/ACKS.md b/doc/trs/zho_cn/ACKS.md
index 75f08e535..45491f251 100644
--- a/doc/trs/zho_cn/ACKS.md
+++ b/doc/trs/zho_cn/ACKS.md
@@ -30,7 +30,7 @@
 6|[Lingua](https://github.com/pemistahl/lingua-py)|2.0.2|Peter M. Stahl|[Apache-2.0](https://github.com/pemistahl/lingua-py/blob/main/LICENSE.txt)
 7|[Matplotlib](https://matplotlib.org/)|3.9.0|Matplotlib 开发团队|[Matplotlib](https://matplotlib.org/stable/users/project/license.html)
 8|[NetworkX](https://networkx.org/)|3.3|NetworkX 开发人员, Aric Hagberg, Dan Schult,&#13;Pieter Swart|[BSD-3-Clause](https://github.com/networkx/networkx/blob/main/LICENSE.txt)
-9|[NLTK](https://www.nltk.org/)|3.8.1|Steven Bird, Edward Loper, Ewan Klein|[Apache-2.0](https://github.com/nltk/nltk/blob/develop/LICENSE.txt)
+9|[NLTK](https://www.nltk.org/)|3.9.1|Steven Bird, Edward Loper, Ewan Klein|[Apache-2.0](https://github.com/nltk/nltk/blob/develop/LICENSE.txt)
 10|[NumPy](https://www.numpy.org/)|1.26.4|NumPy 开发人员|[BSD-3-Clause](https://github.com/numpy/numpy/blob/main/LICENSE.txt)
 11|[opencc-python](https://github.com/yichen0831/opencc-python)|0.1.7|郭家宝, Yicheng Huang|[Apache-2.0](https://github.com/yichen0831/opencc-python/blob/master/LICENSE.txt)
 12|[openpyxl](https://foss.heptapod.net/openpyxl/openpyxl)|3.1.5|Eric Gazoni, Charlie Clark|[MIT](https://foss.heptapod.net/openpyxl/openpyxl/-/blob/branch/3.1/LICENCE.rst)
diff --git a/doc/trs/zho_tw/ACKS.md b/doc/trs/zho_tw/ACKS.md
index 1eb34d222..eaa7a3b62 100644
--- a/doc/trs/zho_tw/ACKS.md
+++ b/doc/trs/zho_tw/ACKS.md
@@ -30,7 +30,7 @@
 6|[Lingua](https://github.com/pemistahl/lingua-py)|2.0.2|Peter M. Stahl|[Apache-2.0](https://github.com/pemistahl/lingua-py/blob/main/LICENSE.txt)
 7|[Matplotlib](https://matplotlib.org/)|3.9.0|Matplotlib 开发团队|[Matplotlib](https://matplotlib.org/stable/users/project/license.html)
 8|[NetworkX](https://networkx.org/)|3.3|NetworkX 开发人员, Aric Hagberg, Dan Schult,&#13;Pieter Swart|[BSD-3-Clause](https://github.com/networkx/networkx/blob/main/LICENSE.txt)
-9|[NLTK](https://www.nltk.org/)|3.8.1|Steven Bird, Edward Loper, Ewan Klein|[Apache-2.0](https://github.com/nltk/nltk/blob/develop/LICENSE.txt)
+9|[NLTK](https://www.nltk.org/)|3.9.1|Steven Bird, Edward Loper, Ewan Klein|[Apache-2.0](https://github.com/nltk/nltk/blob/develop/LICENSE.txt)
 10|[NumPy](https://www.numpy.org/)|1.26.4|NumPy 开发人员|[BSD-3-Clause](https://github.com/numpy/numpy/blob/main/LICENSE.txt)
 11|[opencc-python](https://github.com/yichen0831/opencc-python)|0.1.7|郭家宝, Yicheng Huang|[Apache-2.0](https://github.com/yichen0831/opencc-python/blob/master/LICENSE.txt)
 12|[openpyxl](https://foss.heptapod.net/openpyxl/openpyxl)|3.1.5|Eric Gazoni, Charlie Clark|[MIT](https://foss.heptapod.net/openpyxl/openpyxl/-/blob/branch/3.1/LICENCE.rst)
diff --git a/requirements/requirements_tests.txt b/requirements/requirements_tests.txt
index 6c29c9fbd..827c01ff5 100644
--- a/requirements/requirements_tests.txt
+++ b/requirements/requirements_tests.txt
@@ -23,7 +23,7 @@ charset-normalizer == 3.3.2
 khmer-nltk == 1.6
 laonlp == 1.2.0
 lingua-language-detector == 2.0.2
-nltk == 3.8.1
+nltk == 3.9.1
 pyphen == 0.17.0
 pythainlp == 5.0.4
 sacremoses == 0.1.1
diff --git a/tests/tests_nlp/test_word_tokenization.py b/tests/tests_nlp/test_word_tokenization.py
index bf199bb9c..8eb43ad99 100644
--- a/tests/tests_nlp/test_word_tokenization.py
+++ b/tests/tests_nlp/test_word_tokenization.py
@@ -330,7 +330,7 @@ def test_word_tokenize(lang, word_tokenizer):
         case 'vie':
             match word_tokenizer:
                 case 'nltk_tok_tok':
-                    assert tokens == ['Tiếng', 'Việt', ',', 'cũng', 'gọi', 'là', 'tiếng', 'Việt', 'Nam[', '9', ']', 'hay', 'Việt', 'ngữ', 'là', 'ngôn', 'ngữ', 'của', 'người', 'Việt', 'và', 'là', 'ngôn', 'ngữ', 'chính', 'thức', 'tại', 'Việt', 'Nam', '.']
+                    assert tokens == ['Tiếng', 'Việt', ',', 'cũng', 'gọi', 'là', 'tiếng', 'Việt', 'Nam', '[', '9', ']', 'hay', 'Việt', 'ngữ', 'là', 'ngôn', 'ngữ', 'của', 'người', 'Việt', 'và', 'là', 'ngôn', 'ngữ', 'chính', 'thức', 'tại', 'Việt', 'Nam', '.']
                 case 'underthesea_vie':
                     assert tokens == ['Tiếng', 'Việt', ',', 'cũng', 'gọi là', 'tiếng', 'Việt Nam', '[', '9', ']', 'hay', 'Việt ngữ', 'là', 'ngôn ngữ', 'của', 'người', 'Việt', 'và', 'là', 'ngôn ngữ', 'chính thức', 'tại', 'Việt Nam', '.']
                 case _:
diff --git a/tests/tests_settings/test_settings_global.py b/tests/tests_settings/test_settings_global.py
index 456b35167..4fc803d80 100644
--- a/tests/tests_settings/test_settings_global.py
+++ b/tests/tests_settings/test_settings_global.py
@@ -20,6 +20,7 @@
 import pkgutil
 import re
 
+import nltk
 import requests
 import sacremoses
 import simplemma
@@ -142,15 +143,65 @@ def check_settings_global(self):
             stop_word_lists.append('custom')
 
         # NLTK
+        langs_nltk_sentence_tokenizers_supported = []
+        langs_nltk_punkt_sentence_tokenizers = []
         langs_nltk_word_tokenizers = []
 
+        for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab/')):
+            match lang:
+                case 'english':
+                    langs_nltk_sentence_tokenizers_supported.extend(['en_gb', 'en_us'])
+                case 'german':
+                    langs_nltk_sentence_tokenizers_supported.extend(['de_at', 'de_de', 'de_ch'])
+                case 'greek':
+                    langs_nltk_sentence_tokenizers_supported.append('el')
+                case 'norwegian':
+                    langs_nltk_sentence_tokenizers_supported.append('nb')
+                case 'portuguese':
+                    langs_nltk_sentence_tokenizers_supported.extend(['pt_br', 'pt_pt'])
+                case 'README':
+                    pass
+                case _:
+                    langs_nltk_sentence_tokenizers_supported.append(wl_conversion.to_lang_code(
+                        main,
+                        lang.capitalize(),
+                        iso_639_3 = False
+                    ))
+
+        for lang_code, sentence_tokenizers in settings_sentence_tokenizers.items():
+            if (
+                lang_code != 'other'
+                and any((
+                    sentence_tokenizer.startswith('nltk_punkt_')
+                    for sentence_tokenizer in sentence_tokenizers
+                ))
+            ):
+                langs_nltk_punkt_sentence_tokenizers.append(lang_code)
+
+        self.check_missing_extra_langs(
+            langs_nltk_sentence_tokenizers_supported,
+            langs_nltk_punkt_sentence_tokenizers,
+            "NLTK's Punkt sentence tokenizer"
+        )
+
         for lang_code, word_tokenizers in settings_word_tokenizers.items():
-            if lang_code != 'other' and any(('nltk_nltk' in word_tokenizer for word_tokenizer in word_tokenizers)):
+            if (
+                lang_code != 'other'
+                and any((
+                    'nltk_nltk' in word_tokenizer
+                    for word_tokenizer in word_tokenizers
+                ))
+            ):
                 langs_nltk_word_tokenizers.append(lang_code)
 
         for lang_code in settings_word_tokenizers:
             if lang_code != 'other':
-                if lang_code not in ['amh', 'mya', 'lzh', 'zho_cn', 'zho_tw', 'jpn', 'khm', 'lao', 'tha', 'bod', 'vie']:
+                # Exclude languages without spaces between words
+                if lang_code not in [
+                    'amh', 'mya', 'lzh', 'zho_cn', 'zho_tw',
+                    'jpn', 'khm', 'lao', 'tha', 'bod',
+                    'vie'
+                ]:
                     if lang_code not in langs_nltk_word_tokenizers:
                         print(f'''Missing language code "{lang_code}" found for NLTK's tokenizers!''')
 
@@ -174,10 +225,20 @@ def check_settings_global(self):
         langs_sacremoses_supported = add_lang_suffixes(langs_sacremoses_supported)
 
         for lang_code, word_tokenizers in settings_word_tokenizers.items():
-            if lang_code != 'other' and any(('sacremoses' in word_tokenizer for word_tokenizer in word_tokenizers)):
+            if (
+                lang_code != 'other'
+                and any((
+                    'sacremoses' in word_tokenizer
+                    for word_tokenizer in word_tokenizers
+                ))
+            ):
                 langs_sacremoses_moses_tokenizer.append(lang_code)
 
-        self.check_missing_extra_langs(langs_sacremoses_supported, langs_sacremoses_moses_tokenizer, "Sacremoses's Moses tokenizer")
+        self.check_missing_extra_langs(
+            langs_sacremoses_supported,
+            langs_sacremoses_moses_tokenizer,
+            "Sacremoses's Moses tokenizer"
+        )
 
         # simplemma
         langs_simplemma_supported = []
@@ -193,10 +254,17 @@ def check_settings_global(self):
         langs_simplemma_supported = add_lang_suffixes(langs_simplemma_supported)
 
         for lang_code, lemmatizers in settings_lemmatizers.items():
-            if any((lemmatizer.startswith('simplemma_') for lemmatizer in lemmatizers)):
+            if any((
+                lemmatizer.startswith('simplemma_')
+                for lemmatizer in lemmatizers
+            )):
                 langs_simplemma_lemmatizers.append(lang_code)
 
-        self.check_missing_extra_langs(langs_simplemma_supported, langs_simplemma_lemmatizers, "simplemma's lemmatizers")
+        self.check_missing_extra_langs(
+            langs_simplemma_supported,
+            langs_simplemma_lemmatizers,
+            "simplemma's lemmatizers"
+        )
 
         # spaCy
         langs_spacy_supported = []
@@ -228,7 +296,10 @@ def check_settings_global(self):
         for lang_code, sentence_tokenizers in settings_sentence_tokenizers.items():
             if (
                 lang_code not in ['khm', 'tha', 'bod', 'vie']
-                and not any((sentence_tokenizer.startswith('spacy_') for sentence_tokenizer in sentence_tokenizers))
+                and not any((
+                    sentence_tokenizer.startswith('spacy_')
+                    for sentence_tokenizer in sentence_tokenizers
+                ))
             ):
                 lang_code_639_1 = wl_conversion.to_iso_639_1(main, lang_code)
 
@@ -237,13 +308,25 @@ def check_settings_global(self):
                 self.lang_utils_missing = True
 
         for lang_code, word_tokenizers in settings_word_tokenizers.items():
-            if lang_code != 'other' and any(('spacy' in word_tokenizer for word_tokenizer in word_tokenizers)):
+            if (
+                lang_code != 'other'
+                and any((
+                    'spacy' in word_tokenizer
+                    for word_tokenizer in word_tokenizers
+                ))
+            ):
                 langs_spacy_word_tokenizers.append(lang_code)
 
         self.check_missing_extra_langs(langs_spacy_supported, langs_spacy_word_tokenizers, "spaCy's word tokenizers")
 
         for lang_code, lemmatizers in settings_lemmatizers.items():
-            if lang_code != 'other' and any(('spacy' in lemmatizer for lemmatizer in lemmatizers)):
+            if (
+                lang_code != 'other'
+                and any((
+                    'spacy' in lemmatizer
+                    for lemmatizer in lemmatizers
+                ))
+            ):
                 langs_spacy_lemmatizers.append(lang_code)
 
         self.check_missing_extra_langs(langs_spacy_supported_lemmatizers, langs_spacy_lemmatizers, "spaCy's lemmatizers")
@@ -271,7 +354,10 @@ def check_settings_global(self):
 
                 break
 
-        r = requests.get(f'https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_{ver_stanza}.json', timeout = 10)
+        r = requests.get(
+            f'https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_{ver_stanza}.json',
+            timeout = 10
+        )
 
         for lang, lang_resources in r.json().items():
             if lang != 'multilingual' and 'default_processors' in lang_resources:
@@ -331,7 +417,13 @@ def check_settings_global(self):
             (settings_sentiment_analyzers, langs_stanza_sentiment_analyzers, langs_stanza_supported_sentiment_analyzers, 'sentiment analyzer')
         ]:
             for lang_code, lang_utils in settings_lang_utils.items():
-                if lang_code != 'other' and any(('stanza' in lang_util for lang_util in lang_utils)):
+                if (
+                    lang_code != 'other'
+                    and any((
+                        'stanza' in lang_util
+                        for lang_util in lang_utils
+                    ))
+                ):
                     langs.append(lang_code)
 
             self.check_missing_extra_langs(langs_supported, langs, f"Stanza's {msg_lang_util}")
diff --git a/tests/tests_utils/test_conversion.py b/tests/tests_utils/test_conversion.py
index 27766a814..5801f1057 100644
--- a/tests/tests_utils/test_conversion.py
+++ b/tests/tests_utils/test_conversion.py
@@ -40,15 +40,17 @@ def test_normalize_lang_code():
     for lang_code in settings_langs.values():
         assert wl_conversion.normalize_lang_code(lang_code[0].replace('_', '-').upper()) == lang_code[0]
 
-
 def test_to_lang_code():
     for lang_text, lang_code in settings_langs.items():
         assert wl_conversion.to_lang_code(main, lang_text) == lang_code[0]
+        assert wl_conversion.to_lang_code(main, lang_text, iso_639_3 = False) == lang_code[1]
 
 def test_to_lang_codes():
-    lang_codes = wl_conversion.to_lang_codes(main, settings_langs.keys())
+    lang_codes_639_3 = wl_conversion.to_lang_codes(main, settings_langs.keys())
+    lang_codes_639_1 = wl_conversion.to_lang_codes(main, settings_langs.keys(), iso_639_3 = False)
 
-    assert list(lang_codes) == [lang_vals[0] for lang_vals in settings_langs.values()]
+    assert list(lang_codes_639_3) == [lang_vals[0] for lang_vals in settings_langs.values()]
+    assert list(lang_codes_639_1) == [lang_vals[1] for lang_vals in settings_langs.values()]
 
 def test_to_lang_text():
     for lang_code in TO_LANG_TEXT.keys():
diff --git a/utils/wl_downloader_ci.py b/utils/wl_downloader_ci.py
index 536c63233..dde59372a 100644
--- a/utils/wl_downloader_ci.py
+++ b/utils/wl_downloader_ci.py
@@ -39,18 +39,12 @@ def run_cli(commands):
     subprocess.run(['python', '-m'] + commands, check = True)
 
 # Download NLTK data
-# Corpora
-nltk.download('omw-1.4')
+nltk.download('averaged_perceptron_tagger_eng')
+nltk.download('averaged_perceptron_tagger_rus')
+nltk.download('perluniprops')
+nltk.download('punkt_tab')
 nltk.download('stopwords')
 nltk.download('wordnet')
-nltk.download('words')
-# Taggers
-nltk.download('averaged_perceptron_tagger')
-nltk.download('averaged_perceptron_tagger_ru')
-# Tokenizers
-nltk.download('punkt')
-# Misc
-nltk.download('perluniprops')
 
 # Download models
 spacy.cli.download('en_core_web_trf')
diff --git a/wordless/wl_settings/wl_settings_default.py b/wordless/wl_settings/wl_settings_default.py
index 046d36cf7..1592bf93c 100644
--- a/wordless/wl_settings/wl_settings_default.py
+++ b/wordless/wl_settings/wl_settings_default.py
@@ -1350,7 +1350,7 @@ def init_settings_default(main):
                 'lij': 'stanza_lij',
                 'lit': 'spacy_dependency_parser_lit',
                 'mkd': 'spacy_dependency_parser_mkd',
-                'mal': 'nltk_punkt_mal',
+                'mal': 'spacy_sentencizer',
                 'mlt': 'stanza_mlt',
                 'glv': 'stanza_glv',
                 'mar': 'stanza_mar',
diff --git a/wordless/wl_settings/wl_settings_global.py b/wordless/wl_settings/wl_settings_global.py
index 55be2f4f5..904104474 100644
--- a/wordless/wl_settings/wl_settings_global.py
+++ b/wordless/wl_settings/wl_settings_global.py
@@ -398,7 +398,6 @@
             _tr('wl_settings_global', 'NLTK - German Punkt sentence tokenizer'): 'nltk_punkt_deu',
             _tr('wl_settings_global', 'NLTK - Greek Punkt sentence tokenizer'): 'nltk_punkt_ell',
             _tr('wl_settings_global', 'NLTK - Italian Punkt sentence tokenizer'): 'nltk_punkt_ita',
-            _tr('wl_settings_global', 'NLTK - Malayalam Punkt sentence tokenizer'): 'nltk_punkt_mal',
             _tr('wl_settings_global', 'NLTK - Norwegian (Bokmål) Punkt sentence tokenizer'): 'nltk_punkt_nob',
             _tr('wl_settings_global', 'NLTK - Polish Punkt sentence tokenizer'): 'nltk_punkt_pol',
             _tr('wl_settings_global', 'NLTK - Portuguese Punkt sentence tokenizer'): 'nltk_punkt_por',
@@ -1639,10 +1638,7 @@
                 'spacy_sentencizer'
             ],
 
-            'mal': [
-                'nltk_punkt_mal',
-                'spacy_sentencizer'
-            ],
+            'mal': ['spacy_sentencizer'],
 
             'mlt': [
                 'spacy_sentencizer',
diff --git a/wordless/wl_utils/wl_conversion.py b/wordless/wl_utils/wl_conversion.py
index 10712587a..ef301b04d 100644
--- a/wordless/wl_utils/wl_conversion.py
+++ b/wordless/wl_utils/wl_conversion.py
@@ -26,11 +26,23 @@ def normalize_lang_code(lang_code):
     return lang_code.replace('-', '_').lower()
 
-def to_lang_code(main, lang_text):
-    return main.settings_global['langs'][lang_text][0]
-
-def to_lang_codes(main, lang_texts):
-    return (main.settings_global['langs'][lang_text][0] for lang_text in lang_texts)
+def to_lang_code(main, lang_text, iso_639_3 = True):
+    if iso_639_3:
+        return main.settings_global['langs'][lang_text][0]
+    else:
+        return main.settings_global['langs'][lang_text][1]
+
+def to_lang_codes(main, lang_texts, iso_639_3 = True):
+    if iso_639_3:
+        return (
+            main.settings_global['langs'][lang_text][0]
+            for lang_text in lang_texts
+        )
+    else:
+        return (
+            main.settings_global['langs'][lang_text][1]
+            for lang_text in lang_texts
+        )
 
 def to_lang_text(main, lang_code):
     lang_code = normalize_lang_code(lang_code)