From 5184838bd6fc008571c796eb1ead4138fb29ac97 Mon Sep 17 00:00:00 2001 From: BLKSerene Date: Sun, 5 Jan 2025 20:28:31 +0800 Subject: [PATCH] Work Area: Update Wordlist Generator - Show syllabified forms --- tests/test_wordlist_generator.py | 6 +- .../test_file_area_file_types.py | 3 +- tests/tests_nlp/test_matching.py | 98 ++++++++++--------- wordless/wl_colligation_extractor.py | 11 +-- wordless/wl_collocation_extractor.py | 10 +- wordless/wl_concordancer.py | 2 +- wordless/wl_concordancer_parallel.py | 3 +- wordless/wl_dependency_parser.py | 2 +- wordless/wl_figs/wl_figs.py | 5 +- wordless/wl_keyword_extractor.py | 6 +- .../wl_measures/wl_measures_readability.py | 15 +-- wordless/wl_ngram_generator.py | 8 +- wordless/wl_nlp/wl_sentence_tokenization.py | 11 ++- wordless/wl_nlp/wl_texts.py | 18 ++-- wordless/wl_nlp/wl_token_processing.py | 4 +- wordless/wl_nlp/wl_word_tokenization.py | 7 +- wordless/wl_results/wl_results_filter.py | 16 +-- wordless/wl_results/wl_results_sort.py | 22 ++--- wordless/wl_settings/wl_settings_default.py | 3 +- wordless/wl_settings/wl_settings_files.py | 13 ++- .../wl_settings_word_tokenization.py | 4 +- wordless/wl_widgets/wl_lists.py | 4 +- wordless/wl_widgets/wl_tables.py | 16 +-- wordless/wl_wordlist_generator.py | 42 ++++---- 24 files changed, 180 insertions(+), 149 deletions(-) diff --git a/tests/test_wordlist_generator.py b/tests/test_wordlist_generator.py index 6b6b38a4e..8220b4e03 100644 --- a/tests/test_wordlist_generator.py +++ b/tests/test_wordlist_generator.py @@ -61,7 +61,7 @@ def test_wordlist_generator(): update_gui = update_gui ).run() -def update_gui(err_msg, tokens_freq_files, tokens_stats_files, tokens_syllabification): +def update_gui(err_msg, tokens_freq_files, tokens_stats_files, tokens_syllabified_form): print(err_msg) assert not err_msg @@ -74,8 +74,8 @@ def update_gui(err_msg, tokens_freq_files, tokens_stats_files, tokens_syllabific # Token assert token - # Syllabification - assert tokens_syllabification[token] + # Syllabified Form + assert tokens_syllabified_form[token] # Frequency assert len(freq_files) == num_files_selected + 1 # Dispersion & Adjusted Frequency diff --git a/tests/tests_file_area/test_file_area_file_types.py b/tests/tests_file_area/test_file_area_file_types.py index 312327c7a..8c4f8acf8 100644 --- a/tests/tests_file_area/test_file_area_file_types.py +++ b/tests/tests_file_area/test_file_area_file_types.py @@ -18,7 +18,6 @@ import glob import os -import re import time from PyQt5.QtCore import QObject @@ -289,7 +288,7 @@ def update_gui_misc(err_msg, new_files): for sentence in para: for sentence_seg in sentence: for token in sentence_seg: - assert not re.search(wl_texts.RE_VIE_TOKENIZED, token) + assert not wl_texts.RE_VIE_TOKENIZED.search(token) if __name__ == '__main__': test_file_area_file_types() diff --git a/tests/tests_nlp/test_matching.py b/tests/tests_nlp/test_matching.py index b0c4b8860..d7911b2fc 100644 --- a/tests/tests_nlp/test_matching.py +++ b/tests/tests_nlp/test_matching.py @@ -73,31 +73,35 @@ def test_get_re_tags(): assert re_tags_body == r'_\S*(?=\s|$)|/\S*(?=\s|$)|_(?=\s|$)|||||' assert re_tags_xml == r'||||||' - assert re.search(re_tags_header, r'token').group() == '' - assert re.search(re_tags_header, r'token').group() == '' - assert re.search(re_tags_header, r'token').group() == '' - assert re.search(re_tags_header, r'< tei Header >token').group() == '< tei Header >' - assert re.search(re_tags_header, r'<>token').group() == '<>' - assert re.search(re_tags_header, r'< >token').group() == '< >' - - assert re.search(re_tags_body, r'token_TAG').group() == '_TAG' - assert re.search(re_tags_body, r'token_T_AG').group() == '_T_AG' - assert re.search(re_tags_body, r'token_').group() == '_' - assert re.search(re_tags_body, r'token/TAG').group() == '/TAG' - assert re.search(re_tags_body, r'token').group() == '' - assert re.search(re_tags_body, r'token').group() == '' - assert re.search(re_tags_body, r'< T AG >token').group() == '< T AG >' - assert re.search(re_tags_body, r'token').group() == '' - assert re.search(re_tags_body, r'<>token').group() == '<>' - assert re.search(re_tags_body, r'< >token').group() == '< >' - assert re.search(re_tags_body, r'< * >token').group() == '< * >' - - assert re.search(re_tags_xml, r'token

').group() == '

' - assert re.search(re_tags_xml, r'

token').group() == '

' - assert re.search(re_tags_xml, r'

token

').group() == '

' - assert re.search(re_tags_xml, r'< p p >token').group() == '< p p >' - assert re.search(re_tags_xml, r'<>token').group() == '<>' - assert re.search(re_tags_xml, r'< >token').group() == '< >' + re_tags_header = re.compile(re_tags_header) + re_tags_body = re.compile(re_tags_body) + re_tags_xml = re.compile(re_tags_xml) + + assert re_tags_header.search(r'token').group() == '' + assert re_tags_header.search(r'token').group() == '' + assert re_tags_header.search(r'token').group() == '' + assert re_tags_header.search(r'< tei Header >token').group() == '< tei Header >' + assert re_tags_header.search(r'<>token').group() == '<>' + assert re_tags_header.search(r'< >token').group() == '< >' + + assert re_tags_body.search(r'token_TAG').group() == '_TAG' + assert re_tags_body.search(r'token_T_AG').group() == '_T_AG' + assert re_tags_body.search(r'token_').group() == '_' + assert re_tags_body.search(r'token/TAG').group() == '/TAG' + assert re_tags_body.search(r'token').group() == '' + assert re_tags_body.search(r'token').group() == '' + assert re_tags_body.search(r'< T AG >token').group() == '< T AG >' + assert re_tags_body.search(r'token').group() == '' + assert re_tags_body.search(r'<>token').group() == '<>' + assert re_tags_body.search(r'< >token').group() == '< >' + assert re_tags_body.search(r'< * >token').group() == '< * >' + + assert re_tags_xml.search(r'token

').group() == '

' + assert re_tags_xml.search(r'

token').group() == '

' + assert re_tags_xml.search(r'

token

').group() == '

' + assert re_tags_xml.search(r'< p p >token').group() == '< p p >' + assert re_tags_xml.search(r'<>token').group() == '<>' + assert re_tags_xml.search(r'< >token').group() == '< >' def test_get_re_tags_with_tokens(): re_tags_header = wl_matching.get_re_tags_with_tokens(main, tag_type = 'header') @@ -108,26 +112,30 @@ def test_get_re_tags_with_tokens(): assert re_tags_body == r'\S*_\S*(?=\s|$)|\S*/\S*(?=\s|$)|\S*_(?=\s|$)|<.*?>.*?|<\ \*\ >.*|<\ T\ AG\ >.*|<>.*|<\ >.*' assert re_tags_xml == r'

.*

|.*|.*|.*|<\ p\ p\ >.*|<>.*|<\ >.*' - assert re.search(re_tags_header, r'token token token').group() == 'token' - assert re.search(re_tags_header, r'token token token').group() == 'token' - assert re.search(re_tags_header, r'token < tei Header >token token').group() == '< tei Header >token' - assert re.search(re_tags_header, r'token <>token token').group() == '<>token' - assert re.search(re_tags_header, r'token < >token token').group() == '< >token' - - assert re.search(re_tags_body, r'token token_TAG token').group() == 'token_TAG' - assert re.search(re_tags_body, r'token token/TAG token').group() == 'token/TAG' - assert re.search(re_tags_body, r'token token_T_AG token').group() == 'token_T_AG' - assert re.search(re_tags_body, r'token token_ token').group() == 'token_' - assert re.search(re_tags_body, r'token token token').group() == 'token' - assert re.search(re_tags_body, r'token < T AG >token token').group() == '< T AG >token' - assert re.search(re_tags_body, r'token <>token token').group() == '<>token' - assert re.search(re_tags_body, r'token < >token token').group() == '< >token' - assert re.search(re_tags_body, r'token < * >token token').group() == '< * >token' - - assert re.search(re_tags_xml, r'token

token

token').group() == '

token

' - assert re.search(re_tags_xml, r'token < p p >token token').group() == '< p p >token' - assert re.search(re_tags_xml, r'token <>token token').group() == '<>token' - assert re.search(re_tags_xml, r'token < >token token').group() == '< >token' + re_tags_header = re.compile(re_tags_header) + re_tags_body = re.compile(re_tags_body) + re_tags_xml = re.compile(re_tags_xml) + + assert re_tags_header.search(r'token token token').group() == 'token' + assert re_tags_header.search(r'token token token').group() == 'token' + assert re_tags_header.search(r'token < tei Header >token token').group() == '< tei Header >token' + assert re_tags_header.search(r'token <>token token').group() == '<>token' + assert re_tags_header.search(r'token < >token token').group() == '< >token' + + assert re_tags_body.search(r'token token_TAG token').group() == 'token_TAG' + assert re_tags_body.search(r'token token/TAG token').group() == 'token/TAG' + assert re_tags_body.search(r'token token_T_AG token').group() == 'token_T_AG' + assert re_tags_body.search(r'token token_ token').group() == 'token_' + assert re_tags_body.search(r'token token token').group() == 'token' + assert re_tags_body.search(r'token < T AG >token token').group() == '< T AG >token' + assert re_tags_body.search(r'token <>token token').group() == '<>token' + assert re_tags_body.search(r'token < >token token').group() == '< >token' + assert re_tags_body.search(r'token < * >token token').group() == '< * >token' + + assert re_tags_xml.search(r'token

token

token').group() == '

token

' + assert re_tags_xml.search(r'token < p p >token token').group() == '< p p >token' + assert re_tags_xml.search(r'token <>token token').group() == '<>token' + assert re_tags_xml.search(r'token < >token token').group() == '< >token' def init_token_settings(assign_pos_tags = False, ignore_tags = False, use_tags = False): return { diff --git a/wordless/wl_colligation_extractor.py b/wordless/wl_colligation_extractor.py index ef22b1347..c0321b1be 100644 --- a/wordless/wl_colligation_extractor.py +++ b/wordless/wl_colligation_extractor.py @@ -500,7 +500,7 @@ def generation_settings_changed(self): settings['measure_bayes_factor'] = self.combo_box_measure_bayes_factor.get_measure() settings['measure_effect_size'] = self.combo_box_measure_effect_size.get_measure() - # Use Data + # Use data self.combo_box_use_data.measures_changed() def table_settings_changed(self): @@ -573,9 +573,7 @@ def update_gui_table(self, err_msg, colligations_freqs_files, colligations_stats try: self.settings = copy.deepcopy(self.main.settings_custom) - self.clr_table() - - settings = self.main.settings_custom['colligation_extractor'] + settings = self.settings['colligation_extractor'] test_statistical_significance = settings['generation_settings']['test_statistical_significance'] measure_bayes_factor = settings['generation_settings']['measure_bayes_factor'] @@ -584,6 +582,9 @@ def update_gui_table(self, err_msg, colligations_freqs_files, colligations_stats col_text_test_stat = self.main.settings_global['tests_statistical_significance'][test_statistical_significance]['col_text'] col_text_effect_size = self.main.settings_global['measures_effect_size'][measure_effect_size]['col_text'] + self.clr_table() + self.model().setRowCount(len(colligations_freqs_files)) + # Insert columns files = list(self.main.wl_file_area.get_selected_files()) files_with_total = files + [{'name': self.tr('Total')}] @@ -723,8 +724,6 @@ def update_gui_table(self, err_msg, colligations_freqs_files, colligations_stats freq_totals = numpy.array(list(colligations_freqs_files.values())).sum(axis = 2).sum(axis = 0) len_files = len(files) - self.model().setRowCount(len(colligations_freqs_files)) - self.disable_updates() for i, ((node, collocate), stats_files) in enumerate(wl_sorting.sorted_stats_files_items(colligations_stats_files)): diff --git a/wordless/wl_collocation_extractor.py b/wordless/wl_collocation_extractor.py index fbe6c84d0..07ae5d2d4 100644 --- a/wordless/wl_collocation_extractor.py +++ b/wordless/wl_collocation_extractor.py @@ -500,7 +500,7 @@ def generation_settings_changed(self): settings['measure_bayes_factor'] = self.combo_box_measure_bayes_factor.get_measure() settings['measure_effect_size'] = self.combo_box_measure_effect_size.get_measure() - # Use Data + # Use data self.combo_box_use_data.measures_changed() def table_settings_changed(self): @@ -575,9 +575,7 @@ def update_gui_table(self, err_msg, collocations_freqs_files, collocations_stats try: self.settings = copy.deepcopy(self.main.settings_custom) - self.clr_table() - - settings = self.main.settings_custom['collocation_extractor'] + settings = self.settings['collocation_extractor'] test_statistical_significance = settings['generation_settings']['test_statistical_significance'] measure_bayes_factor = settings['generation_settings']['measure_bayes_factor'] @@ -586,6 +584,9 @@ def update_gui_table(self, err_msg, collocations_freqs_files, collocations_stats col_text_test_stat = self.main.settings_global['tests_statistical_significance'][test_statistical_significance]['col_text'] col_text_effect_size = self.main.settings_global['measures_effect_size'][measure_effect_size]['col_text'] + self.clr_table() + self.model().setRowCount(len(collocations_freqs_files)) + # Insert columns files = list(self.main.wl_file_area.get_selected_files()) files_with_total = files + [{'name': self.tr('Total')}] @@ -725,7 +726,6 @@ def update_gui_table(self, err_msg, collocations_freqs_files, collocations_stats freq_totals = numpy.array(list(collocations_freqs_files.values())).sum(axis = 2).sum(axis = 0) len_files = len(files) - self.model().setRowCount(len(collocations_freqs_files)) self.disable_updates() for i, ((node, collocate), stats_files) in enumerate(wl_sorting.sorted_stats_files_items(collocations_stats_files)): diff --git a/wordless/wl_concordancer.py b/wordless/wl_concordancer.py index 9e7a4ca21..7e1dccacd 100644 --- a/wordless/wl_concordancer.py +++ b/wordless/wl_concordancer.py @@ -426,7 +426,7 @@ def generation_settings_changed(self): settings['context_len_right_para'] = self.spin_box_context_len_right_para.value() settings['context_len_unit'] = self.combo_box_context_len_unit.currentText() - # Width Unit + # Unit of context length if settings['context_len_unit'] == self.tr('Character'): self.stacked_widget_context_len_left.setCurrentIndex(0) self.stacked_widget_context_len_right.setCurrentIndex(0) diff --git a/wordless/wl_concordancer_parallel.py b/wordless/wl_concordancer_parallel.py index 19ddbbe0b..c8c242967 100644 --- a/wordless/wl_concordancer_parallel.py +++ b/wordless/wl_concordancer_parallel.py @@ -308,14 +308,15 @@ def update_gui_table(self, err_msg, concordance_lines): self.settings = copy.deepcopy(self.main.settings_custom) self.clr_table(0) + self.model().setRowCount(len(concordance_lines)) + # Insert columns for file_name in self.main.wl_file_area.get_selected_file_names(): self.ins_header_hor( self.model().columnCount(), file_name ) - self.model().setRowCount(len(concordance_lines)) self.disable_updates() for i, concordance_line in enumerate(concordance_lines): diff --git a/wordless/wl_dependency_parser.py b/wordless/wl_dependency_parser.py index 5f1b075af..8d9e66391 100644 --- a/wordless/wl_dependency_parser.py +++ b/wordless/wl_dependency_parser.py @@ -399,8 +399,8 @@ def update_gui_table(self, err_msg, results): self.settings = copy.deepcopy(self.main.settings_custom) self.clr_table(0) - self.model().setRowCount(len(results)) + self.disable_updates() for i, ( diff --git a/wordless/wl_figs/wl_figs.py b/wordless/wl_figs/wl_figs.py index 83c0c1c95..23d0ffb52 100644 --- a/wordless/wl_figs/wl_figs.py +++ b/wordless/wl_figs/wl_figs.py @@ -61,7 +61,10 @@ def generate_line_chart( vals = numpy.array([vals for item, vals in data_files_items]) # Frequency data - if fig_settings['use_data'] == _tr('wl_figs', 'Frequency') or re.search(_tr('wl_figs', r'^[LR][1-9][0-9]*$'), fig_settings['use_data']): + if ( + fig_settings['use_data'] == _tr('wl_figs', 'Frequency') + or re.search(_tr('wl_figs', r'^[LR][1-9][0-9]*$'), fig_settings['use_data']) + ): if fig_settings['use_cumulative']: vals = numpy.cumsum(vals, axis = 0) diff --git a/wordless/wl_keyword_extractor.py b/wordless/wl_keyword_extractor.py index a89032472..18fae2aa5 100644 --- a/wordless/wl_keyword_extractor.py +++ b/wordless/wl_keyword_extractor.py @@ -317,7 +317,7 @@ def generation_settings_changed(self): settings['measure_bayes_factor'] = self.combo_box_measure_bayes_factor.get_measure() settings['measure_effect_size'] = self.combo_box_measure_effect_size.get_measure() - # Use Data + # Use data self.combo_box_use_data.measures_changed() def table_settings_changed(self): @@ -432,7 +432,7 @@ def update_gui_table(self, err_msg, keywords_freq_files, keywords_stats_files): try: self.settings = copy.deepcopy(self.main.settings_custom) - settings = self.main.settings_custom['keyword_extractor'] + settings = self.settings['keyword_extractor'] files_observed = list(self.main.wl_file_area.get_selected_files()) test_statistical_significance = settings['generation_settings']['test_statistical_significance'] @@ -443,6 +443,7 @@ def update_gui_table(self, err_msg, keywords_freq_files, keywords_stats_files): col_text_effect_size = self.main.settings_global['measures_effect_size'][measure_effect_size]['col_text'] self.clr_table() + self.model().setRowCount(len(keywords_freq_files)) # Insert columns self.ins_header_hor( @@ -548,7 +549,6 @@ def update_gui_table(self, err_msg, keywords_freq_files, keywords_stats_files): freq_totals = numpy.array(list(keywords_freq_files.values())).sum(axis = 0) len_files_observed = len(files_observed) - self.model().setRowCount(len(keywords_freq_files)) self.disable_updates() for i, (keyword, stats_files) in enumerate(wl_sorting.sorted_stats_files_items(keywords_stats_files)): diff --git a/wordless/wl_measures/wl_measures_readability.py b/wordless/wl_measures/wl_measures_readability.py index 372feaabc..0323f8cd1 100644 --- a/wordless/wl_measures/wl_measures_readability.py +++ b/wordless/wl_measures/wl_measures_readability.py @@ -1083,20 +1083,21 @@ def nws(main, text): # References: # https://github.com/drelhaj/OsmanReadability/blob/master/src/org/project/osman/process/Syllables.java # https://github.com/textstat/textstat/blob/9bf37414407bcaaa45c498478ee383c8738e5d0c/textstat/textstat.py#L569 +RE_STRESS = re.compile(r'[\u064B\u064C\u064D\u0651]') +RE_SHORT = re.compile(r'[\u0627\u0649\?\.\!\,\s]') + def _get_num_syls_ara(word): count_short = 0 count_long = 0 - # Tashkeel: fatha, damma, kasra - tashkeel = ['\u064E', '\u064F', '\u0650'] - for i, char in enumerate(word): - if char not in tashkeel: + # Tashkeel: fatha, damma, kasra + if char not in ('\u064E', '\u064F', '\u0650'): continue # Only if a character is a tashkeel, has a successor, and is followed by an alef, waw, or yeh if i + 1 < len(word): - if word[i + 1] in ['\u0627', '\u0648', '\u064A']: + if word[i + 1] in ('\u0627', '\u0648', '\u064A'): count_long += 1 else: count_short += 1 @@ -1104,10 +1105,10 @@ def _get_num_syls_ara(word): count_short += 1 # Stress syllables: tanween fatha, tanween damma, tanween kasra, shadda - count_stress = len(re.findall(r'[\u064B\u064C\u064D\u0651]', word)) + count_stress = len(RE_STRESS.findall(word)) if count_short == 0: - word = re.sub(r'[\u0627\u0649\?\.\!\,\s]', '', word) + word = RE_SHORT.sub('', word) count_short = max(0, len(word) - 2) # Reference: https://github.com/drelhaj/OsmanReadability/blob/405b927ef3fde200fa08efe12ec2f39b8716e4be/src/org/project/osman/process/OsmanReadability.java#L259 diff --git a/wordless/wl_ngram_generator.py b/wordless/wl_ngram_generator.py index 5e068516a..6d588055b 100644 --- a/wordless/wl_ngram_generator.py +++ b/wordless/wl_ngram_generator.py @@ -504,7 +504,7 @@ def generation_settings_changed(self): settings['measure_dispersion'] = self.combo_box_measure_dispersion.get_measure() settings['measure_adjusted_freq'] = self.combo_box_measure_adjusted_freq.get_measure() - # Keyword Position + # Search term position if self.spin_box_search_term_position_max.value() == self.spin_box_search_term_position_max.maximum(): self.spin_box_search_term_position_min.setMaximum(settings['ngram_size_max']) self.spin_box_search_term_position_max.setMaximum(settings['ngram_size_max']) @@ -520,7 +520,7 @@ def generation_settings_changed(self): else: self.spin_box_allow_skipped_tokens.setEnabled(False) - # Use Data + # Use data self.combo_box_use_data.measures_changed() def table_settings_changed(self): @@ -590,7 +590,7 @@ def update_gui_table(self, err_msg, ngrams_freq_files, ngrams_stats_files): try: self.settings = copy.deepcopy(self.main.settings_custom) - settings = self.main.settings_custom['ngram_generator'] + settings = self.settings['ngram_generator'] measure_dispersion = settings['generation_settings']['measure_dispersion'] measure_adjusted_freq = settings['generation_settings']['measure_adjusted_freq'] @@ -599,6 +599,7 @@ def update_gui_table(self, err_msg, ngrams_freq_files, ngrams_stats_files): col_text_adjusted_freq = self.main.settings_global['measures_adjusted_freq'][measure_adjusted_freq]['col_text'] self.clr_table() + self.model().setRowCount(len(ngrams_freq_files)) # Insert columns files = list(self.main.wl_file_area.get_selected_files()) @@ -658,7 +659,6 @@ def update_gui_table(self, err_msg, ngrams_freq_files, ngrams_stats_files): freq_totals = numpy.array(list(ngrams_freq_files.values())).sum(axis = 0) len_files = len(files) - self.model().setRowCount(len(ngrams_freq_files)) self.disable_updates() for i, (ngram, freq_files) in enumerate(wl_sorting.sorted_freq_files_items(ngrams_freq_files)): diff --git a/wordless/wl_nlp/wl_sentence_tokenization.py b/wordless/wl_nlp/wl_sentence_tokenization.py index 870864298..b1b63c800 100644 --- a/wordless/wl_nlp/wl_sentence_tokenization.py +++ b/wordless/wl_nlp/wl_sentence_tokenization.py @@ -207,9 +207,11 @@ def wl_sentence_tokenize(main, text, lang, sentence_tokenizer = 'default'): ]))) def wl_sentence_split(main, text, terminators = SENTENCE_TERMINATORS): + re_terminators = re.compile(fr'.+?[{terminators}]+\s|.+?$') + return [ sentence.strip() - for sentence in re.findall(fr'.+?[{terminators}]+\s|.+?$', text.strip()) + for sentence in re_terminators.findall(text.strip()) ] # Reference: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Terminal_Punctuation=Yes:] @@ -290,9 +292,11 @@ def wl_sentence_split(main, text, terminators = SENTENCE_TERMINATORS): ]))) def wl_sentence_seg_tokenize(main, text, terminators = SENTENCE_SEG_TERMINATORS): + re_terminators = re.compile(fr'.+?[{terminators}]+|.+?$') + return [ sentence_seg.strip() - for sentence_seg in re.findall(fr'.+?[{terminators}]+|.+?$', text.strip()) + for sentence_seg in re_terminators.findall(text.strip()) ] REPLACEMENT_CHAR = '\uFFFF' @@ -300,8 +304,9 @@ def wl_sentence_seg_tokenize(main, text, terminators = SENTENCE_SEG_TERMINATORS) def wl_sentence_seg_tokenize_tokens(main, tokens, terminators = SENTENCE_SEG_TERMINATORS): # Insert a replacement character between tokens to prevent text from being split within tokens text = REPLACEMENT_CHAR.join(tokens) + re_terminators = re.compile(fr'.+?[{terminators}]+{REPLACEMENT_CHAR}|.+?$') return [ wl_texts.clean_texts(sentence_seg.split(REPLACEMENT_CHAR)) - for sentence_seg in re.findall(fr'.+?[{terminators}]+{REPLACEMENT_CHAR}|.+?$', text.strip()) + for sentence_seg in re_terminators.findall(text.strip()) ] diff --git a/wordless/wl_nlp/wl_texts.py b/wordless/wl_nlp/wl_texts.py index 0c914b0a9..cf3e73f2c 100644 --- a/wordless/wl_nlp/wl_texts.py +++ b/wordless/wl_nlp/wl_texts.py @@ -232,7 +232,7 @@ def __init__(self, main, file): # Untokenized & Tagged elif not self.tokenized and self.tagged: # Replace all tags with a whitespace character to ensure that no words run together - text_no_tags = re.sub(re_tags, ' ', text) + text_no_tags = re_tags.sub(' ', text) # Remove redundant whitespace characters so that sentences are split correctly text_no_tags = re.sub(r'\s{2}', ' ', text_no_tags) @@ -244,7 +244,7 @@ def __init__(self, main, file): text = self.check_tags_text_start(text) i_tag_end = 0 - for tag in re.finditer(re_tags, text): + for tag in re_tags.finditer(text): tags_tokens = self.add_tags_tokenization(text[i_tag_end:tag.start()], tags_tokens) tags_tokens[-1].append(tag.group()) @@ -280,7 +280,7 @@ def __init__(self, main, file): if para: # Replace all tags with a whitespace to ensure no words run together - text_no_tags = re.sub(re_tags, ' ', para) + text_no_tags = re_tags.sub(' ', para) for sentence in wl_sentence_tokenization.wl_sentence_split(self.main, text_no_tags): self.tokens_multilevel[-1].append([]) @@ -294,7 +294,7 @@ def __init__(self, main, file): # Extract tags i_tag_end = 0 - for tag in re.finditer(re_tags, para): + for tag in re_tags.finditer(para): tags_tokens = self.add_tags_splitting(para[i_tag_end:tag.start()], tags_tokens) tags_tokens[-1].append(tag.group()) @@ -364,7 +364,7 @@ def __init__(self, main, file): for sentence in para: for i, sentence_seg in enumerate(sentence): sentence[i] = [ - re.sub(RE_VIE_TOKENIZED, ' ', token) + RE_VIE_TOKENIZED.sub(' ', token) for token in sentence_seg ] @@ -400,7 +400,7 @@ def check_tags_text_start(self, text): re_tag_text_start = re.compile(fr"\s*({wl_matching.get_re_tags(self.main, tag_type = 'body')})") self.tags_text_start = [] - while (re_result := re.match(re_tag_text_start, text)): + while (re_result := re_tag_text_start.match(text)): tag = re_result.group() self.tags_text_start.append(tag) @@ -613,7 +613,7 @@ def __init__(self, main, file): # pylint: disable=super-init-not-called # Untokenized & Tagged elif not self.tokenized and self.tagged: # Replace all tags with a whitespace to ensure no words run together - text_no_tags = re.sub(re_tags, ' ', text) + text_no_tags = re_tags.sub(' ', text) tokens = wl_word_tokenization.wl_word_tokenize_flat(self.main, text_no_tags, lang = self.lang) self.tokens_multilevel[0][0][0].extend(tokens) @@ -623,7 +623,7 @@ def __init__(self, main, file): # pylint: disable=super-init-not-called # Tokenized & Tagged elif self.tokenized and self.tagged: # Replace all tags with a whitespace to ensure no words run together - text_no_tags = re.sub(re_tags, ' ', text) + text_no_tags = re_tags.sub(' ', text) self.tokens_multilevel[0][0][0].extend(text_no_tags.split()) elif file_ext == '.xml' and self.tagged: @@ -658,7 +658,7 @@ def __init__(self, main, file): # pylint: disable=super-init-not-called for sentence in para: for i, sentence_seg in enumerate(sentence): sentence[i] = [ - re.sub(RE_VIE_TOKENIZED, ' ', token) + RE_VIE_TOKENIZED.sub(' ', token) for token in sentence_seg ] diff --git a/wordless/wl_nlp/wl_token_processing.py b/wordless/wl_nlp/wl_token_processing.py index 9647896c6..4614f1845 100644 --- a/wordless/wl_nlp/wl_token_processing.py +++ b/wordless/wl_nlp/wl_token_processing.py @@ -280,8 +280,8 @@ def wl_process_tokens_profiler(main, text, token_settings, tab): return text_modified def wl_process_tokens_wordlist_generator(main, text, token_settings, generation_settings): - # Syllabification - if generation_settings['syllabification']: + # Show syllabified forms + if generation_settings['show_syllabified_forms']: text_syl_tokenize(main, text) text_modified = wl_process_tokens_ngram_generator(main, text, token_settings) diff --git a/wordless/wl_nlp/wl_word_tokenization.py b/wordless/wl_nlp/wl_word_tokenization.py index 5caf340fd..383c56dbd 100644 --- a/wordless/wl_nlp/wl_word_tokenization.py +++ b/wordless/wl_nlp/wl_word_tokenization.py @@ -28,6 +28,9 @@ from wordless.wl_nlp import wl_nlp_utils, wl_sentence_tokenization, wl_texts from wordless.wl_utils import wl_conversion, wl_misc +RE_CHAR_HAN_OTHER = re.compile(r'^h+|^o+') +RE_CHAR_HAN_KANJI_OTHER = re.compile(r'^h+|^k+|^o+') + def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'): tokens_multilevel = [] @@ -126,7 +129,7 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'): char_scripts += 'o' while sentence: - len_token = len(re.search(r'^h+|^o+', char_scripts).group()) + len_token = len(RE_CHAR_HAN_OTHER.search(char_scripts).group()) token = sentence[:len_token] if char_scripts.startswith('h'): @@ -174,7 +177,7 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'): char_scripts += 'o' while sentence: - len_token = len(re.search(r'^h+|^k+|^o+', char_scripts).group()) + len_token = len(RE_CHAR_HAN_KANJI_OTHER.search(char_scripts).group()) token = sentence[:len_token] if char_scripts.startswith('h'): diff --git a/wordless/wl_results/wl_results_filter.py b/wordless/wl_results/wl_results_filter.py index 325886fba..eaa48392d 100644 --- a/wordless/wl_results/wl_results_filter.py +++ b/wordless/wl_results/wl_results_filter.py @@ -388,9 +388,9 @@ def __init__(self, main, table): self.col_text_adjusted_freq = self.main.settings_global['measures_adjusted_freq'][measure_adjusted_freq]['col_text'] if self.tab == 'wordlist_generator': - self.has_syllabification = settings['generation_settings']['syllabification'] + self.has_syllabified_forms = settings['generation_settings']['show_syllabified_forms'] else: - self.has_syllabification = True + self.has_syllabified_forms = True self.has_dispersion = measure_dispersion != 'none' self.has_adjusted_freq = measure_adjusted_freq != 'none' @@ -409,7 +409,7 @@ def __init__(self, main, table): settings = self.settings, filter_name = f'len_{self.type_node}' )) - if self.tab == 'wordlist_generator' and settings['generation_settings']['syllabification']: + if self.tab == 'wordlist_generator' and settings['generation_settings']['show_syllabified_forms']: self.layouts_filters.append(widgets_filter( self, label = self.tr('Number of syllables:'), @@ -474,8 +474,8 @@ def run(self): if self.dialog.tab == 'wordlist_generator': col_node = self.dialog.table.find_header_hor(self.tr('Token')) - if self.dialog.has_syllabification: - col_num_syls = self.dialog.table.find_header_hor(self.tr('Syllabification')) + if self.dialog.has_syllabified_forms: + col_num_syls = self.dialog.table.find_header_hor(self.tr('Syllabified Form')) elif self.dialog.tab == 'ngram_generator': col_node = self.dialog.table.find_header_hor(self.tr('N-gram')) @@ -543,11 +543,11 @@ def run(self): if self.dialog.table.model().item(i, col_freq).val > 0: filters.append(len_node_min <= len_node <= len_node_max) - if self.dialog.tab == 'wordlist_generator' and self.dialog.has_syllabification: + if self.dialog.tab == 'wordlist_generator' and self.dialog.has_syllabified_forms: filter_num_syls = False - syllabification = self.dialog.table.model().item(i, col_num_syls).text() + syllabified_form = self.dialog.table.model().item(i, col_num_syls).text() - for syls in syllabification.split(', '): + for syls in syllabified_form.split(', '): if num_syls_min <= len(syls.split('-')) <= num_syls_max: filter_num_syls = True diff --git a/wordless/wl_results/wl_results_sort.py b/wordless/wl_results/wl_results_sort.py index 3d43112aa..968756399 100644 --- a/wordless/wl_results/wl_results_sort.py +++ b/wordless/wl_results/wl_results_sort.py @@ -40,6 +40,9 @@ _tr = QCoreApplication.translate +RE_SORTING_COL_L = re.compile(_tr('Wl_Dialog_Results_Sort_Concordancer', r'^L[1-9][0-9]*$')) +RE_SORTING_COL_R = re.compile(_tr('Wl_Dialog_Results_Sort_Concordancer', r'^R[1-9][0-9]*$')) + class Wl_Dialog_Results_Sort_Concordancer(wl_dialogs.Wl_Dialog): def __init__(self, main, table): super().__init__( @@ -133,9 +136,6 @@ def update_gui(self, results): results[i][2] = right_new # Sort results - re_sorting_col_l = re.compile(self.tr(r'^L[1-9][0-9]*$')) - re_sorting_col_r = re.compile(self.tr(r'^R[1-9][0-9]*$')) - for sorting_col, sorting_order in reversed(self.settings['sorting_rules']): reverse = 0 if sorting_order == self.tr('Ascending') else 1 @@ -151,9 +151,9 @@ def update_gui(self, results): else: span = int(sorting_col[1:]) - if re_sorting_col_l.search(sorting_col): + if RE_SORTING_COL_L.search(sorting_col): results.sort(key = lambda item, span = span: item[0].tokens_raw[-span], reverse = reverse) - elif re_sorting_col_r.search(sorting_col): + elif RE_SORTING_COL_R.search(sorting_col): results.sort(key = lambda item, span = span: item[2].tokens_raw[span - 1], reverse = reverse) # Clear highlights before sorting the results @@ -193,7 +193,7 @@ def update_gui(self, results): i_highlight_color_right = 1 for sorting_col, _ in self.settings['sorting_rules']: - if re_sorting_col_l.search(sorting_col) and int(sorting_col[1:]) <= len(text_left): + if RE_SORTING_COL_L.search(sorting_col) and int(sorting_col[1:]) <= len(text_left): hightlight_color = highlight_colors[i_highlight_color_left % len(highlight_colors)] text_left[-int(sorting_col[1:])] = f''' @@ -203,7 +203,7 @@ def update_gui(self, results): ''' i_highlight_color_left += 1 - elif re_sorting_col_r.search(sorting_col) and int(sorting_col[1:]) - 1 < len(text_right): + elif RE_SORTING_COL_R.search(sorting_col) and int(sorting_col[1:]) - 1 < len(text_right): hightlight_color = highlight_colors[i_highlight_color_right % len(highlight_colors)] text_right[int(sorting_col[1:]) - 1] = f''' @@ -382,7 +382,7 @@ def max_left(self): max_left = max(( int(col[1:]) for col in self.cols_to_sort - if re.search(self.tr(r'^L[0-9]+$'), col) + if RE_SORTING_COL_L.search(col) )) else: max_left = 0 @@ -394,7 +394,7 @@ def max_right(self): max_right = max(( int(col[1:]) for col in self.cols_to_sort - if re.search(self.tr(r'^R[0-9]+$'), col) + if RE_SORTING_COL_R.search(col) )) else: max_right = 0 @@ -418,12 +418,12 @@ def _add_row(self, row = None, texts = None): cols_left = sorted([ int(self.model().item(i, 0).text()[1:]) for i in range(self.model().rowCount()) - if re.search(self.tr(r'^L[0-9]+$'), self.model().item(i, 0).text()) + if RE_SORTING_COL_L.search(self.model().item(i, 0).text()) ]) cols_right = sorted([ int(self.model().item(i, 0).text()[1:]) for i in range(self.model().rowCount()) - if re.search(self.tr(r'^R[0-9]+$'), self.model().item(i, 0).text()) + if RE_SORTING_COL_R.search(self.model().item(i, 0).text()) ]) if sorting_col: diff --git a/wordless/wl_settings/wl_settings_default.py b/wordless/wl_settings/wl_settings_default.py index 313f0ef9a..6dcfce899 100644 --- a/wordless/wl_settings/wl_settings_default.py +++ b/wordless/wl_settings/wl_settings_default.py @@ -490,7 +490,8 @@ def init_settings_default(main): }, 'generation_settings': { - 'syllabification': True, + 'show_syllabified_forms': True, + 'measure_dispersion': 'juillands_d', 'measure_adjusted_freq': 'juillands_u' }, diff --git a/wordless/wl_settings/wl_settings_files.py b/wordless/wl_settings/wl_settings_files.py index 8bb980e57..9c34bf705 100644 --- a/wordless/wl_settings/wl_settings_files.py +++ b/wordless/wl_settings/wl_settings_files.py @@ -243,6 +243,11 @@ def apply_settings(self): return True # self.tr() does not work in inherited classes +RE_TAG_EMBEDDED = re.compile(r'^([^\w\s]|_)+\S*$') +RE_TAG_NON_EMBEDDED = re.compile(r'^([^\w\s]|_)+\S*([^\w\s]|_)+$') +RE_TAG_HTML_BRACKETS = re.compile(r'(^<)|(>$)') +RE_TAG_HTML_PARENTHESES = re.compile(r'\s\((\d+)\)') + class Wl_Table_Tags(wl_tables.Wl_Table_Add_Ins_Del_Clr): def __init__(self, parent, settings_tags, defaults_row): super().__init__( @@ -283,12 +288,12 @@ def item_changed(self, item): # pylint: disable=arguments-differ # Opening Tag if self.model().item(row, 0).text() == _tr('Wl_Table_Tags', 'Embedded'): - re_validation = re.search(r'^([^\w\s]|_)+\S*$', item_opening_tag.text()) + re_validation = RE_TAG_EMBEDDED.search(item_opening_tag.text()) warning_text = _tr('Wl_Table_Tags', '''
Embedded tags must begin with a punctuation mark, e.g. an underscore or a slash!
''') else: - re_validation = re.search(r'^([^\w\s]|_)+\S*([^\w\s]|_)+$', item_opening_tag.text()) + re_validation = RE_TAG_NON_EMBEDDED.search(item_opening_tag.text()) warning_text = _tr('Wl_Table_Tags', '''
Non-embedded tags must begin and end with a punctuation mark, e.g. brackets!
''') @@ -368,7 +373,7 @@ def _add_row(self, row = None, texts = None): # HTML tags if opening_tag.startswith('<') and opening_tag.endswith('>'): opening_tags = [ - re.sub(r'(^<)|(>$)', r'', self.model().item(i, 2).text()) + RE_TAG_HTML_BRACKETS.sub(r'', self.model().item(i, 2).text()) for i in range(self.model().rowCount()) ] opening_tag = f"<{wl_checks_misc.check_new_name(opening_tag[1:-1], opening_tags, separator = '')}>" @@ -376,7 +381,7 @@ def _add_row(self, row = None, texts = None): opening_tags = [self.model().item(i, 2).text() for i in range(self.model().rowCount())] opening_tag = wl_checks_misc.check_new_name(opening_tag, opening_tags, separator = '') - opening_tag = re.sub(r'\s\((\d+)\)', r'\1', opening_tag) + opening_tag = RE_TAG_HTML_PARENTHESES.sub(r'\1', opening_tag) else: type_, level, opening_tag, _ = texts diff --git a/wordless/wl_settings/wl_settings_word_tokenization.py b/wordless/wl_settings/wl_settings_word_tokenization.py index 06529e7b1..f356c4a21 100644 --- a/wordless/wl_settings/wl_settings_word_tokenization.py +++ b/wordless/wl_settings/wl_settings_word_tokenization.py @@ -220,6 +220,8 @@ def apply_settings(self): return True +RE_VIE_SPECES_UNDERSCORES = re.compile(r'\s+') + class Wl_Worker_Preview_Word_Tokenizer(wl_threading.Wl_Worker_No_Progress): worker_done = pyqtSignal(list) @@ -241,7 +243,7 @@ def run(self): # Replace spaces with underscores in Vietnamese texts if preview_lang == 'vie': - tokens = [re.sub(r'\s+', r'_', token) for token in tokens] + tokens = [RE_VIE_SPECES_UNDERSCORES.sub(r'_', token) for token in tokens] preview_results.append(' '.join(tokens)) diff --git a/wordless/wl_widgets/wl_lists.py b/wordless/wl_widgets/wl_lists.py index 8c798a1c4..89c8704e9 100644 --- a/wordless/wl_widgets/wl_lists.py +++ b/wordless/wl_widgets/wl_lists.py @@ -41,6 +41,8 @@ _tr = QCoreApplication.translate +RE_EMPTY_ITEM = re.compile(r'^\s*$') + # self.tr() does not work in inherited classes class Wl_List_Add_Ins_Del_Clr(QListView): def __init__( @@ -117,7 +119,7 @@ def data_changed(self, topLeft = None, bottomRight = None): # pylint: disable=un item_text = self.model().stringList()[item_row] # Check for empty items - if re.search(r'^\s*$', item_text): + if RE_EMPTY_ITEM.search(item_text): data = self.model().stringList() data[item_row] = self.items_old[item_row] diff --git a/wordless/wl_widgets/wl_tables.py b/wordless/wl_widgets/wl_tables.py index 8335b6513..11adfaa0a 100644 --- a/wordless/wl_widgets/wl_tables.py +++ b/wordless/wl_widgets/wl_tables.py @@ -502,6 +502,10 @@ def update_gui_exp(self, err_msg, file_path): wl_checks_work_area.check_err_exp_table(self.main, err_msg, file_path) +RE_REDUNDANT_SPACES = re.compile(r'\s+') +RE_INVALID_XML_CHARS = re.compile(r'[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]+') +RE_COLOR = re.compile(r'(?<=color: #)([0-9a-fA-F]{3}|[0-9a-fA-F]{6})(?=;)') + class Wl_Worker_Exp_Table(wl_threading.Wl_Worker): worker_done = pyqtSignal(str, str) @@ -756,7 +760,7 @@ def run(self): def clean_text_csv(self, items): for i, item in enumerate(items): items[i] = item.replace('\n', ' ') - items[i] = re.sub(r'\s+', ' ', items[i]) + items[i] = RE_REDUNDANT_SPACES.sub(' ', items[i]) items[i] = items[i].strip() return items @@ -765,7 +769,7 @@ def clean_text_csv(self, items): def remove_invalid_xml_chars(self, text): # openpyxl.cell.cell.ILLEGAL_CHARACTERS_RE is not complete # Reference: https://www.w3.org/TR/xml/#charsets - return re.sub(r'[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]+', '', text) + return RE_INVALID_XML_CHARS.sub('', text) def style_header(self, cell): cell.font = openpyxl.styles.Font( @@ -922,9 +926,7 @@ def style_cell_rich_text(self, cell, item): if isinstance(html, bs4.element.Tag) and html.has_attr('style'): style = html['style'] - re_color = re.search(r'(?<=color: #)([0-9a-fA-F]{3}|[0-9a-fA-F]{6})(?=;)', style) - - if re_color: + if (re_color := RE_COLOR.search(style)): color = re_color.group() # 3-digit color shorthand @@ -1010,9 +1012,7 @@ def style_para_rich_text(self, para, para_text, item): if isinstance(html, bs4.element.Tag) and html.has_attr('style'): style = html['style'] - re_color = re.search(r'(?<=color: #)([0-9a-fA-F]{3}|[0-9a-fA-F]{6})(?=;)', style) - - if re_color: + if (re_color := RE_COLOR.search(style)): color = re_color.group() # 3-digit color shorthand diff --git a/wordless/wl_wordlist_generator.py b/wordless/wl_wordlist_generator.py index aa332f3f4..e284ea460 100644 --- a/wordless/wl_wordlist_generator.py +++ b/wordless/wl_wordlist_generator.py @@ -128,7 +128,7 @@ def __init__(self, main): # Generation Settings self.group_box_generation_settings = QGroupBox(self.tr('Generation Settings')) - self.checkbox_syllabification = QCheckBox(self.tr('Syllabification')) + self.checkbox_show_syllabified_forms = QCheckBox(self.tr('Show syllabified forms')) ( self.label_measure_dispersion, self.combo_box_measure_dispersion, @@ -136,12 +136,12 @@ def __init__(self, main): self.combo_box_measure_adjusted_freq ) = wl_widgets.wl_widgets_measures_wordlist_ngram_generation(self) - self.checkbox_syllabification.stateChanged.connect(self.generation_settings_changed) + self.checkbox_show_syllabified_forms.stateChanged.connect(self.generation_settings_changed) self.combo_box_measure_dispersion.currentTextChanged.connect(self.generation_settings_changed) self.combo_box_measure_adjusted_freq.currentTextChanged.connect(self.generation_settings_changed) self.group_box_generation_settings.setLayout(wl_layouts.Wl_Layout()) - self.group_box_generation_settings.layout().addWidget(self.checkbox_syllabification, 0, 0) + self.group_box_generation_settings.layout().addWidget(self.checkbox_show_syllabified_forms, 0, 0) self.group_box_generation_settings.layout().addWidget(self.label_measure_dispersion, 1, 0) self.group_box_generation_settings.layout().addWidget(self.combo_box_measure_dispersion, 2, 0) self.group_box_generation_settings.layout().addWidget(self.label_measure_adjusted_freq, 3, 0) @@ -266,7 +266,8 @@ def load_settings(self, defaults = False): self.checkbox_use_tags.setChecked(settings['token_settings']['use_tags']) # Generation Settings - self.checkbox_syllabification.setChecked(settings['generation_settings']['syllabification']) + self.checkbox_show_syllabified_forms.setChecked(settings['generation_settings']['show_syllabified_forms']) + self.combo_box_measure_dispersion.set_measure(settings['generation_settings']['measure_dispersion']) self.combo_box_measure_adjusted_freq.set_measure(settings['generation_settings']['measure_adjusted_freq']) @@ -313,11 +314,12 @@ def token_settings_changed(self): def generation_settings_changed(self): settings = self.main.settings_custom['wordlist_generator']['generation_settings'] - settings['syllabification'] = self.checkbox_syllabification.isChecked() + settings['show_syllabified_forms'] = self.checkbox_show_syllabified_forms.isChecked() + settings['measure_dispersion'] = self.combo_box_measure_dispersion.get_measure() settings['measure_adjusted_freq'] = self.combo_box_measure_adjusted_freq.get_measure() - # Use Data + # Use data self.combo_box_use_data.measures_changed() def table_settings_changed(self): @@ -383,7 +385,7 @@ def update_gui_table(self, err_msg, tokens_freq_files, tokens_stats_files, syls_ try: self.settings = copy.deepcopy(self.main.settings_custom) - settings = self.main.settings_custom['wordlist_generator'] + settings = self.settings['wordlist_generator'] measure_dispersion = settings['generation_settings']['measure_dispersion'] measure_adjusted_freq = settings['generation_settings']['measure_adjusted_freq'] @@ -392,12 +394,13 @@ def update_gui_table(self, err_msg, tokens_freq_files, tokens_stats_files, syls_ col_text_adjusted_freq = self.main.settings_global['measures_adjusted_freq'][measure_adjusted_freq]['col_text'] self.clr_table() + self.model().setRowCount(len(tokens_freq_files)) # Insert columns - if settings['generation_settings']['syllabification']: + if settings['generation_settings']['show_syllabified_forms']: self.ins_header_hor( self.model().columnCount() - 2, - self.tr('Syllabification') + self.tr('Syllabified Form') ) files = list(self.main.wl_file_area.get_selected_files()) @@ -457,7 +460,6 @@ def update_gui_table(self, err_msg, tokens_freq_files, tokens_stats_files, syls_ freq_totals = numpy.array(list(tokens_freq_files.values())).sum(axis = 0) len_files = len(files) - self.model().setRowCount(len(tokens_freq_files)) self.disable_updates() for i, (token, freq_files) in enumerate(wl_sorting.sorted_freq_files_items(tokens_freq_files)): @@ -470,8 +472,8 @@ def update_gui_table(self, err_msg, tokens_freq_files, tokens_stats_files, syls_ self.model().setItem(i, 1, wl_tables.Wl_Table_Item(token.display_text())) self.model().item(i, 1).tokens_filter = [token] - # Syllabification - if settings['generation_settings']['syllabification']: + # Syllabified Form + if settings['generation_settings']['show_syllabified_forms']: # Use tags only if settings['token_settings']['use_tags']: self.set_item_err( @@ -480,12 +482,12 @@ def update_gui_table(self, err_msg, tokens_freq_files, tokens_stats_files, syls_ alignment_hor = 'left' ) elif len(syls_tokens[token]) == 1: - token_syllabified = list(syls_tokens[token].values())[0] + token_syllabified_form = list(syls_tokens[token].values())[0] - if token_syllabified == self.tr('No language support'): - self.set_item_err(i, 2, token_syllabified, alignment_hor = 'left') + if token_syllabified_form == self.tr('No language support'): + self.set_item_err(i, 2, token_syllabified_form, alignment_hor = 'left') else: - self.model().setItem(i, 2, wl_tables.Wl_Table_Item(token_syllabified)) + self.model().setItem(i, 2, wl_tables.Wl_Table_Item(token_syllabified_form)) # Same token found in more than one language else: token_syllabified_forms = [] @@ -494,12 +496,12 @@ def update_gui_table(self, err_msg, tokens_freq_files, tokens_stats_files, syls_ lang_text = wl_conversion.to_lang_text(self.main, lang) token_syllabified_forms.append(f"{syllabified_form} [{lang_text}]") - tokens_syllabified = ', '.join(token_syllabified_forms) + token_syllabified_forms = ', '.join(token_syllabified_forms) - if self.tr('No language support') in tokens_syllabified: - self.set_item_err(i, 2, tokens_syllabified, alignment_hor = 'left') + if self.tr('No language support') in token_syllabified_forms: + self.set_item_err(i, 2, token_syllabified_forms, alignment_hor = 'left') else: - self.model().setItem(i, 2, wl_tables.Wl_Table_Item(tokens_syllabified)) + self.model().setItem(i, 2, wl_tables.Wl_Table_Item(token_syllabified_forms)) # Frequency for j, freq in enumerate(freq_files):