diff --git a/tests/test_wordlist_generator.py b/tests/test_wordlist_generator.py
index 6b6b38a4e..8220b4e03 100644
--- a/tests/test_wordlist_generator.py
+++ b/tests/test_wordlist_generator.py
@@ -61,7 +61,7 @@ def test_wordlist_generator():
         update_gui = update_gui
     ).run()
 
-def update_gui(err_msg, tokens_freq_files, tokens_stats_files, tokens_syllabification):
+def update_gui(err_msg, tokens_freq_files, tokens_stats_files, tokens_syllabified_form):
     print(err_msg)
 
     assert not err_msg
@@ -74,8 +74,8 @@ def update_gui(err_msg, tokens_freq_files, tokens_stats_files, tokens_syllabification):
         # Token
         assert token
-        # Syllabification
-        assert tokens_syllabification[token]
+        # Syllabified Form
+        assert tokens_syllabified_form[token]
         # Frequency
         assert len(freq_files) == num_files_selected + 1
         # Dispersion & Adjusted Frequency
diff --git a/tests/tests_file_area/test_file_area_file_types.py b/tests/tests_file_area/test_file_area_file_types.py
index 312327c7a..8c4f8acf8 100644
--- a/tests/tests_file_area/test_file_area_file_types.py
+++ b/tests/tests_file_area/test_file_area_file_types.py
@@ -18,7 +18,6 @@
 import glob
 import os
-import re
 import time
 
 from PyQt5.QtCore import QObject
@@ -289,7 +288,7 @@ def update_gui_misc(err_msg, new_files):
         for sentence in para:
             for sentence_seg in sentence:
                 for token in sentence_seg:
-                    assert not re.search(wl_texts.RE_VIE_TOKENIZED, token)
+                    assert not wl_texts.RE_VIE_TOKENIZED.search(token)
 
 if __name__ == '__main__':
     test_file_area_file_types()
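The change above is the first instance of a pattern this patch applies throughout: hoisting `re.compile()` to module level and calling the compiled pattern's bound methods instead of passing pattern strings to `re.search()`. A minimal, self-contained sketch of the idea, using an illustrative pattern rather than the real `wl_texts.RE_VIE_TOKENIZED`:

```python
import re
import timeit

# Illustrative pattern, standing in for a module-level constant
# such as wl_texts.RE_VIE_TOKENIZED
RE_TOKENIZED = re.compile(r'\S+_\S+')

def check_module_level(token):
    # Pattern compiled once at import time; only .search() runs per call
    return RE_TOKENIZED.search(token) is not None

def check_inline(token):
    # re.search() consults the module's internal pattern cache on every call
    return re.search(r'\S+_\S+', token) is not None

if __name__ == '__main__':
    for func in (check_module_level, check_inline):
        secs = timeit.timeit(lambda: func('xin_chào'), number = 100_000)
        print(f'{func.__name__}: {secs:.3f}s')
```

Both variants hit `re`'s internal cache, so the saving per call is small, but it is multiplied across per-token loops like the one in the test above.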
diff --git a/tests/tests_nlp/test_matching.py b/tests/tests_nlp/test_matching.py
index b0c4b8860..d7911b2fc 100644
--- a/tests/tests_nlp/test_matching.py
+++ b/tests/tests_nlp/test_matching.py
@@ -73,31 +73,35 @@ def test_get_re_tags():
     assert re_tags_body == r'_\S*(?=\s|$)|/\S*(?=\s|$)|_(?=\s|$)|</?.*?>|</?\ \*\ >|</?\ T\ AG\ >|</?>|</?\ >'
     assert re_tags_xml == r'</?p>|</?s>|</?w>|</?c>|</?\ p\ p\ >|</?>|</?\ >'
 
-    assert re.search(re_tags_header, r'<head>token</head>').group() == '<head>token</head>'
-    assert re.search(re_tags_xml, r'<p>token</p>').group() == '<p>'
-    assert re.search(re_tags_xml, r'< p p >token< /p p >').group() == '< p p >'
-    assert re.search(re_tags_xml, r'<>token</>').group() == '<>'
-    assert re.search(re_tags_xml, r'< >token< / >').group() == '< >'
+    re_tags_header = re.compile(re_tags_header)
+    re_tags_body = re.compile(re_tags_body)
+    re_tags_xml = re.compile(re_tags_xml)
+
+    assert re_tags_header.search(r'<head>token</head>').group() == '<head>token</head>'
+    assert re_tags_xml.search(r'<p>token</p>').group() == '<p>'
+    assert re_tags_xml.search(r'< p p >token< /p p >').group() == '< p p >'
+    assert re_tags_xml.search(r'<>token</>').group() == '<>'
+    assert re_tags_xml.search(r'< >token< / >').group() == '< >'
 
 def test_get_re_tags_with_tokens():
     re_tags_header = wl_matching.get_re_tags_with_tokens(main, tag_type = 'header')
@@ -108,26 +112,30 @@ def test_get_re_tags_with_tokens():
     assert re_tags_body == r'\S*_\S*(?=\s|$)|\S*/\S*(?=\s|$)|\S*_(?=\s|$)|<.*?>.*?</.*?>|<\ \*\ >.*<\ /\ \*\ >|<\ T\ AG\ >.*<\ /\ T\ AG\ >|<>.*</>|<\ >.*<\ /\ >'
     assert re_tags_xml == r'<p>.*?</p>|<s>.*?</s>|<w>.*?</w>|<c>.*?</c>|<\ p\ p\ >.*<\ /\ p\ p\ >|<>.*</>|<\ >.*<\ /\ >'
 
-    assert re.search(re_tags_header, r'token<head>token</head>token').group() == '<head>token</head>'
-    assert re.search(re_tags_xml, r'token <p>token</p> token').group() == '<p>token</p>'
-    assert re.search(re_tags_xml, r'token < p p >token< /p p > token').group() == '< p p >token< /p p >'
-    assert re.search(re_tags_xml, r'token <>token</> token').group() == '<>token</>'
-    assert re.search(re_tags_xml, r'token < >token< / > token').group() == '< >token< / >'
+    re_tags_header = re.compile(re_tags_header)
+    re_tags_body = re.compile(re_tags_body)
+    re_tags_xml = re.compile(re_tags_xml)
+
+    assert re_tags_header.search(r'token<head>token</head>token').group() == '<head>token</head>'
+    assert re_tags_xml.search(r'token <p>token</p> token').group() == '<p>token</p>'
+    assert re_tags_xml.search(r'token < p p >token< /p p > token').group() == '< p p >token< /p p >'
+    assert re_tags_xml.search(r'token <>token</> token').group() == '<>token</>'
+    assert re_tags_xml.search(r'token < >token< / > token').group() == '< >token< / >'
 
 def init_token_settings(assign_pos_tags = False, ignore_tags = False, use_tags = False):
     return {
diff --git a/wordless/wl_colligation_extractor.py b/wordless/wl_colligation_extractor.py
index ef22b1347..c0321b1be 100644
--- a/wordless/wl_colligation_extractor.py
+++ b/wordless/wl_colligation_extractor.py
@@ -500,7 +500,7 @@ def generation_settings_changed(self):
         settings['measure_bayes_factor'] = self.combo_box_measure_bayes_factor.get_measure()
         settings['measure_effect_size'] = self.combo_box_measure_effect_size.get_measure()
 
-        # Use Data
+        # Use data
         self.combo_box_use_data.measures_changed()
 
     def table_settings_changed(self):
@@ -573,9 +573,7 @@ def update_gui_table(self, err_msg, colligations_freqs_files, colligations_stats_files):
     try:
         self.settings = copy.deepcopy(self.main.settings_custom)
 
-        self.clr_table()
-
-        settings = self.main.settings_custom['colligation_extractor']
+        settings = self.settings['colligation_extractor']
 
         test_statistical_significance = settings['generation_settings']['test_statistical_significance']
         measure_bayes_factor = settings['generation_settings']['measure_bayes_factor']
@@ -584,6 +582,9 @@ def update_gui_table(self, err_msg, colligations_freqs_files, colligations_stats_files):
         col_text_test_stat = self.main.settings_global['tests_statistical_significance'][test_statistical_significance]['col_text']
         col_text_effect_size = self.main.settings_global['measures_effect_size'][measure_effect_size]['col_text']
 
+        self.clr_table()
+        self.model().setRowCount(len(colligations_freqs_files))
+
         # Insert columns
         files = list(self.main.wl_file_area.get_selected_files())
         files_with_total = files + [{'name': self.tr('Total')}]
@@ -723,8 +724,6 @@ def update_gui_table(self, err_msg, colligations_freqs_files, colligations_stats_files):
         freq_totals = numpy.array(list(colligations_freqs_files.values())).sum(axis = 2).sum(axis = 0)
         len_files = len(files)
 
-        self.model().setRowCount(len(colligations_freqs_files))
-
         self.disable_updates()
 
         for i, ((node, collocate), stats_files) in enumerate(wl_sorting.sorted_stats_files_items(colligations_stats_files)):
diff --git a/wordless/wl_collocation_extractor.py b/wordless/wl_collocation_extractor.py
index fbe6c84d0..07ae5d2d4 100644
--- a/wordless/wl_collocation_extractor.py
+++ b/wordless/wl_collocation_extractor.py
@@ -500,7 +500,7 @@ def generation_settings_changed(self):
         settings['measure_bayes_factor'] = self.combo_box_measure_bayes_factor.get_measure()
         settings['measure_effect_size'] = self.combo_box_measure_effect_size.get_measure()
 
-        # Use Data
+        # Use data
         self.combo_box_use_data.measures_changed()
 
     def table_settings_changed(self):
@@ -575,9 +575,7 @@ def update_gui_table(self, err_msg, collocations_freqs_files, collocations_stats_files):
     try:
         self.settings = copy.deepcopy(self.main.settings_custom)
 
-        self.clr_table()
-
-        settings = self.main.settings_custom['collocation_extractor']
+        settings = self.settings['collocation_extractor']
 
         test_statistical_significance = settings['generation_settings']['test_statistical_significance']
         measure_bayes_factor = settings['generation_settings']['measure_bayes_factor']
@@ -586,6 +584,9 @@ def update_gui_table(self, err_msg, collocations_freqs_files, collocations_stats_files):
         col_text_test_stat = self.main.settings_global['tests_statistical_significance'][test_statistical_significance]['col_text']
         col_text_effect_size = self.main.settings_global['measures_effect_size'][measure_effect_size]['col_text']
 
+        self.clr_table()
+        self.model().setRowCount(len(collocations_freqs_files))
+
         # Insert columns
         files = list(self.main.wl_file_area.get_selected_files())
         files_with_total = files + [{'name': self.tr('Total')}]
@@ -725,7 +726,6 @@ def update_gui_table(self, err_msg, collocations_freqs_files, collocations_stats_files):
         freq_totals = numpy.array(list(collocations_freqs_files.values())).sum(axis = 2).sum(axis = 0)
         len_files = len(files)
 
-        self.model().setRowCount(len(collocations_freqs_files))
         self.disable_updates()
 
         for i, ((node, collocate), stats_files) in enumerate(wl_sorting.sorted_stats_files_items(collocations_stats_files)):
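The two extractor diffs above (and the concordancer and dependency-parser diffs below) reorder table population the same way: reserve every row once via `setRowCount()` right after clearing the table, then fill cells in place rather than growing the model row by row. A rough standalone sketch of that Qt pattern, using a plain `QStandardItemModel` and `QTableView` (Wordless wraps the equivalent steps in its own `clr_table()` and `disable_updates()` helpers):

```python
import sys

from PyQt5.QtGui import QStandardItem, QStandardItemModel
from PyQt5.QtWidgets import QApplication, QTableView

app = QApplication(sys.argv)

model = QStandardItemModel()
view = QTableView()
view.setModel(model)

# Illustrative result set
results = [(f'token_{i}', i) for i in range(10000)]

model.setColumnCount(2)
# Reserve all rows up front, so the view is not asked to
# relayout once per appended row
model.setRowCount(len(results))

# Suspend repaints while cells are filled in place
view.setUpdatesEnabled(False)

for i, (token, freq) in enumerate(results):
    model.setItem(i, 0, QStandardItem(token))
    model.setItem(i, 1, QStandardItem(str(freq)))

view.setUpdatesEnabled(True)
```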
diff --git a/wordless/wl_concordancer.py b/wordless/wl_concordancer.py
index 9e7a4ca21..7e1dccacd 100644
--- a/wordless/wl_concordancer.py
+++ b/wordless/wl_concordancer.py
@@ -426,7 +426,7 @@ def generation_settings_changed(self):
         settings['context_len_right_para'] = self.spin_box_context_len_right_para.value()
         settings['context_len_unit'] = self.combo_box_context_len_unit.currentText()
 
-        # Width Unit
+        # Unit of context length
        if settings['context_len_unit'] == self.tr('Character'):
            self.stacked_widget_context_len_left.setCurrentIndex(0)
            self.stacked_widget_context_len_right.setCurrentIndex(0)
diff --git a/wordless/wl_concordancer_parallel.py b/wordless/wl_concordancer_parallel.py
index 19ddbbe0b..c8c242967 100644
--- a/wordless/wl_concordancer_parallel.py
+++ b/wordless/wl_concordancer_parallel.py
@@ -308,14 +308,15 @@ def update_gui_table(self, err_msg, concordance_lines):
         self.settings = copy.deepcopy(self.main.settings_custom)
 
         self.clr_table(0)
+        self.model().setRowCount(len(concordance_lines))
+
         # Insert columns
         for file_name in self.main.wl_file_area.get_selected_file_names():
             self.ins_header_hor(
                 self.model().columnCount(),
                 file_name
             )
 
-        self.model().setRowCount(len(concordance_lines))
         self.disable_updates()
 
         for i, concordance_line in enumerate(concordance_lines):
diff --git a/wordless/wl_dependency_parser.py b/wordless/wl_dependency_parser.py
index 5f1b075af..8d9e66391 100644
--- a/wordless/wl_dependency_parser.py
+++ b/wordless/wl_dependency_parser.py
@@ -399,8 +399,8 @@ def update_gui_table(self, err_msg, results):
         self.settings = copy.deepcopy(self.main.settings_custom)
 
         self.clr_table(0)
-
         self.model().setRowCount(len(results))
+
         self.disable_updates()
 
         for i, (
diff --git a/wordless/wl_figs/wl_figs.py b/wordless/wl_figs/wl_figs.py
index 83c0c1c95..23d0ffb52 100644
--- a/wordless/wl_figs/wl_figs.py
+++ b/wordless/wl_figs/wl_figs.py
@@ -61,7 +61,10 @@ def generate_line_chart(
     vals = numpy.array([vals for item, vals in data_files_items])
 
     # Frequency data
-    if fig_settings['use_data'] == _tr('wl_figs', 'Frequency') or re.search(_tr('wl_figs', r'^[LR][1-9][0-9]*$'), fig_settings['use_data']):
+    if (
+        fig_settings['use_data'] == _tr('wl_figs', 'Frequency')
+        or re.search(_tr('wl_figs', r'^[LR][1-9][0-9]*$'), fig_settings['use_data'])
+    ):
         if fig_settings['use_cumulative']:
             vals = numpy.cumsum(vals, axis = 0)
diff --git a/wordless/wl_keyword_extractor.py b/wordless/wl_keyword_extractor.py
index a89032472..18fae2aa5 100644
--- a/wordless/wl_keyword_extractor.py
+++ b/wordless/wl_keyword_extractor.py
@@ -317,7 +317,7 @@ def generation_settings_changed(self):
         settings['measure_bayes_factor'] = self.combo_box_measure_bayes_factor.get_measure()
         settings['measure_effect_size'] = self.combo_box_measure_effect_size.get_measure()
 
-        # Use Data
+        # Use data
         self.combo_box_use_data.measures_changed()
 
     def table_settings_changed(self):
@@ -432,7 +432,7 @@ def update_gui_table(self, err_msg, keywords_freq_files, keywords_stats_files):
     try:
         self.settings = copy.deepcopy(self.main.settings_custom)
 
-        settings = self.main.settings_custom['keyword_extractor']
+        settings = self.settings['keyword_extractor']
 
         files_observed = list(self.main.wl_file_area.get_selected_files())
 
         test_statistical_significance = settings['generation_settings']['test_statistical_significance']
@@ -443,6 +443,7 @@ def update_gui_table(self, err_msg, keywords_freq_files, keywords_stats_files):
         col_text_effect_size = self.main.settings_global['measures_effect_size'][measure_effect_size]['col_text']
 
         self.clr_table()
+        self.model().setRowCount(len(keywords_freq_files))
 
         # Insert columns
         self.ins_header_hor(
@@ -548,7 +549,6 @@ def update_gui_table(self, err_msg, keywords_freq_files, keywords_stats_files):
         freq_totals = numpy.array(list(keywords_freq_files.values())).sum(axis = 0)
         len_files_observed = len(files_observed)
 
-        self.model().setRowCount(len(keywords_freq_files))
         self.disable_updates()
 
         for i, (keyword, stats_files) in enumerate(wl_sorting.sorted_stats_files_items(keywords_stats_files)):
diff --git a/wordless/wl_measures/wl_measures_readability.py b/wordless/wl_measures/wl_measures_readability.py
index 372feaabc..0323f8cd1 100644
--- a/wordless/wl_measures/wl_measures_readability.py
+++ b/wordless/wl_measures/wl_measures_readability.py
@@ -1083,20 +1083,21 @@ def nws(main, text):
 # References:
 #     https://github.com/drelhaj/OsmanReadability/blob/master/src/org/project/osman/process/Syllables.java
 #     https://github.com/textstat/textstat/blob/9bf37414407bcaaa45c498478ee383c8738e5d0c/textstat/textstat.py#L569
+RE_STRESS = re.compile(r'[\u064B\u064C\u064D\u0651]')
+RE_SHORT = re.compile(r'[\u0627\u0649\?\.\!\,\s]')
+
 def _get_num_syls_ara(word):
     count_short = 0
     count_long = 0
 
-    # Tashkeel: fatha, damma, kasra
-    tashkeel = ['\u064E', '\u064F', '\u0650']
-
     for i, char in enumerate(word):
-        if char not in tashkeel:
+        # Tashkeel: fatha, damma, kasra
+        if char not in ('\u064E', '\u064F', '\u0650'):
             continue
 
         # Only if a character is a tashkeel, has a successor, and is followed by an alef, waw, or yeh
         if i + 1 < len(word):
-            if word[i + 1] in ['\u0627', '\u0648', '\u064A']:
+            if word[i + 1] in ('\u0627', '\u0648', '\u064A'):
                 count_long += 1
             else:
                 count_short += 1
@@ -1104,10 +1105,10 @@ def _get_num_syls_ara(word):
             count_short += 1
 
     # Stress syllables: tanween fatha, tanween damma, tanween kasra, shadda
-    count_stress = len(re.findall(r'[\u064B\u064C\u064D\u0651]', word))
+    count_stress = len(RE_STRESS.findall(word))
 
     if count_short == 0:
-        word = re.sub(r'[\u0627\u0649\?\.\!\,\s]', '', word)
+        word = RE_SHORT.sub('', word)
         count_short = max(0, len(word) - 2)
 
     # Reference: https://github.com/drelhaj/OsmanReadability/blob/405b927ef3fde200fa08efe12ec2f39b8716e4be/src/org/project/osman/process/OsmanReadability.java#L259
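To make the tashkeel logic in `_get_num_syls_ara()` concrete, here is the core loop run on one illustrative word (not part of the patch): in كِتَاب (kitāb), the kasra is followed by a consonant, so it counts as a short vowel, while the fatha is followed by alef, so it counts as a long one.

```python
# Illustrative only: mirrors the loop in _get_num_syls_ara() on one word
word = 'كِتَاب'  # kaf, kasra, ta, fatha, alef, ba

count_short = count_long = 0

for i, char in enumerate(word):
    # Tashkeel: fatha, damma, kasra
    if char not in ('\u064E', '\u064F', '\u0650'):
        continue

    # A tashkeel followed by alef, waw, or yeh marks a long vowel
    if i + 1 < len(word) and word[i + 1] in ('\u0627', '\u0648', '\u064A'):
        count_long += 1
    else:
        count_short += 1

print(count_short, count_long)  # 1 1 -> two syllables: ki-tab
```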
diff --git a/wordless/wl_ngram_generator.py b/wordless/wl_ngram_generator.py
index 5e068516a..6d588055b 100644
--- a/wordless/wl_ngram_generator.py
+++ b/wordless/wl_ngram_generator.py
@@ -504,7 +504,7 @@ def generation_settings_changed(self):
         settings['measure_dispersion'] = self.combo_box_measure_dispersion.get_measure()
         settings['measure_adjusted_freq'] = self.combo_box_measure_adjusted_freq.get_measure()
 
-        # Keyword Position
+        # Search term position
         if self.spin_box_search_term_position_max.value() == self.spin_box_search_term_position_max.maximum():
             self.spin_box_search_term_position_min.setMaximum(settings['ngram_size_max'])
             self.spin_box_search_term_position_max.setMaximum(settings['ngram_size_max'])
@@ -520,7 +520,7 @@ def generation_settings_changed(self):
         else:
             self.spin_box_allow_skipped_tokens.setEnabled(False)
 
-        # Use Data
+        # Use data
         self.combo_box_use_data.measures_changed()
 
     def table_settings_changed(self):
@@ -590,7 +590,7 @@ def update_gui_table(self, err_msg, ngrams_freq_files, ngrams_stats_files):
     try:
         self.settings = copy.deepcopy(self.main.settings_custom)
 
-        settings = self.main.settings_custom['ngram_generator']
+        settings = self.settings['ngram_generator']
 
         measure_dispersion = settings['generation_settings']['measure_dispersion']
         measure_adjusted_freq = settings['generation_settings']['measure_adjusted_freq']
@@ -599,6 +599,7 @@ def update_gui_table(self, err_msg, ngrams_freq_files, ngrams_stats_files):
         col_text_adjusted_freq = self.main.settings_global['measures_adjusted_freq'][measure_adjusted_freq]['col_text']
 
         self.clr_table()
+        self.model().setRowCount(len(ngrams_freq_files))
 
         # Insert columns
         files = list(self.main.wl_file_area.get_selected_files())
@@ -658,7 +659,6 @@ def update_gui_table(self, err_msg, ngrams_freq_files, ngrams_stats_files):
         freq_totals = numpy.array(list(ngrams_freq_files.values())).sum(axis = 0)
         len_files = len(files)
 
-        self.model().setRowCount(len(ngrams_freq_files))
         self.disable_updates()
 
         for i, (ngram, freq_files) in enumerate(wl_sorting.sorted_freq_files_items(ngrams_freq_files)):
diff --git a/wordless/wl_nlp/wl_sentence_tokenization.py b/wordless/wl_nlp/wl_sentence_tokenization.py
index 870864298..b1b63c800 100644
--- a/wordless/wl_nlp/wl_sentence_tokenization.py
+++ b/wordless/wl_nlp/wl_sentence_tokenization.py
@@ -207,9 +207,11 @@ def wl_sentence_tokenize(main, text, lang, sentence_tokenizer = 'default'):
 ])))
 
 def wl_sentence_split(main, text, terminators = SENTENCE_TERMINATORS):
+    re_terminators = re.compile(fr'.+?[{terminators}]+\s|.+?$')
+
     return [
         sentence.strip()
-        for sentence in re.findall(fr'.+?[{terminators}]+\s|.+?$', text.strip())
+        for sentence in re_terminators.findall(text.strip())
     ]
 
 # Reference: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Terminal_Punctuation=Yes:]
@@ -290,9 +292,11 @@ def wl_sentence_split(main, text, terminators = SENTENCE_TERMINATORS):
 ])))
 
 def wl_sentence_seg_tokenize(main, text, terminators = SENTENCE_SEG_TERMINATORS):
+    re_terminators = re.compile(fr'.+?[{terminators}]+|.+?$')
+
     return [
         sentence_seg.strip()
-        for sentence_seg in re.findall(fr'.+?[{terminators}]+|.+?$', text.strip())
+        for sentence_seg in re_terminators.findall(text.strip())
     ]
 
 REPLACEMENT_CHAR = '\uFFFF'
 
@@ -300,8 +304,9 @@ def wl_sentence_seg_tokenize(main, text, terminators = SENTENCE_SEG_TERMINATORS):
 def wl_sentence_seg_tokenize_tokens(main, tokens, terminators = SENTENCE_SEG_TERMINATORS):
     # Insert a replacement character between tokens to prevent text from being split within tokens
     text = REPLACEMENT_CHAR.join(tokens)
+    re_terminators = re.compile(fr'.+?[{terminators}]+{REPLACEMENT_CHAR}|.+?$')
 
     return [
         wl_texts.clean_texts(sentence_seg.split(REPLACEMENT_CHAR))
-        for sentence_seg in re.findall(fr'.+?[{terminators}]+{REPLACEMENT_CHAR}|.+?$', text.strip())
+        for sentence_seg in re_terminators.findall(text.strip())
     ]
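For reference, the terminator pattern now precompiled in `wl_sentence_split()` behaves like this on a toy terminator set (the real `SENTENCE_TERMINATORS` is the long Unicode list whose closing `])))` appears in the hunk context above):

```python
import re

# Toy terminator set standing in for SENTENCE_TERMINATORS
terminators = '.!?'
re_terminators = re.compile(fr'.+?[{terminators}]+\s|.+?$')

text = 'Hello world! How are you? Fine.'

# Each match is either a minimal run ending in terminators plus a space,
# or the remainder of the text
sentences = [sentence.strip() for sentence in re_terminators.findall(text.strip())]
print(sentences)  # ['Hello world!', 'How are you?', 'Fine.']
```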
diff --git a/wordless/wl_nlp/wl_texts.py b/wordless/wl_nlp/wl_texts.py
index 0c914b0a9..cf3e73f2c 100644
--- a/wordless/wl_nlp/wl_texts.py
+++ b/wordless/wl_nlp/wl_texts.py
@@ -232,7 +232,7 @@ def __init__(self, main, file):
         # Untokenized & Tagged
         elif not self.tokenized and self.tagged:
             # Replace all tags with a whitespace character to ensure that no words run together
-            text_no_tags = re.sub(re_tags, ' ', text)
+            text_no_tags = re_tags.sub(' ', text)
             # Remove redundant whitespace characters so that sentences are split correctly
             text_no_tags = re.sub(r'\s{2}', ' ', text_no_tags)
 
@@ -244,7 +244,7 @@ def __init__(self, main, file):
             text = self.check_tags_text_start(text)
             i_tag_end = 0
 
-            for tag in re.finditer(re_tags, text):
+            for tag in re_tags.finditer(text):
                 tags_tokens = self.add_tags_tokenization(text[i_tag_end:tag.start()], tags_tokens)
                 tags_tokens[-1].append(tag.group())
 
@@ -280,7 +280,7 @@ def __init__(self, main, file):
                 if para:
                     # Replace all tags with a whitespace to ensure no words run together
-                    text_no_tags = re.sub(re_tags, ' ', para)
+                    text_no_tags = re_tags.sub(' ', para)
 
                     for sentence in wl_sentence_tokenization.wl_sentence_split(self.main, text_no_tags):
                         self.tokens_multilevel[-1].append([])
@@ -294,7 +294,7 @@ def __init__(self, main, file):
                     # Extract tags
                     i_tag_end = 0
 
-                    for tag in re.finditer(re_tags, para):
+                    for tag in re_tags.finditer(para):
                         tags_tokens = self.add_tags_splitting(para[i_tag_end:tag.start()], tags_tokens)
                         tags_tokens[-1].append(tag.group())
 
@@ -364,7 +364,7 @@ def __init__(self, main, file):
             for sentence in para:
                 for i, sentence_seg in enumerate(sentence):
                     sentence[i] = [
-                        re.sub(RE_VIE_TOKENIZED, ' ', token)
+                        RE_VIE_TOKENIZED.sub(' ', token)
                         for token in sentence_seg
                     ]
 
@@ -400,7 +400,7 @@ def check_tags_text_start(self, text):
         re_tag_text_start = re.compile(fr"\s*({wl_matching.get_re_tags(self.main, tag_type = 'body')})")
         self.tags_text_start = []
 
-        while (re_result := re.match(re_tag_text_start, text)):
+        while (re_result := re_tag_text_start.match(text)):
             tag = re_result.group()
             self.tags_text_start.append(tag)
 
@@ -613,7 +613,7 @@ def __init__(self, main, file): # pylint: disable=super-init-not-called
         # Untokenized & Tagged
         elif not self.tokenized and self.tagged:
             # Replace all tags with a whitespace to ensure no words run together
-            text_no_tags = re.sub(re_tags, ' ', text)
+            text_no_tags = re_tags.sub(' ', text)
 
             tokens = wl_word_tokenization.wl_word_tokenize_flat(self.main, text_no_tags, lang = self.lang)
             self.tokens_multilevel[0][0][0].extend(tokens)
@@ -623,7 +623,7 @@ def __init__(self, main, file): # pylint: disable=super-init-not-called
         # Tokenized & Tagged
         elif self.tokenized and self.tagged:
             # Replace all tags with a whitespace to ensure no words run together
-            text_no_tags = re.sub(re_tags, ' ', text)
+            text_no_tags = re_tags.sub(' ', text)
 
             self.tokens_multilevel[0][0][0].extend(text_no_tags.split())
 
         elif file_ext == '.xml' and self.tagged:
@@ -658,7 +658,7 @@ def __init__(self, main, file): # pylint: disable=super-init-not-called
             for sentence in para:
                 for i, sentence_seg in enumerate(sentence):
                     sentence[i] = [
-                        re.sub(RE_VIE_TOKENIZED, ' ', token)
+                        RE_VIE_TOKENIZED.sub(' ', token)
                        for token in sentence_seg
                    ]
diff --git a/wordless/wl_nlp/wl_token_processing.py b/wordless/wl_nlp/wl_token_processing.py
index 9647896c6..4614f1845 100644
--- a/wordless/wl_nlp/wl_token_processing.py
+++ b/wordless/wl_nlp/wl_token_processing.py
@@ -280,8 +280,8 @@ def wl_process_tokens_profiler(main, text, token_settings, tab):
     return text_modified
 
 def wl_process_tokens_wordlist_generator(main, text, token_settings, generation_settings):
-    # Syllabification
-    if generation_settings['syllabification']:
+    # Show syllabified forms
+    if generation_settings['show_syllabified_forms']:
         text_syl_tokenize(main, text)
 
     text_modified = wl_process_tokens_ngram_generator(main, text, token_settings)
diff --git a/wordless/wl_nlp/wl_word_tokenization.py b/wordless/wl_nlp/wl_word_tokenization.py
index 5caf340fd..383c56dbd 100644
--- a/wordless/wl_nlp/wl_word_tokenization.py
+++ b/wordless/wl_nlp/wl_word_tokenization.py
@@ -28,6 +28,9 @@
 from wordless.wl_nlp import wl_nlp_utils, wl_sentence_tokenization, wl_texts
 from wordless.wl_utils import wl_conversion, wl_misc
 
+RE_CHAR_HAN_OTHER = re.compile(r'^h+|^o+')
+RE_CHAR_HAN_KANJI_OTHER = re.compile(r'^h+|^k+|^o+')
+
 def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'):
     tokens_multilevel = []
 
@@ -126,7 +129,7 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'):
                         char_scripts += 'o'
 
                 while sentence:
-                    len_token = len(re.search(r'^h+|^o+', char_scripts).group())
+                    len_token = len(RE_CHAR_HAN_OTHER.search(char_scripts).group())
                     token = sentence[:len_token]
 
                     if char_scripts.startswith('h'):
@@ -174,7 +177,7 @@ def wl_word_tokenize(main, text, lang, word_tokenizer = 'default'):
                         char_scripts += 'o'
 
                 while sentence:
-                    len_token = len(re.search(r'^h+|^k+|^o+', char_scripts).group())
+                    len_token = len(RE_CHAR_HAN_KANJI_OTHER.search(char_scripts).group())
                     token = sentence[:len_token]
 
                     if char_scripts.startswith('h'):
diff --git a/wordless/wl_results/wl_results_filter.py b/wordless/wl_results/wl_results_filter.py
index 325886fba..eaa48392d 100644
--- a/wordless/wl_results/wl_results_filter.py
+++ b/wordless/wl_results/wl_results_filter.py
@@ -388,9 +388,9 @@ def __init__(self, main, table):
             self.col_text_adjusted_freq = self.main.settings_global['measures_adjusted_freq'][measure_adjusted_freq]['col_text']
 
         if self.tab == 'wordlist_generator':
-            self.has_syllabification = settings['generation_settings']['syllabification']
+            self.has_syllabified_forms = settings['generation_settings']['show_syllabified_forms']
         else:
-            self.has_syllabification = True
+            self.has_syllabified_forms = True
 
         self.has_dispersion = measure_dispersion != 'none'
         self.has_adjusted_freq = measure_adjusted_freq != 'none'
@@ -409,7 +409,7 @@ def __init__(self, main, table):
             settings = self.settings,
             filter_name = f'len_{self.type_node}'
         ))
 
-        if self.tab == 'wordlist_generator' and settings['generation_settings']['syllabification']:
+        if self.tab == 'wordlist_generator' and settings['generation_settings']['show_syllabified_forms']:
             self.layouts_filters.append(widgets_filter(
                 self,
                 label = self.tr('Number of syllables:'),
@@ -474,8 +474,8 @@ def run(self):
         if self.dialog.tab == 'wordlist_generator':
             col_node = self.dialog.table.find_header_hor(self.tr('Token'))
 
-            if self.dialog.has_syllabification:
-                col_num_syls = self.dialog.table.find_header_hor(self.tr('Syllabification'))
+            if self.dialog.has_syllabified_forms:
+                col_num_syls = self.dialog.table.find_header_hor(self.tr('Syllabified Form'))
         elif self.dialog.tab == 'ngram_generator':
             col_node = self.dialog.table.find_header_hor(self.tr('N-gram'))
@@ -543,11 +543,11 @@ def run(self):
                     if self.dialog.table.model().item(i, col_freq).val > 0:
                         filters.append(len_node_min <= len_node <= len_node_max)
 
-                    if self.dialog.tab == 'wordlist_generator' and self.dialog.has_syllabification:
+                    if self.dialog.tab == 'wordlist_generator' and self.dialog.has_syllabified_forms:
                         filter_num_syls = False
-                        syllabification = self.dialog.table.model().item(i, col_num_syls).text()
+                        syllabified_form = self.dialog.table.model().item(i, col_num_syls).text()
 
-                        for syls in syllabification.split(', '):
+                        for syls in syllabified_form.split(', '):
                             if num_syls_min <= len(syls.split('-')) <= num_syls_max:
                                 filter_num_syls = True
diff --git a/wordless/wl_results/wl_results_sort.py b/wordless/wl_results/wl_results_sort.py
index 3d43112aa..968756399 100644
--- a/wordless/wl_results/wl_results_sort.py
+++ b/wordless/wl_results/wl_results_sort.py
@@ -40,6 +40,9 @@
 
 _tr = QCoreApplication.translate
 
+RE_SORTING_COL_L = re.compile(_tr('Wl_Dialog_Results_Sort_Concordancer', r'^L[1-9][0-9]*$'))
+RE_SORTING_COL_R = re.compile(_tr('Wl_Dialog_Results_Sort_Concordancer', r'^R[1-9][0-9]*$'))
+
 class Wl_Dialog_Results_Sort_Concordancer(wl_dialogs.Wl_Dialog):
     def __init__(self, main, table):
         super().__init__(
@@ -133,9 +136,6 @@ def update_gui(self, results):
             results[i][2] = right_new
 
         # Sort results
-        re_sorting_col_l = re.compile(self.tr(r'^L[1-9][0-9]*$'))
-        re_sorting_col_r = re.compile(self.tr(r'^R[1-9][0-9]*$'))
-
         for sorting_col, sorting_order in reversed(self.settings['sorting_rules']):
             reverse = 0 if sorting_order == self.tr('Ascending') else 1
 
@@ -151,9 +151,9 @@ def update_gui(self, results):
             else:
                 span = int(sorting_col[1:])
 
-            if re_sorting_col_l.search(sorting_col):
+            if RE_SORTING_COL_L.search(sorting_col):
                 results.sort(key = lambda item, span = span: item[0].tokens_raw[-span], reverse = reverse)
-            elif re_sorting_col_r.search(sorting_col):
+            elif RE_SORTING_COL_R.search(sorting_col):
                 results.sort(key = lambda item, span = span: item[2].tokens_raw[span - 1], reverse = reverse)
 
         # Clear highlights before sorting the results
@@ -193,7 +193,7 @@ def update_gui(self, results):
             i_highlight_color_right = 1
 
             for sorting_col, _ in self.settings['sorting_rules']:
-                if re_sorting_col_l.search(sorting_col) and int(sorting_col[1:]) <= len(text_left):
+                if RE_SORTING_COL_L.search(sorting_col) and int(sorting_col[1:]) <= len(text_left):
                     hightlight_color = highlight_colors[i_highlight_color_left % len(highlight_colors)]
 
                     text_left[-int(sorting_col[1:])] = f'''
@@ -203,7 +203,7 @@ def update_gui(self, results):
                     '''
 
                     i_highlight_color_left += 1
-                elif re_sorting_col_r.search(sorting_col) and int(sorting_col[1:]) - 1 < len(text_right):
+                elif RE_SORTING_COL_R.search(sorting_col) and int(sorting_col[1:]) - 1 < len(text_right):
                     hightlight_color = highlight_colors[i_highlight_color_right % len(highlight_colors)]
 
                     text_right[int(sorting_col[1:]) - 1] = f'''
@@ -382,7 +382,7 @@ def max_left(self):
             max_left = max((
                 int(col[1:])
                 for col in self.cols_to_sort
-                if re.search(self.tr(r'^L[0-9]+$'), col)
+                if RE_SORTING_COL_L.search(col)
             ))
         else:
             max_left = 0
@@ -394,7 +394,7 @@ def max_right(self):
             max_right = max((
                 int(col[1:])
                 for col in self.cols_to_sort
-                if re.search(self.tr(r'^R[0-9]+$'), col)
+                if RE_SORTING_COL_R.search(col)
             ))
         else:
             max_right = 0
@@ -418,12 +418,12 @@ def _add_row(self, row = None, texts = None):
         cols_left = sorted([
             int(self.model().item(i, 0).text()[1:])
             for i in range(self.model().rowCount())
-            if re.search(self.tr(r'^L[0-9]+$'), self.model().item(i, 0).text())
+            if RE_SORTING_COL_L.search(self.model().item(i, 0).text())
         ])
         cols_right = sorted([
             int(self.model().item(i, 0).text()[1:])
             for i in range(self.model().rowCount())
-            if re.search(self.tr(r'^R[0-9]+$'), self.model().item(i, 0).text())
+            if RE_SORTING_COL_R.search(self.model().item(i, 0).text())
         ])
 
         if sorting_col:
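Worth noting: the module-level patterns use `[1-9][0-9]*` where the old inline checks in `max_left()`, `max_right()`, and `_add_row()` used `[0-9]+`, so consolidating on the two compiled patterns also rejects a zero span such as `L0`. An illustration, dropping the `_tr()` translation wrapper for brevity:

```python
import re

RE_SORTING_COL_L = re.compile(r'^L[1-9][0-9]*$')

print(bool(RE_SORTING_COL_L.search('L3')))  # True
print(bool(RE_SORTING_COL_L.search('L0')))  # False: zero span rejected
print(bool(RE_SORTING_COL_L.search('R2')))  # False: right-context column
```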
diff --git a/wordless/wl_settings/wl_settings_default.py b/wordless/wl_settings/wl_settings_default.py
index 313f0ef9a..6dcfce899 100644
--- a/wordless/wl_settings/wl_settings_default.py
+++ b/wordless/wl_settings/wl_settings_default.py
@@ -490,7 +490,8 @@ def init_settings_default(main):
             },
 
             'generation_settings': {
-                'syllabification': True,
+                'show_syllabified_forms': True,
+
                 'measure_dispersion': 'juillands_d',
                 'measure_adjusted_freq': 'juillands_u'
             },
diff --git a/wordless/wl_settings/wl_settings_files.py b/wordless/wl_settings/wl_settings_files.py
index 8bb980e57..9c34bf705 100644
--- a/wordless/wl_settings/wl_settings_files.py
+++ b/wordless/wl_settings/wl_settings_files.py
@@ -243,6 +243,11 @@ def apply_settings(self):
         return True
 
 # self.tr() does not work in inherited classes
+RE_TAG_EMBEDDED = re.compile(r'^([^\w\s]|_)+\S*$')
+RE_TAG_NON_EMBEDDED = re.compile(r'^([^\w\s]|_)+\S*([^\w\s]|_)+$')
+RE_TAG_HTML_BRACKETS = re.compile(r'(^<)|(>$)')
+RE_TAG_HTML_PARENTHESES = re.compile(r'\s\((\d+)\)')
+
 class Wl_Table_Tags(wl_tables.Wl_Table_Add_Ins_Del_Clr):
     def __init__(self, parent, settings_tags, defaults_row):
         super().__init__(
@@ -283,12 +288,12 @@ def item_changed(self, item): # pylint: disable=arguments-differ
         # Opening Tag
         if self.model().item(row, 0).text() == _tr('Wl_Table_Tags', 'Embedded'):
-            re_validation = re.search(r'^([^\w\s]|_)+\S*$', item_opening_tag.text())
+            re_validation = RE_TAG_EMBEDDED.search(item_opening_tag.text())
 
             warning_text = _tr('Wl_Table_Tags', '''