Work Area: Update Wordlist Generator - Show syllabified forms
BLKSerene committed Jan 5, 2025
1 parent be676ec commit 5184838
Showing 24 changed files with 180 additions and 149 deletions.
6 changes: 3 additions & 3 deletions tests/test_wordlist_generator.py
@@ -61,7 +61,7 @@ def test_wordlist_generator():
update_gui = update_gui
).run()

-def update_gui(err_msg, tokens_freq_files, tokens_stats_files, tokens_syllabification):
+def update_gui(err_msg, tokens_freq_files, tokens_stats_files, tokens_syllabified_form):
print(err_msg)
assert not err_msg

@@ -74,8 +74,8 @@ def update_gui(err_msg, tokens_freq_files, tokens_stats_files, tokens_syllabific

# Token
assert token
-# Syllabification
-assert tokens_syllabification[token]
+# Syllabified Form
+assert tokens_syllabified_form[token]
# Frequency
assert len(freq_files) == num_files_selected + 1
# Dispersion & Adjusted Frequency
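Note: the renamed mapping pairs each token with its syllabified form, which is what the test now asserts to be non-empty. A minimal sketch of the assumed shape (the sample words and hyphen delimiter are hypothetical; the actual delimiter depends on the syllable tokenizer):

# Hypothetical shape of tokens_syllabified_form: one syllabified form
# per token, so the test can require a non-empty entry for every token.
tokens_syllabified_form = {
    'linguistics': 'lin-guis-tics',
    'corpus': 'cor-pus'
}

for token, syllabified_form in tokens_syllabified_form.items():
    assert syllabified_form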
3 changes: 1 addition & 2 deletions tests/tests_file_area/test_file_area_file_types.py
Original file line number Diff line number Diff line change
@@ -18,7 +18,6 @@

import glob
import os
-import re
import time

from PyQt5.QtCore import QObject
@@ -289,7 +288,7 @@ def update_gui_misc(err_msg, new_files):
for sentence in para:
for sentence_seg in sentence:
for token in sentence_seg:
-assert not re.search(wl_texts.RE_VIE_TOKENIZED, token)
+assert not wl_texts.RE_VIE_TOKENIZED.search(token)

if __name__ == '__main__':
test_file_area_file_types()
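Note: switching from re.search(wl_texts.RE_VIE_TOKENIZED, token) to wl_texts.RE_VIE_TOKENIZED.search(token) implies the constant now holds a pattern compiled once in wl_texts rather than a raw string, which is also why the local import re is dropped. A minimal sketch of the idiom (the pattern below is a hypothetical stand-in, not the real RE_VIE_TOKENIZED):

import re

# Compiling once at module level avoids re-parsing the pattern on every
# call; call sites then use the bound .search() method directly.
RE_VIE_TOKENIZED = re.compile(r'_')  # hypothetical stand-in pattern

assert RE_VIE_TOKENIZED.search('ngôn_ngữ')          # underscore-joined syllables
assert RE_VIE_TOKENIZED.search('xin chào') is None  # untokenized text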
98 changes: 53 additions & 45 deletions tests/tests_nlp/test_matching.py
Original file line number Diff line number Diff line change
@@ -73,31 +73,35 @@ def test_get_re_tags():
assert re_tags_body == r'_\S*(?=\s|$)|/\S*(?=\s|$)|_(?=\s|$)|</?.*?>|</?\ \*\ >|</?\ T\ AG\ >|</?>|</?\ >'
assert re_tags_xml == r'</?p>|</?s>|</?w>|</?c>|</?\ p\ p\ >|</?>|</?\ >'

-assert re.search(re_tags_header, r'token<teiHeader>').group() == '<teiHeader>'
-assert re.search(re_tags_header, r'</teiHeader>token').group() == '</teiHeader>'
-assert re.search(re_tags_header, r'<teiHeader>token</teiHeader>').group() == '<teiHeader>'
-assert re.search(re_tags_header, r'< tei Header >token</ tei Header >').group() == '< tei Header >'
-assert re.search(re_tags_header, r'<>token</>').group() == '<>'
-assert re.search(re_tags_header, r'< >token</ >').group() == '< >'
-
-assert re.search(re_tags_body, r'token_TAG').group() == '_TAG'
-assert re.search(re_tags_body, r'token_T_AG').group() == '_T_AG'
-assert re.search(re_tags_body, r'token_').group() == '_'
-assert re.search(re_tags_body, r'token/TAG').group() == '/TAG'
-assert re.search(re_tags_body, r'token<TAG>').group() == '<TAG>'
-assert re.search(re_tags_body, r'</TAG>token').group() == '</TAG>'
-assert re.search(re_tags_body, r'< T AG >token</ T AG >').group() == '< T AG >'
-assert re.search(re_tags_body, r'<TAG>token</TAG>').group() == '<TAG>'
-assert re.search(re_tags_body, r'<>token</>').group() == '<>'
-assert re.search(re_tags_body, r'< >token</ >').group() == '< >'
-assert re.search(re_tags_body, r'< * >token</ * >').group() == '< * >'
-
-assert re.search(re_tags_xml, r'token<p>').group() == '<p>'
-assert re.search(re_tags_xml, r'</p>token').group() == '</p>'
-assert re.search(re_tags_xml, r'<p>token</p>').group() == '<p>'
-assert re.search(re_tags_xml, r'< p p >token</ p p >').group() == '< p p >'
-assert re.search(re_tags_xml, r'<>token</>').group() == '<>'
-assert re.search(re_tags_xml, r'< >token</ >').group() == '< >'
+re_tags_header = re.compile(re_tags_header)
+re_tags_body = re.compile(re_tags_body)
+re_tags_xml = re.compile(re_tags_xml)
+
+assert re_tags_header.search(r'token<teiHeader>').group() == '<teiHeader>'
+assert re_tags_header.search(r'</teiHeader>token').group() == '</teiHeader>'
+assert re_tags_header.search(r'<teiHeader>token</teiHeader>').group() == '<teiHeader>'
+assert re_tags_header.search(r'< tei Header >token</ tei Header >').group() == '< tei Header >'
+assert re_tags_header.search(r'<>token</>').group() == '<>'
+assert re_tags_header.search(r'< >token</ >').group() == '< >'
+
+assert re_tags_body.search(r'token_TAG').group() == '_TAG'
+assert re_tags_body.search(r'token_T_AG').group() == '_T_AG'
+assert re_tags_body.search(r'token_').group() == '_'
+assert re_tags_body.search(r'token/TAG').group() == '/TAG'
+assert re_tags_body.search(r'token<TAG>').group() == '<TAG>'
+assert re_tags_body.search(r'</TAG>token').group() == '</TAG>'
+assert re_tags_body.search(r'< T AG >token</ T AG >').group() == '< T AG >'
+assert re_tags_body.search(r'<TAG>token</TAG>').group() == '<TAG>'
+assert re_tags_body.search(r'<>token</>').group() == '<>'
+assert re_tags_body.search(r'< >token</ >').group() == '< >'
+assert re_tags_body.search(r'< * >token</ * >').group() == '< * >'
+
+assert re_tags_xml.search(r'token<p>').group() == '<p>'
+assert re_tags_xml.search(r'</p>token').group() == '</p>'
+assert re_tags_xml.search(r'<p>token</p>').group() == '<p>'
+assert re_tags_xml.search(r'< p p >token</ p p >').group() == '< p p >'
+assert re_tags_xml.search(r'<>token</>').group() == '<>'
+assert re_tags_xml.search(r'< >token</ >').group() == '< >'

def test_get_re_tags_with_tokens():
re_tags_header = wl_matching.get_re_tags_with_tokens(main, tag_type = 'header')
@@ -108,26 +112,30 @@ def test_get_re_tags_with_tokens():
assert re_tags_body == r'\S*_\S*(?=\s|$)|\S*/\S*(?=\s|$)|\S*_(?=\s|$)|<.*?>.*?</.*?>|<\ \*\ >.*</\ \*\ >|<\ T\ AG\ >.*</\ T\ AG\ >|<>.*</>|<\ >.*</\ >'
assert re_tags_xml == r'<p>.*</p>|<s>.*</s>|<w>.*</w>|<c>.*</c>|<\ p\ p\ >.*</\ p\ p\ >|<>.*</>|<\ >.*</\ >'

-assert re.search(re_tags_header, r'token <teiHeader>token</teiHeader> token').group() == '<teiHeader>token</teiHeader>'
-assert re.search(re_tags_header, r'token <teiHeader>token</teiHeader> token').group() == '<teiHeader>token</teiHeader>'
-assert re.search(re_tags_header, r'token < tei Header >token</ tei Header > token').group() == '< tei Header >token</ tei Header >'
-assert re.search(re_tags_header, r'token <>token</> token').group() == '<>token</>'
-assert re.search(re_tags_header, r'token < >token</ > token').group() == '< >token</ >'
-
-assert re.search(re_tags_body, r'token token_TAG token').group() == 'token_TAG'
-assert re.search(re_tags_body, r'token token/TAG token').group() == 'token/TAG'
-assert re.search(re_tags_body, r'token token_T_AG token').group() == 'token_T_AG'
-assert re.search(re_tags_body, r'token token_ token').group() == 'token_'
-assert re.search(re_tags_body, r'token <TAG>token</TAG> token').group() == '<TAG>token</TAG>'
-assert re.search(re_tags_body, r'token < T AG >token</ T AG > token').group() == '< T AG >token</ T AG >'
-assert re.search(re_tags_body, r'token <>token</> token').group() == '<>token</>'
-assert re.search(re_tags_body, r'token < >token</ > token').group() == '< >token</ >'
-assert re.search(re_tags_body, r'token < * >token</ * > token').group() == '< * >token</ * >'
-
-assert re.search(re_tags_xml, r'token <p>token</p> token').group() == '<p>token</p>'
-assert re.search(re_tags_xml, r'token < p p >token</ p p > token').group() == '< p p >token</ p p >'
-assert re.search(re_tags_xml, r'token <>token</> token').group() == '<>token</>'
-assert re.search(re_tags_xml, r'token < >token</ > token').group() == '< >token</ >'
+re_tags_header = re.compile(re_tags_header)
+re_tags_body = re.compile(re_tags_body)
+re_tags_xml = re.compile(re_tags_xml)
+
+assert re_tags_header.search(r'token <teiHeader>token</teiHeader> token').group() == '<teiHeader>token</teiHeader>'
+assert re_tags_header.search(r'token <teiHeader>token</teiHeader> token').group() == '<teiHeader>token</teiHeader>'
+assert re_tags_header.search(r'token < tei Header >token</ tei Header > token').group() == '< tei Header >token</ tei Header >'
+assert re_tags_header.search(r'token <>token</> token').group() == '<>token</>'
+assert re_tags_header.search(r'token < >token</ > token').group() == '< >token</ >'
+
+assert re_tags_body.search(r'token token_TAG token').group() == 'token_TAG'
+assert re_tags_body.search(r'token token/TAG token').group() == 'token/TAG'
+assert re_tags_body.search(r'token token_T_AG token').group() == 'token_T_AG'
+assert re_tags_body.search(r'token token_ token').group() == 'token_'
+assert re_tags_body.search(r'token <TAG>token</TAG> token').group() == '<TAG>token</TAG>'
+assert re_tags_body.search(r'token < T AG >token</ T AG > token').group() == '< T AG >token</ T AG >'
+assert re_tags_body.search(r'token <>token</> token').group() == '<>token</>'
+assert re_tags_body.search(r'token < >token</ > token').group() == '< >token</ >'
+assert re_tags_body.search(r'token < * >token</ * > token').group() == '< * >token</ * >'
+
+assert re_tags_xml.search(r'token <p>token</p> token').group() == '<p>token</p>'
+assert re_tags_xml.search(r'token < p p >token</ p p > token').group() == '< p p >token</ p p >'
+assert re_tags_xml.search(r'token <>token</> token').group() == '<>token</>'
+assert re_tags_xml.search(r'token < >token</ > token').group() == '< >token</ >'

def init_token_settings(assign_pos_tags = False, ignore_tags = False, use_tags = False):
return {
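Note: beyond matching, the same body-tag pattern can strip POS tags from tagged text. An illustrative use of the exact pattern string asserted above (the tests themselves only check what .search() captures):

import re

re_tags_body = re.compile(
    r'_\S*(?=\s|$)|/\S*(?=\s|$)|_(?=\s|$)|</?.*?>|</?\ \*\ >|</?\ T\ AG\ >|</?>|</?\ >'
)

# Substituting the matches away recovers the bare tokens.
print(re_tags_body.sub('', 'The_DET cat_NOUN sat_VERB'))  # The cat sat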
11 changes: 5 additions & 6 deletions wordless/wl_colligation_extractor.py
Original file line number Diff line number Diff line change
@@ -500,7 +500,7 @@ def generation_settings_changed(self):
settings['measure_bayes_factor'] = self.combo_box_measure_bayes_factor.get_measure()
settings['measure_effect_size'] = self.combo_box_measure_effect_size.get_measure()

-# Use Data
+# Use data
self.combo_box_use_data.measures_changed()

def table_settings_changed(self):
@@ -573,9 +573,7 @@ def update_gui_table(self, err_msg, colligations_freqs_files, colligations_stats
try:
self.settings = copy.deepcopy(self.main.settings_custom)

-self.clr_table()
-
-settings = self.main.settings_custom['colligation_extractor']
+settings = self.settings['colligation_extractor']

test_statistical_significance = settings['generation_settings']['test_statistical_significance']
measure_bayes_factor = settings['generation_settings']['measure_bayes_factor']
@@ -584,6 +582,9 @@ def update_gui_table(self, err_msg, colligations_freqs_files, colligations_stats
col_text_test_stat = self.main.settings_global['tests_statistical_significance'][test_statistical_significance]['col_text']
col_text_effect_size = self.main.settings_global['measures_effect_size'][measure_effect_size]['col_text']

+self.clr_table()
+self.model().setRowCount(len(colligations_freqs_files))
+
# Insert columns
files = list(self.main.wl_file_area.get_selected_files())
files_with_total = files + [{'name': self.tr('Total')}]
@@ -723,8 +724,6 @@ def update_gui_table(self, err_msg, colligations_freqs_files, colligations_stats
freq_totals = numpy.array(list(colligations_freqs_files.values())).sum(axis = 2).sum(axis = 0)
len_files = len(files)

-self.model().setRowCount(len(colligations_freqs_files))
-
self.disable_updates()

for i, ((node, collocate), stats_files) in enumerate(wl_sorting.sorted_stats_files_items(colligations_stats_files)):
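Note: across the extractor modules, the diff converges on one refresh order: clear the table, set the final row count up front, insert columns, then fill rows with updates suspended. A self-contained sketch of that order in plain PyQt5 (Wordless's clr_table() and disable_updates() helpers are assumed to wrap similar calls):

from PyQt5.QtGui import QStandardItem, QStandardItemModel
from PyQt5.QtWidgets import QApplication, QTableView

app = QApplication([])

view = QTableView()
model = QStandardItemModel()
view.setModel(model)

freqs = {('node', 'collocate'): 3, ('node', 'other'): 5}

model.clear()                  # ~ self.clr_table()
model.setRowCount(len(freqs))  # size the model before filling rows
model.setColumnCount(2)        # ~ inserting per-file columns
view.setUpdatesEnabled(False)  # ~ self.disable_updates()

for i, ((node, collocate), freq) in enumerate(freqs.items()):
    model.setItem(i, 0, QStandardItem(f'{node} {collocate}'))
    model.setItem(i, 1, QStandardItem(str(freq)))

view.setUpdatesEnabled(True)   # repaint once after filling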
10 changes: 5 additions & 5 deletions wordless/wl_collocation_extractor.py
Original file line number Diff line number Diff line change
@@ -500,7 +500,7 @@ def generation_settings_changed(self):
settings['measure_bayes_factor'] = self.combo_box_measure_bayes_factor.get_measure()
settings['measure_effect_size'] = self.combo_box_measure_effect_size.get_measure()

-# Use Data
+# Use data
self.combo_box_use_data.measures_changed()

def table_settings_changed(self):
@@ -575,9 +575,7 @@ def update_gui_table(self, err_msg, collocations_freqs_files, collocations_stats
try:
self.settings = copy.deepcopy(self.main.settings_custom)

-self.clr_table()
-
-settings = self.main.settings_custom['collocation_extractor']
+settings = self.settings['collocation_extractor']

test_statistical_significance = settings['generation_settings']['test_statistical_significance']
measure_bayes_factor = settings['generation_settings']['measure_bayes_factor']
@@ -586,6 +584,9 @@ def update_gui_table(self, err_msg, collocations_freqs_files, collocations_stats
col_text_test_stat = self.main.settings_global['tests_statistical_significance'][test_statistical_significance]['col_text']
col_text_effect_size = self.main.settings_global['measures_effect_size'][measure_effect_size]['col_text']

+self.clr_table()
+self.model().setRowCount(len(collocations_freqs_files))
+
# Insert columns
files = list(self.main.wl_file_area.get_selected_files())
files_with_total = files + [{'name': self.tr('Total')}]
@@ -725,7 +726,6 @@ def update_gui_table(self, err_msg, collocations_freqs_files, collocations_stats
freq_totals = numpy.array(list(collocations_freqs_files.values())).sum(axis = 2).sum(axis = 0)
len_files = len(files)

-self.model().setRowCount(len(collocations_freqs_files))
self.disable_updates()

for i, ((node, collocate), stats_files) in enumerate(wl_sorting.sorted_stats_files_items(collocations_stats_files)):
2 changes: 1 addition & 1 deletion wordless/wl_concordancer.py
Original file line number Diff line number Diff line change
@@ -426,7 +426,7 @@ def generation_settings_changed(self):
settings['context_len_right_para'] = self.spin_box_context_len_right_para.value()
settings['context_len_unit'] = self.combo_box_context_len_unit.currentText()

-# Width Unit
+# Unit of context length
if settings['context_len_unit'] == self.tr('Character'):
self.stacked_widget_context_len_left.setCurrentIndex(0)
self.stacked_widget_context_len_right.setCurrentIndex(0)
3 changes: 2 additions & 1 deletion wordless/wl_concordancer_parallel.py
Original file line number Diff line number Diff line change
@@ -308,14 +308,15 @@ def update_gui_table(self, err_msg, concordance_lines):
self.settings = copy.deepcopy(self.main.settings_custom)

self.clr_table(0)
+self.model().setRowCount(len(concordance_lines))

# Insert columns
for file_name in self.main.wl_file_area.get_selected_file_names():
self.ins_header_hor(
self.model().columnCount(),
file_name
)

-self.model().setRowCount(len(concordance_lines))
self.disable_updates()

for i, concordance_line in enumerate(concordance_lines):
2 changes: 1 addition & 1 deletion wordless/wl_dependency_parser.py
Original file line number Diff line number Diff line change
@@ -399,8 +399,8 @@ def update_gui_table(self, err_msg, results):
self.settings = copy.deepcopy(self.main.settings_custom)

self.clr_table(0)
-
self.model().setRowCount(len(results))
+
self.disable_updates()

for i, (
5 changes: 4 additions & 1 deletion wordless/wl_figs/wl_figs.py
Original file line number Diff line number Diff line change
@@ -61,7 +61,10 @@ def generate_line_chart(
vals = numpy.array([vals for item, vals in data_files_items])

# Frequency data
-if fig_settings['use_data'] == _tr('wl_figs', 'Frequency') or re.search(_tr('wl_figs', r'^[LR][1-9][0-9]*$'), fig_settings['use_data']):
+if (
+    fig_settings['use_data'] == _tr('wl_figs', 'Frequency')
+    or re.search(_tr('wl_figs', r'^[LR][1-9][0-9]*$'), fig_settings['use_data'])
+):
if fig_settings['use_cumulative']:
vals = numpy.cumsum(vals, axis = 0)

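Note: the rewrapped condition selects frequency-like data: either the Frequency column itself or a positional frequency column. A small check of the regex against plausible labels (assuming L1/R2-style labels denote left/right context positions, as in collocation tables; _tr() is dropped here):

import re

for use_data in ('Frequency', 'L1', 'R12', 'p-value'):
    is_freq_data = (
        use_data == 'Frequency'
        or bool(re.search(r'^[LR][1-9][0-9]*$', use_data))
    )
    print(f'{use_data}: {is_freq_data}')  # p-value -> False, others -> True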
6 changes: 3 additions & 3 deletions wordless/wl_keyword_extractor.py
Original file line number Diff line number Diff line change
@@ -317,7 +317,7 @@ def generation_settings_changed(self):
settings['measure_bayes_factor'] = self.combo_box_measure_bayes_factor.get_measure()
settings['measure_effect_size'] = self.combo_box_measure_effect_size.get_measure()

-# Use Data
+# Use data
self.combo_box_use_data.measures_changed()

def table_settings_changed(self):
Expand Down Expand Up @@ -432,7 +432,7 @@ def update_gui_table(self, err_msg, keywords_freq_files, keywords_stats_files):
try:
self.settings = copy.deepcopy(self.main.settings_custom)

-settings = self.main.settings_custom['keyword_extractor']
+settings = self.settings['keyword_extractor']
files_observed = list(self.main.wl_file_area.get_selected_files())

test_statistical_significance = settings['generation_settings']['test_statistical_significance']
@@ -443,6 +443,7 @@ def update_gui_table(self, err_msg, keywords_freq_files, keywords_stats_files):
col_text_effect_size = self.main.settings_global['measures_effect_size'][measure_effect_size]['col_text']

self.clr_table()
+self.model().setRowCount(len(keywords_freq_files))

# Insert columns
self.ins_header_hor(
@@ -548,7 +549,6 @@ def update_gui_table(self, err_msg, keywords_freq_files, keywords_stats_files):
freq_totals = numpy.array(list(keywords_freq_files.values())).sum(axis = 0)
len_files_observed = len(files_observed)

-self.model().setRowCount(len(keywords_freq_files))
self.disable_updates()

for i, (keyword, stats_files) in enumerate(wl_sorting.sorted_stats_files_items(keywords_stats_files)):
15 changes: 8 additions & 7 deletions wordless/wl_measures/wl_measures_readability.py
Original file line number Diff line number Diff line change
@@ -1083,31 +1083,32 @@ def nws(main, text):
# References:
# https://github.com/drelhaj/OsmanReadability/blob/master/src/org/project/osman/process/Syllables.java
# https://github.com/textstat/textstat/blob/9bf37414407bcaaa45c498478ee383c8738e5d0c/textstat/textstat.py#L569
+RE_STRESS = re.compile(r'[\u064B\u064C\u064D\u0651]')
+RE_SHORT = re.compile(r'[\u0627\u0649\?\.\!\,\s]')
+
def _get_num_syls_ara(word):
count_short = 0
count_long = 0

-# Tashkeel: fatha, damma, kasra
-tashkeel = ['\u064E', '\u064F', '\u0650']
-
for i, char in enumerate(word):
-if char not in tashkeel:
+# Tashkeel: fatha, damma, kasra
+if char not in ('\u064E', '\u064F', '\u0650'):
continue

# Only if a character is a tashkeel, has a successor, and is followed by an alef, waw, or yeh
if i + 1 < len(word):
-if word[i + 1] in ['\u0627', '\u0648', '\u064A']:
+if word[i + 1] in ('\u0627', '\u0648', '\u064A'):
count_long += 1
else:
count_short += 1
else:
count_short += 1

# Stress syllables: tanween fatha, tanween damma, tanween kasra, shadda
-count_stress = len(re.findall(r'[\u064B\u064C\u064D\u0651]', word))
+count_stress = len(RE_STRESS.findall(word))

if count_short == 0:
-word = re.sub(r'[\u0627\u0649\?\.\!\,\s]', '', word)
+word = RE_SHORT.sub('', word)
count_short = max(0, len(word) - 2)

# Reference: https://github.com/drelhaj/OsmanReadability/blob/405b927ef3fde200fa08efe12ec2f39b8716e4be/src/org/project/osman/process/OsmanReadability.java#L259
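Note: both patterns are now compiled once at module level and reused through bound findall()/sub() calls inside _get_num_syls_ara(). A short check of what each captures (the sample word is hypothetical):

import re

RE_STRESS = re.compile(r'[\u064B\u064C\u064D\u0651]')  # tanween fatha/damma/kasra, shadda
RE_SHORT = re.compile(r'[\u0627\u0649\?\.\!\,\s]')     # alef, alef maqsura, punctuation, whitespace

word = 'كِتَابٌ'  # hypothetical fully vocalized word
print(len(RE_STRESS.findall(word)))  # 1: the final tanween damma
print(len(RE_SHORT.sub('', word)))   # length after stripping the alef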