Skip to content

Commit

Permalink
File Area: Update Observed/Reference Corpora
Browse files Browse the repository at this point in the history
  • Loading branch information
BLKSerene committed Jan 2, 2025
1 parent bac07bd commit c3d9ba0
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 57 deletions.
10 changes: 5 additions & 5 deletions doc/doc.md
Original file line number Diff line number Diff line change
Expand Up @@ -609,7 +609,7 @@ You can generate line charts or word clouds for patterns of colligation using an
> [!NOTE]
> Renamed from **Keyword** to **Keyword Extractor** in *Wordless* 2.2
In *Keyword Extractor*, you can search for candidates of potential keywords (tokens that have far more or far less frequency in the observed file than in the reference file) in different files given a reference corpus, conduct different tests of statistical significance on each keyword and calculate the Bayes factor and effect size for each keyword using different measures. You can adjust the settings for the generated data via **Generation Settings**. You can disable the calculation of statistical significance and/or Bayes factor and/or effect size by setting **Generation Settings → Test of Statistical Significance / Measures of Bayes Factor / Measure of Effect Size** to **None**.
In *Keyword Extractor*, you can search for candidates of potential keywords (tokens that occur far more or far less frequently in the observed corpus than in the reference corpus) in different files given a reference corpus, conduct different tests of statistical significance on each keyword and calculate the Bayes factor and effect size for each keyword using different measures. You can adjust the settings for the generated data via **Generation Settings**. You can disable the calculation of statistical significance and/or Bayes factor and/or effect size by setting **Generation Settings → Test of Statistical Significance / Measures of Bayes Factor / Measure of Effect Size** to **None**.

You can filter the results by clicking **Filter results** or search in *Data Table* for parts that might be of interest to you by clicking **Search in results**.

Expand All @@ -621,11 +621,11 @@ You can generate line charts or word clouds for keywords using any statistics. Y
- **11.2 Keyword**<br>
The potential keyword. You can specify what should be counted as a "token" via **Token Settings**.

- **11.3 Frequency (in Reference File)**<br>
The number of occurrences of the keyword in the reference file.
- **11.3 Frequency (in Reference Corpora)**<br>
The number of occurrences of the keyword in the reference corpora.

- **11.4 Frequency (in Observed Files)**<br>
The number of occurrences of the keyword in each observed file.
- **11.4 Frequency (in Observed Corpus)**<br>
The number of occurrences of the keyword in each observed corpus.

- **11.5 Test Statistic**<br>
The test statistic of the significance test conducted on the keyword in each file. You can change the test of statistical significance used via **Generation Settings → Test of Statistical Significance**. See section [12.4.4 Tests of Statistical Significance, Measures of Bayes Factor, & Measures of Effect Size](#doc-12-4-4) for more details.
Expand Down
38 changes: 28 additions & 10 deletions wordless/wl_file_area.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,10 @@ def __init__(self, main, file_type = 'observed'):

# Suffix for settings
if self.file_type == 'observed':
self.tab = 'corpora_observed'
self.settings_suffix = ''
elif self.file_type == 'ref':
self.tab = 'corpora_ref'
self.settings_suffix = '_ref'

# Table
Expand Down Expand Up @@ -237,17 +239,33 @@ def __init__(self, parent):
self.clicked.connect(self.item_clicked)

# Menu
self.main.action_file_open_files.triggered.connect(lambda: self.check_file_area(self.open_files))
self.main.action_file_reopen.triggered.connect(lambda: self.check_file_area(self.reopen))
self.main.action_file_open_files.triggered.connect(
lambda: self.check_file_area(self.open_files)
)
self.main.action_file_reopen.triggered.connect(
lambda: self.check_file_area(self.reopen)
)

self.main.action_file_select_all.triggered.connect(lambda: self.check_file_area(self.horizontalHeader().select_all))
self.main.action_file_deselect_all.triggered.connect(lambda: self.check_file_area(self.horizontalHeader().deselect_all))
self.main.action_file_invert_selection.triggered.connect(lambda: self.check_file_area(self.horizontalHeader().invert_selection))
self.main.action_file_select_all.triggered.connect(
lambda: self.check_file_area(self.horizontalHeader().select_all)
)
self.main.action_file_deselect_all.triggered.connect(
lambda: self.check_file_area(self.horizontalHeader().deselect_all)
)
self.main.action_file_invert_selection.triggered.connect(
lambda: self.check_file_area(self.horizontalHeader().invert_selection)
)

self.main.action_file_close_selected.triggered.connect(lambda: self.check_file_area(self.close_selected))
self.main.action_file_close_all.triggered.connect(lambda: self.check_file_area(self.close_all))
self.main.action_file_close_selected.triggered.connect(
lambda: self.check_file_area(self.close_selected)
)
self.main.action_file_close_all.triggered.connect(
lambda: self.check_file_area(self.close_all)
)

self.main.tabs_file_area.currentChanged.connect(lambda: self.check_file_area(self.model().itemChanged.emit, self.model().item(0, 0)))
self.main.tabs_file_area.currentChanged.connect(
lambda: self.check_file_area(self.model().itemChanged.emit, self.model().item(0, 0))
)

def item_changed(self):
super().item_changed()
Expand Down Expand Up @@ -391,10 +409,10 @@ def check_file_area(self, op, *args, **kwargs):
if (
(
self.file_type == 'observed'
and self.main.tabs_file_area.tabText(self.main.tabs_file_area.currentIndex()) == self.tr('Observed Files')
and self.main.tabs_file_area.tabText(self.main.tabs_file_area.currentIndex()) == self.tr('Observed Corpora')
) or (
self.file_type == 'ref'
and self.main.tabs_file_area.tabText(self.main.tabs_file_area.currentIndex()) == self.tr('Reference Files')
and self.main.tabs_file_area.tabText(self.main.tabs_file_area.currentIndex()) == self.tr('Reference Corpora')
)
):
return op(*args, **kwargs)
Expand Down
56 changes: 28 additions & 28 deletions wordless/wl_keyword_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ def __init__(self, parent):

self.main.wl_file_area_ref.table_files.model().itemChanged.connect(self.file_changed)

# Enable the buttons and prompt the user if there are only observed files or only reference files
# Enable the buttons and prompt the user if there are only observed corpora or only reference corpora
def file_changed(self):
if list(self.main.wl_file_area.get_selected_files()) or list(self.main.wl_file_area_ref.get_selected_files()):
self.button_generate_table.setEnabled(True)
Expand All @@ -373,29 +373,29 @@ def file_changed(self):
self.button_generate_table.setEnabled(False)
self.button_generate_fig.setEnabled(False)

def wl_msg_box_missing_files_observed(self):
def wl_msg_box_missing_corpus_observed(self):
wl_msg_boxes.Wl_Msg_Box_Warning(
self.main,
title = self.tr('Missing Observed Files'),
title = self.tr('Missing Observed Corpus'),
text = self.tr('''
<div>You have not specified any observed files yet.</div>
<div>You have not specified any observed corpus yet.</div>
''')
).open()

def wl_msg_box_missing_files_ref(self):
def wl_msg_box_missing_corpus_ref(self):
wl_msg_boxes.Wl_Msg_Box_Warning(
self.main,
title = self.tr('Missing Reference Files'),
title = self.tr('Missing Reference Corpus'),
text = self.tr('''
<div>You have not specified any reference files yet.</div>
<div>You have not specified any reference corpus yet.</div>
''')
).open()

def wl_status_bar_msg_missing_files_observed(self):
self.main.statusBar().showMessage(self.tr('Missing observed files!'))
def wl_status_bar_msg_missing_corpus_observed(self):
self.main.statusBar().showMessage(self.tr('Missing observed corpus!'))

def wl_status_bar_msg_missing_files_ref(self):
self.main.statusBar().showMessage(self.tr('Missing reference files!'))
def wl_status_bar_msg_missing_corpus_ref(self):
self.main.statusBar().showMessage(self.tr('Missing reference corpus!'))

@wl_misc.log_time
def generate_table(self):
Expand All @@ -421,11 +421,11 @@ def generate_table(self):
wl_threading.Wl_Thread(worker_keyword_extractor_table).start_worker()
else:
if not files_observed:
self.wl_msg_box_missing_files_observed()
self.wl_status_bar_msg_missing_files_observed()
self.wl_msg_box_missing_corpus_observed()
self.wl_status_bar_msg_missing_corpus_observed()
elif not files_ref:
self.wl_msg_box_missing_files_ref()
self.wl_status_bar_msg_missing_files_ref()
self.wl_msg_box_missing_corpus_ref()
self.wl_status_bar_msg_missing_corpus_ref()

def update_gui_table(self, err_msg, keywords_freq_files, keywords_stats_files):
if wl_checks_work_area.check_results(self.main, err_msg, keywords_freq_files):
Expand All @@ -447,12 +447,12 @@ def update_gui_table(self, err_msg, keywords_freq_files, keywords_stats_files):
# Insert columns
self.ins_header_hor(
self.model().columnCount() - 2,
self.tr('[Reference Files]\nFrequency'),
self.tr('[Reference Corpora]\nFrequency'),
is_int = True, is_cum = True
)
self.ins_header_hor(
self.model().columnCount() - 2,
self.tr('[Reference Files]\nFrequency %'),
self.tr('[Reference Corpora]\nFrequency %'),
is_pct = True, is_cum = True
)

Expand Down Expand Up @@ -507,25 +507,25 @@ def update_gui_table(self, err_msg, keywords_freq_files, keywords_stats_files):
is_breakdown_file = is_breakdown_file
)

# Sort by p-value of the first observed file
# Sort by p-value of the first observed corpus
if test_statistical_significance != 'none':
self.horizontalHeader().setSortIndicator(
self.find_header_hor(self.tr('[{}]\np-value').format(files_observed[0]['name'])),
Qt.AscendingOrder
)
# Sort by bayes factor of the first observed file
# Sort by bayes factor of the first observed corpus
elif measure_bayes_factor != 'none':
self.horizontalHeader().setSortIndicator(
self.find_header_hor(self.tr('[{}]\nBayes Factor').format(files_observed[0]['name'])),
Qt.DescendingOrder
)
# Sort by effect size of the first observed file
# Sort by effect size of the first observed corpus
elif measure_effect_size != 'none':
self.horizontalHeader().setSortIndicator(
self.find_header_hor(f"[{files_observed[0]['name']}]\n{col_text_effect_size}"),
Qt.DescendingOrder
)
# Otherwise sort by frequency of the first observed file
# Otherwise sort by frequency of the first observed corpus
else:
self.horizontalHeader().setSortIndicator(
self.find_header_hor(self.tr('[{}]\nFrequency').format(files_observed[0]['name'])),
Expand Down Expand Up @@ -624,11 +624,11 @@ def generate_fig(self):
wl_threading.Wl_Thread(self.worker_keyword_extractor_fig).start_worker()
else:
if not files_observed:
self.wl_msg_box_missing_files_observed()
self.wl_status_bar_msg_missing_files_observed()
self.wl_msg_box_missing_corpus_observed()
self.wl_status_bar_msg_missing_corpus_observed()
elif not files_ref:
self.wl_msg_box_missing_files_ref()
self.wl_status_bar_msg_missing_files_ref()
self.wl_msg_box_missing_corpus_ref()
self.wl_status_bar_msg_missing_corpus_ref()

def update_gui_fig(self, err_msg, keywords_freq_files, keywords_stats_files):
if wl_checks_work_area.check_results(self.main, err_msg, keywords_freq_files):
Expand Down Expand Up @@ -700,7 +700,7 @@ def run(self):
files_observed = list(self.main.wl_file_area.get_selected_files())
files_ref = list(self.main.wl_file_area_ref.get_selected_files())

# Frequency (Reference files)
# Frequency (Reference Corpora)
self.keywords_freq_files.append(collections.Counter())
tokens_ref = []

Expand All @@ -720,7 +720,7 @@ def run(self):

len_tokens_ref = len(tokens_ref)

# Frequency (Observed files)
# Frequency (Observed Corpus)
for file_observed in files_observed:
text = wl_token_processing.wl_process_tokens_ngram_generator(
self.main, file_observed['text'],
Expand All @@ -741,7 +741,7 @@ def run(self):

self.keywords_freq_files.append(sum(self.keywords_freq_files[1:], collections.Counter()))

# Remove tokens that do not appear in any of the observed files
# Remove tokens that do not appear in any observed corpus
self.keywords_freq_files[0] = {
token: freq
for token, freq in self.keywords_freq_files[0].items()
Expand Down
24 changes: 12 additions & 12 deletions wordless/wl_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def __init__(self, loading_window):
# Menu
self.init_menu()

# Work area & File area
# Work Area & File Area
self.init_central_widget()

# Status bar
Expand Down Expand Up @@ -425,13 +425,13 @@ def init_central_widget(self):
self.wl_file_area = wl_file_area.Wrapper_File_Area(self)
self.wl_file_area_ref = wl_file_area.Wrapper_File_Area(self, file_type = 'ref')

# File area
self.tabs_file_area.addTab(self.wl_file_area, self.tr('Observed Files'))
self.tabs_file_area.addTab(self.wl_file_area_ref, self.tr('Reference Files'))
# File Area
self.tabs_file_area.addTab(self.wl_file_area, self.tr('Observed Corpora'))
self.tabs_file_area.addTab(self.wl_file_area_ref, self.tr('Reference Corpora'))

self.tabs_file_area.currentChanged.connect(self.file_area_changed)

# Work area
# Work Area
self.init_work_area()

# Splitter
Expand Down Expand Up @@ -518,18 +518,18 @@ def load_settings(self):
# Layouts
self.centralWidget().setSizes(settings['menu']['prefs']['layouts']['central_widget'])

# File area
# File Area
for i in range(self.tabs_file_area.count()):
if self.tabs_file_area.tabText(i) == self.settings_custom['file_area_cur']:
if self.tabs_file_area.widget(i).tab == self.settings_custom['tab_file_area']:
self.tabs_file_area.setCurrentIndex(i)

break

self.tabs_file_area.currentWidget().table_files.model().itemChanged.emit(QStandardItem())

# Work area
# Work Area
for i in range(self.wl_work_area.count()):
if self.wl_work_area.widget(i).tab == self.settings_custom['work_area_cur']:
if self.wl_work_area.widget(i).tab == self.settings_custom['tab_work_area']:
self.wl_work_area.setCurrentIndex(i)

break
Expand All @@ -538,14 +538,14 @@ def load_settings(self):

def file_area_changed(self):
# Current tab
self.settings_custom['file_area_cur'] = self.tabs_file_area.tabText(self.tabs_file_area.currentIndex())
self.settings_custom['tab_file_area'] = self.tabs_file_area.currentWidget().tab

def work_area_changed(self):
# Current tab
self.settings_custom['work_area_cur'] = self.wl_work_area.currentWidget().tab
self.settings_custom['tab_work_area'] = self.wl_work_area.currentWidget().tab

# File Area
if self.settings_custom['work_area_cur'] == 'keyword_extractor':
if self.settings_custom['tab_work_area'] == 'keyword_extractor':
self.tabs_file_area.tabBar().show()
else:
self.tabs_file_area.setCurrentIndex(0)
Expand Down
4 changes: 2 additions & 2 deletions wordless/wl_settings/wl_settings_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@ def init_settings_default(main):

settings_default = {
'1st_startup': True,
'file_area_cur': _tr('wl_settings_default', 'Observed Files'),
'work_area_cur': 'profiler',
'tab_file_area': 'corpora_observed',
'tab_work_area': 'profiler',

'menu': {
'prefs': {
Expand Down

0 comments on commit c3d9ba0

Please sign in to comment.