Skip to content

Commit

Permalink
Settings: Add Settings - Stop Word Lists - Stop Word List Settings - Case-sensitive
Browse files Browse the repository at this point in the history
  • Loading branch information
BLKSerene committed Jan 10, 2024
1 parent 7cee8ed commit 84f8747
Show file tree
Hide file tree
Showing 7 changed files with 99 additions and 57 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

## [3.5.0](https://github.com/BLKSerene/Wordless/releases/tag/3.5.0) - ??/??/2024
### 🎉 New Features
- Settings: Add Settings - Stop Word Lists - Stop Word List Settings - Case-sensitive
- Utils: Add Stanza's Sindhi part-of-speech tagger
- Utils: Add VADER's sentiment analyzers
- Work Area: Add Profiler - Lexical Diversity - Brunét's Index / Honoré's statistic
Expand Down
5 changes: 5 additions & 0 deletions tests/tests_nlp/test_stop_word_lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ def test_filter_stop_words():
assert wl_stop_word_lists.wl_filter_stop_words(main, items = ['a', 'aa'], lang = 'eng_us') == ['aa']
assert wl_stop_word_lists.wl_filter_stop_words(main, items = [], lang = 'eng_us') == []

main.settings_custom['stop_word_lists']['stop_word_list_settings']['case_sensitive'] = False
assert wl_stop_word_lists.wl_filter_stop_words(main, items = ['A', 'a'], lang = 'eng_us') == []
main.settings_custom['stop_word_lists']['stop_word_list_settings']['case_sensitive'] = True
assert wl_stop_word_lists.wl_filter_stop_words(main, items = ['A', 'a'], lang = 'eng_us') == ['A']

if __name__ == '__main__':
for lang, stop_word_list in test_stop_word_lists:
test_get_stop_word_list(lang, stop_word_list)
Expand Down
2 changes: 1 addition & 1 deletion tests/tests_settings/test_settings_global.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def check_settings_global(self):
settings_lemmatizers = settings_global['lemmatizers']
settings_lemmatizers_default = settings_default['lemmatization']['lemmatizer_settings']
settings_stop_word_lists = settings_global['stop_word_lists']
settings_stop_word_lists_default = settings_default['stop_word_lists']['stop_word_list_settings']
settings_stop_word_lists_default = settings_default['stop_word_lists']['stop_word_list_settings']['stop_word_lists']
settings_dependency_parsers = settings_global['dependency_parsers']
settings_dependency_parsers_default = settings_default['dependency_parsing']['dependency_parser_settings']
settings_sentiment_analyzers = settings_global['sentiment_analyzers']
Expand Down
19 changes: 14 additions & 5 deletions wordless/wl_nlp/wl_stop_word_lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def wl_get_stop_word_list(main, lang, stop_word_list = 'default'):
lang = 'other'

if stop_word_list == 'default':
stop_word_list = main.settings_custom['stop_word_lists']['stop_word_list_settings'][lang]
stop_word_list = main.settings_custom['stop_word_lists']['stop_word_list_settings']['stop_word_lists'][lang]

stop_words = []

Expand Down Expand Up @@ -113,10 +113,19 @@ def wl_get_stop_word_list(main, lang, stop_word_list = 'default'):
def wl_filter_stop_words(main, items, lang):
    """Return the tokens in ``items`` that are not stop words for ``lang``.

    The stop word list comes from ``wl_get_stop_word_list(main, lang)``.
    Matching follows the "Case-sensitive" option stored under
    ``main.settings_custom['stop_word_lists']['stop_word_list_settings']``:
    when the option is off, both the stop words and the tokens are lowercased
    for the comparison only; the returned tokens keep their original case.

    An empty ``items`` simply yields an empty list, so no separate emptiness
    check is required.
    """
    stop_word_list = wl_get_stop_word_list(main, lang)
    case_sensitive = main.settings_custom['stop_word_lists']['stop_word_list_settings']['case_sensitive']

    if case_sensitive:
        # A set gives constant-time membership tests instead of scanning the list per token
        stop_words = set(stop_word_list)

        return [token for token in items if token not in stop_words]

    # Same lowercasing approach as the case-insensitive branch of wl_process_tokens
    stop_words = {stop_word.lower() for stop_word in stop_word_list}

    return [token for token in items if token.lower() not in stop_words]
29 changes: 21 additions & 8 deletions wordless/wl_nlp/wl_token_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,15 +150,28 @@ def wl_process_tokens(main, text, token_settings):

i_tag = 0

for para in text.tokens_multilevel:
for sentence in para:
for sentence_seg in sentence:
for i, token in enumerate(sentence_seg):
if token in stop_words:
sentence_seg[i] = ''
text.tags[i_tag + i] = ''
if main.settings_custom['stop_word_lists']['stop_word_list_settings']['case_sensitive']:
for para in text.tokens_multilevel:
for sentence in para:
for sentence_seg in sentence:
for i, token in enumerate(sentence_seg):
if token in stop_words:
sentence_seg[i] = ''
text.tags[i_tag + i] = ''

i_tag += len(sentence_seg)
else:
stop_words = {token.lower() for token in stop_words}

for para in text.tokens_multilevel:
for sentence in para:
for sentence_seg in sentence:
for i, token in enumerate(sentence_seg):
if token.lower() in stop_words:
sentence_seg[i] = ''
text.tags[i_tag + i] = ''

i_tag += len(sentence_seg)
i_tag += len(sentence_seg)

# Ignore tags
i_token = 0
Expand Down
80 changes: 42 additions & 38 deletions wordless/wl_settings/wl_settings_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -1735,43 +1735,47 @@ def init_settings_default(main):
# Settings - Stop Word Lists
'stop_word_lists': {
'stop_word_list_settings': {
'ara': 'nltk_ara',
'aze': 'nltk_aze',
'eus': 'nltk_eus',
'ben': 'nltk_ben',
'cat': 'nltk_cat',
'zho_cn': 'nltk_zho_cn',
'zho_tw': 'nltk_zho_tw',
'dan': 'nltk_dan',
'nld': 'nltk_nld',
'eng_gb': 'nltk_eng',
'eng_us': 'nltk_eng',
'fin': 'nltk_fin',
'fra': 'nltk_fra',
'deu_at': 'nltk_deu',
'deu_de': 'nltk_deu',
'deu_ch': 'nltk_deu',
'ell': 'nltk_ell',
'heb': 'nltk_heb',
'hun': 'nltk_hun',
'ind': 'nltk_ind',
'ita': 'nltk_ita',
'kaz': 'nltk_kaz',
'lao': 'laonlp_lao',
'nep': 'nltk_nep',
'nob': 'nltk_nob',
'por_br': 'nltk_por',
'por_pt': 'nltk_por',
'ron': 'nltk_ron',
'rus': 'nltk_rus',
'slv': 'nltk_slv',
'spa': 'nltk_spa',
'swe': 'nltk_swe',
'tgk': 'nltk_tgk',
'tha': 'pythainlp_tha',
'tur': 'nltk_tur',
'stop_word_lists': {
'ara': 'nltk_ara',
'aze': 'nltk_aze',
'eus': 'nltk_eus',
'ben': 'nltk_ben',
'cat': 'nltk_cat',
'zho_cn': 'nltk_zho_cn',
'zho_tw': 'nltk_zho_tw',
'dan': 'nltk_dan',
'nld': 'nltk_nld',
'eng_gb': 'nltk_eng',
'eng_us': 'nltk_eng',
'fin': 'nltk_fin',
'fra': 'nltk_fra',
'deu_at': 'nltk_deu',
'deu_de': 'nltk_deu',
'deu_ch': 'nltk_deu',
'ell': 'nltk_ell',
'heb': 'nltk_heb',
'hun': 'nltk_hun',
'ind': 'nltk_ind',
'ita': 'nltk_ita',
'kaz': 'nltk_kaz',
'lao': 'laonlp_lao',
'nep': 'nltk_nep',
'nob': 'nltk_nob',
'por_br': 'nltk_por',
'por_pt': 'nltk_por',
'ron': 'nltk_ron',
'rus': 'nltk_rus',
'slv': 'nltk_slv',
'spa': 'nltk_spa',
'swe': 'nltk_swe',
'tgk': 'nltk_tgk',
'tha': 'pythainlp_tha',
'tur': 'nltk_tur',

'other': 'custom'
},

'other': 'custom'
'case_sensitive': False,
},

'custom_lists': {},
Expand Down Expand Up @@ -2344,8 +2348,8 @@ def init_settings_default(main):
for lang in wl_settings_global.SETTINGS_GLOBAL['langs'].values():
lang_code = lang[0]

if lang_code not in settings_default['stop_word_lists']['stop_word_list_settings']:
settings_default['stop_word_lists']['stop_word_list_settings'][lang_code] = 'custom'
if lang_code not in settings_default['stop_word_lists']['stop_word_list_settings']['stop_word_lists']:
settings_default['stop_word_lists']['stop_word_list_settings']['stop_word_lists'][lang_code] = 'custom'

settings_default['stop_word_lists']['custom_lists'][lang_code] = []

Expand Down
20 changes: 15 additions & 5 deletions wordless/wl_settings/wl_settings_stop_word_lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import copy

from PyQt5.QtGui import QStandardItem
from PyQt5.QtWidgets import QGroupBox, QLabel
from PyQt5.QtWidgets import QCheckBox, QGroupBox, QLabel

from wordless.wl_nlp import wl_nlp_utils, wl_stop_word_lists
from wordless.wl_settings import wl_settings
Expand All @@ -45,6 +45,7 @@ def __init__(self, main):
],
editable = True
)
self.checkbox_case_sensitive = QCheckBox(self.tr('Case-sensitive'), self)

self.table_stop_word_lists.setFixedHeight(370)
self.table_stop_word_lists.verticalHeader().setHidden(True)
Expand Down Expand Up @@ -72,6 +73,7 @@ def __init__(self, main):

self.group_box_stop_word_list_settings.setLayout(wl_layouts.Wl_Layout())
self.group_box_stop_word_list_settings.layout().addWidget(self.table_stop_word_lists, 0, 0)
self.group_box_stop_word_list_settings.layout().addWidget(self.checkbox_case_sensitive, 1, 0)

# Preview
self.group_box_preview = QGroupBox(self.tr('Preview'), self)
Expand Down Expand Up @@ -156,28 +158,36 @@ def load_settings(self, defaults = False):
self.table_stop_word_lists.model().item(i, 1).setText(wl_nlp_utils.to_lang_util_text(
self.main,
util_type = 'stop_word_lists',
util_code = settings['stop_word_list_settings'][lang]
util_code = settings['stop_word_list_settings']['stop_word_lists'][lang]
))

self.table_stop_word_lists.enable_updates()

self.checkbox_case_sensitive.setChecked(settings['stop_word_list_settings']['case_sensitive'])

if not defaults:
self.combo_box_preview_lang.setCurrentText(wl_conversion.to_lang_text(self.main, settings['preview']['preview_lang']))

# Custom stop word lists
if defaults:
self.settings_custom['custom_lists'] = copy.deepcopy(self.settings_default['custom_lists'])

self.combo_box_preview_lang.currentTextChanged.emit(self.combo_box_preview_lang.currentText())

def apply_settings(self):
    """Write the widgets' current state back into ``self.settings_custom``.

    Stores, in order:
    - the stop word list chosen for each language (column 1 of the table),
      converted to its utility code, under
      ``['stop_word_list_settings']['stop_word_lists'][lang]``;
    - the state of the "Case-sensitive" check box under
      ``['stop_word_list_settings']['case_sensitive']``;
    - the contents of the preview list as the custom stop word list of the
      preview language, but only when that language is set to ``'custom'``.

    Always returns ``True``.
    """
    stop_word_list_settings = self.settings_custom['stop_word_list_settings']

    # Table rows are in the same order as the languages in self.settings_global
    for i, lang in enumerate(self.settings_global):
        stop_word_list_settings['stop_word_lists'][lang] = wl_nlp_utils.to_lang_util_code(
            self.main,
            util_type = 'stop_word_lists',
            util_text = self.table_stop_word_lists.model().item(i, 1).text()
        )

    stop_word_list_settings['case_sensitive'] = self.checkbox_case_sensitive.isChecked()

    # Custom stop word lists
    preview_lang = self.settings_custom['preview']['preview_lang']

    # Look the language up in the nested 'stop_word_lists' mapping, not in the settings dict itself
    if stop_word_list_settings['stop_word_lists'][preview_lang] == 'custom':
        self.settings_custom['custom_lists'][preview_lang] = self.list_preview_results.model().stringList()

    return True

0 comments on commit 84f8747

Please sign in to comment.