Settings: Add Settings - Stop Word Lists - Stop Word List Settings - Case-sensitive
BLKSerene · Jan 10, 2024 · 84f8747 · 84f8747
1 parent 7cee8ed
commit 84f8747
Show file tree

Hide file tree

Showing 7 changed files with 99 additions and 57 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -20,6 +20,7 @@
 
 ## [3.5.0](https://github.com/BLKSerene/Wordless/releases/tag/3.5.0) - ??/??/2024
 ### 🎉 New Features
+- Settings: Add Settings - Stop Word Lists - Stop Word List Settings - Case-sensitive
 - Utils: Add Stanza's Sindhi part-of-speech tagger
 - Utils: Add VADER's sentiment analyzers
 - Work Area: Add Profiler - Lexical Diversity - Brunét's Index / Honoré's statistic

diff --git a/tests/tests_nlp/test_stop_word_lists.py b/tests/tests_nlp/test_stop_word_lists.py
@@ -47,6 +47,11 @@ def test_filter_stop_words():
     assert wl_stop_word_lists.wl_filter_stop_words(main, items = ['a', 'aa'], lang = 'eng_us') == ['aa']
     assert wl_stop_word_lists.wl_filter_stop_words(main, items = [], lang = 'eng_us') == []
 
+    main.settings_custom['stop_word_lists']['stop_word_list_settings']['case_sensitive'] = False
+    assert wl_stop_word_lists.wl_filter_stop_words(main, items = ['A', 'a'], lang = 'eng_us') == []
+    main.settings_custom['stop_word_lists']['stop_word_list_settings']['case_sensitive'] = True
+    assert wl_stop_word_lists.wl_filter_stop_words(main, items = ['A', 'a'], lang = 'eng_us') == ['A']
+
 if __name__ == '__main__':
     for lang, stop_word_list in test_stop_word_lists:
         test_get_stop_word_list(lang, stop_word_list)

diff --git a/tests/tests_settings/test_settings_global.py b/tests/tests_settings/test_settings_global.py
@@ -131,7 +131,7 @@ def check_settings_global(self):
         settings_lemmatizers = settings_global['lemmatizers']
         settings_lemmatizers_default = settings_default['lemmatization']['lemmatizer_settings']
         settings_stop_word_lists = settings_global['stop_word_lists']
-        settings_stop_word_lists_default = settings_default['stop_word_lists']['stop_word_list_settings']
+        settings_stop_word_lists_default = settings_default['stop_word_lists']['stop_word_list_settings']['stop_word_lists']
         settings_dependency_parsers = settings_global['dependency_parsers']
         settings_dependency_parsers_default = settings_default['dependency_parsing']['dependency_parser_settings']
         settings_sentiment_analyzers = settings_global['sentiment_analyzers']

diff --git a/wordless/wl_nlp/wl_stop_word_lists.py b/wordless/wl_nlp/wl_stop_word_lists.py
@@ -62,7 +62,7 @@ def wl_get_stop_word_list(main, lang, stop_word_list = 'default'):
         lang = 'other'
 
     if stop_word_list == 'default':
-        stop_word_list = main.settings_custom['stop_word_lists']['stop_word_list_settings'][lang]
+        stop_word_list = main.settings_custom['stop_word_lists']['stop_word_list_settings']['stop_word_lists'][lang]
 
     stop_words = []
 
@@ -113,10 +113,19 @@ def wl_get_stop_word_list(main, lang, stop_word_list = 'default'):
 def wl_filter_stop_words(main, items, lang):
     stop_word_list = wl_get_stop_word_list(main, lang)
 
-    # Check if the list is empty
-    if items:
-        items_filtered = [token for token in items if token not in stop_word_list]
+    if main.settings_custom['stop_word_lists']['stop_word_list_settings']['case_sensitive']:
+        items_filtered = [
+            token
+            for token in items
+            if token not in stop_word_list
+        ]
     else:
-        items_filtered = []
+        stop_word_list = [token.lower() for token in stop_word_list]
+
+        items_filtered = [
+            token
+            for token in items
+            if token.lower() not in stop_word_list
+        ]
 
     return items_filtered
diff --git a/wordless/wl_nlp/wl_token_processing.py b/wordless/wl_nlp/wl_token_processing.py
@@ -150,15 +150,28 @@ def wl_process_tokens(main, text, token_settings):
 
         i_tag = 0
 
-        for para in text.tokens_multilevel:
-            for sentence in para:
-                for sentence_seg in sentence:
-                    for i, token in enumerate(sentence_seg):
-                        if token in stop_words:
-                            sentence_seg[i] = ''
-                            text.tags[i_tag + i] = ''
+        if main.settings_custom['stop_word_lists']['stop_word_list_settings']['case_sensitive']:
+            for para in text.tokens_multilevel:
+                for sentence in para:
+                    for sentence_seg in sentence:
+                        for i, token in enumerate(sentence_seg):
+                            if token in stop_words:
+                                sentence_seg[i] = ''
+                                text.tags[i_tag + i] = ''
+
+                        i_tag += len(sentence_seg)
+        else:
+            stop_words = {token.lower() for token in stop_words}
+
+            for para in text.tokens_multilevel:
+                for sentence in para:
+                    for sentence_seg in sentence:
+                        for i, token in enumerate(sentence_seg):
+                            if token.lower() in stop_words:
+                                sentence_seg[i] = ''
+                                text.tags[i_tag + i] = ''
 
-                    i_tag += len(sentence_seg)
+                        i_tag += len(sentence_seg)
 
     # Ignore tags
     i_token = 0

diff --git a/wordless/wl_settings/wl_settings_default.py b/wordless/wl_settings/wl_settings_default.py
@@ -1735,43 +1735,47 @@ def init_settings_default(main):
         # Settings - Stop Word Lists
         'stop_word_lists': {
             'stop_word_list_settings': {
-                'ara': 'nltk_ara',
-                'aze': 'nltk_aze',
-                'eus': 'nltk_eus',
-                'ben': 'nltk_ben',
-                'cat': 'nltk_cat',
-                'zho_cn': 'nltk_zho_cn',
-                'zho_tw': 'nltk_zho_tw',
-                'dan': 'nltk_dan',
-                'nld': 'nltk_nld',
-                'eng_gb': 'nltk_eng',
-                'eng_us': 'nltk_eng',
-                'fin': 'nltk_fin',
-                'fra': 'nltk_fra',
-                'deu_at': 'nltk_deu',
-                'deu_de': 'nltk_deu',
-                'deu_ch': 'nltk_deu',
-                'ell': 'nltk_ell',
-                'heb': 'nltk_heb',
-                'hun': 'nltk_hun',
-                'ind': 'nltk_ind',
-                'ita': 'nltk_ita',
-                'kaz': 'nltk_kaz',
-                'lao': 'laonlp_lao',
-                'nep': 'nltk_nep',
-                'nob': 'nltk_nob',
-                'por_br': 'nltk_por',
-                'por_pt': 'nltk_por',
-                'ron': 'nltk_ron',
-                'rus': 'nltk_rus',
-                'slv': 'nltk_slv',
-                'spa': 'nltk_spa',
-                'swe': 'nltk_swe',
-                'tgk': 'nltk_tgk',
-                'tha': 'pythainlp_tha',
-                'tur': 'nltk_tur',
+                'stop_word_lists': {
+                    'ara': 'nltk_ara',
+                    'aze': 'nltk_aze',
+                    'eus': 'nltk_eus',
+                    'ben': 'nltk_ben',
+                    'cat': 'nltk_cat',
+                    'zho_cn': 'nltk_zho_cn',
+                    'zho_tw': 'nltk_zho_tw',
+                    'dan': 'nltk_dan',
+                    'nld': 'nltk_nld',
+                    'eng_gb': 'nltk_eng',
+                    'eng_us': 'nltk_eng',
+                    'fin': 'nltk_fin',
+                    'fra': 'nltk_fra',
+                    'deu_at': 'nltk_deu',
+                    'deu_de': 'nltk_deu',
+                    'deu_ch': 'nltk_deu',
+                    'ell': 'nltk_ell',
+                    'heb': 'nltk_heb',
+                    'hun': 'nltk_hun',
+                    'ind': 'nltk_ind',
+                    'ita': 'nltk_ita',
+                    'kaz': 'nltk_kaz',
+                    'lao': 'laonlp_lao',
+                    'nep': 'nltk_nep',
+                    'nob': 'nltk_nob',
+                    'por_br': 'nltk_por',
+                    'por_pt': 'nltk_por',
+                    'ron': 'nltk_ron',
+                    'rus': 'nltk_rus',
+                    'slv': 'nltk_slv',
+                    'spa': 'nltk_spa',
+                    'swe': 'nltk_swe',
+                    'tgk': 'nltk_tgk',
+                    'tha': 'pythainlp_tha',
+                    'tur': 'nltk_tur',
+
+                    'other': 'custom'
+                },
 
-                'other': 'custom'
+                'case_sensitive': False,
             },
 
             'custom_lists': {},
@@ -2344,8 +2348,8 @@ def init_settings_default(main):
     for lang in wl_settings_global.SETTINGS_GLOBAL['langs'].values():
         lang_code = lang[0]
 
-        if lang_code not in settings_default['stop_word_lists']['stop_word_list_settings']:
-            settings_default['stop_word_lists']['stop_word_list_settings'][lang_code] = 'custom'
+        if lang_code not in settings_default['stop_word_lists']['stop_word_list_settings']['stop_word_lists']:
+            settings_default['stop_word_lists']['stop_word_list_settings']['stop_word_lists'][lang_code] = 'custom'
 
         settings_default['stop_word_lists']['custom_lists'][lang_code] = []
 

diff --git a/wordless/wl_settings/wl_settings_stop_word_lists.py b/wordless/wl_settings/wl_settings_stop_word_lists.py
@@ -19,7 +19,7 @@
 import copy
 
 from PyQt5.QtGui import QStandardItem
-from PyQt5.QtWidgets import QGroupBox, QLabel
+from PyQt5.QtWidgets import QCheckBox, QGroupBox, QLabel
 
 from wordless.wl_nlp import wl_nlp_utils, wl_stop_word_lists
 from wordless.wl_settings import wl_settings
@@ -45,6 +45,7 @@ def __init__(self, main):
             ],
             editable = True
         )
+        self.checkbox_case_sensitive = QCheckBox(self.tr('Case-sensitive'), self)
 
         self.table_stop_word_lists.setFixedHeight(370)
         self.table_stop_word_lists.verticalHeader().setHidden(True)
@@ -72,6 +73,7 @@ def __init__(self, main):
 
         self.group_box_stop_word_list_settings.setLayout(wl_layouts.Wl_Layout())
         self.group_box_stop_word_list_settings.layout().addWidget(self.table_stop_word_lists, 0, 0)
+        self.group_box_stop_word_list_settings.layout().addWidget(self.checkbox_case_sensitive, 1, 0)
 
         # Preview
         self.group_box_preview = QGroupBox(self.tr('Preview'), self)
@@ -156,28 +158,36 @@ def load_settings(self, defaults = False):
             self.table_stop_word_lists.model().item(i, 1).setText(wl_nlp_utils.to_lang_util_text(
                 self.main,
                 util_type = 'stop_word_lists',
-                util_code = settings['stop_word_list_settings'][lang]
+                util_code = settings['stop_word_list_settings']['stop_word_lists'][lang]
             ))
 
         self.table_stop_word_lists.enable_updates()
 
+        self.checkbox_case_sensitive.setChecked(settings['stop_word_list_settings']['case_sensitive'])
+
         if not defaults:
             self.combo_box_preview_lang.setCurrentText(wl_conversion.to_lang_text(self.main, settings['preview']['preview_lang']))
 
+        # Custom stop word lists
         if defaults:
             self.settings_custom['custom_lists'] = copy.deepcopy(self.settings_default['custom_lists'])
 
         self.combo_box_preview_lang.currentTextChanged.emit(self.combo_box_preview_lang.currentText())
 
     def apply_settings(self):
         for i, lang in enumerate(self.settings_global):
-            self.settings_custom['stop_word_list_settings'][lang] = wl_nlp_utils.to_lang_util_code(
+            self.settings_custom['stop_word_list_settings']['stop_word_lists'][lang] = wl_nlp_utils.to_lang_util_code(
                 self.main,
                 util_type = 'stop_word_lists',
                 util_text = self.table_stop_word_lists.model().item(i, 1).text()
             )
 
-        if self.settings_custom['stop_word_list_settings'][self.settings_custom['preview']['preview_lang']] == 'custom':
-            self.settings_custom['custom_lists'][self.settings_custom['preview']['preview_lang']] = self.list_preview_results.model().stringList()
+        self.settings_custom['stop_word_list_settings']['case_sensitive'] = self.checkbox_case_sensitive.isChecked()
+
+        # Custom stop word lists
+        preview_lang = self.settings_custom['preview']['preview_lang']
+
+        if self.settings_custom['stop_word_list_settings']['stop_word_lists'][preview_lang] == 'custom':
+            self.settings_custom['custom_lists'][preview_lang] = self.list_preview_results.model().stringList()
 
         return True