Added a new preprocessor to filter; this really needs some continued optimization, but that can be done at a later date.
52inc · Aug 11, 2017 · f8e9668 · f8e9668
1 parent 49cc754
commit f8e9668
Show file tree

Hide file tree

Showing 5 changed files with 63 additions and 7 deletions.
diff --git a/naughty_words/defaults.py b/naughty_words/defaults.py
@@ -1,6 +1,6 @@
 import pkgutil
 from naughty_words import NaughtyWords
-from naughty_words.preprocessors import EmptyPreprocessor
+from naughty_words.preprocessors import SubstitutionsPreprocessor
 from naughty_words.filters import CommonSubstitutions
 
 data = pkgutil.get_data(__name__, 'wordlists/profanities.txt')
@@ -15,7 +15,7 @@ def has_profanity(text, additional=None, blacklist=None, profanities=None):
         profanities = list(set(profanities).update(set(additional)))
     if blacklist:
         profanities = list(set(profanities).difference(set(blacklist)))
-    naughty_words = NaughtyWords(preprocessors=[EmptyPreprocessor()],
+    naughty_words = NaughtyWords(preprocessors=[SubstitutionsPreprocessor()],
                                  filters=[CommonSubstitutions()],
                                  profanities=profanities)
     return len(naughty_words.run_filters(text, only_first=True)) != 0
@@ -29,7 +29,7 @@ def get_all_profanity(text, additional=None, blacklist=None, profanities=None):
         profanities = list(set(profanities).update(set(additional)))
     if blacklist:
         profanities = list(set(profanities).difference(set(blacklist)))
-    naughty_words = NaughtyWords(preprocessors=[EmptyPreprocessor()],
+    naughty_words = NaughtyWords(preprocessors=[SubstitutionsPreprocessor()],
                                  filters=[CommonSubstitutions()],
                                  profanities=profanities)
     return naughty_words.run_filters(text, only_first=False)
diff --git a/naughty_words/filters/common_substitutions.py b/naughty_words/filters/common_substitutions.py
@@ -12,12 +12,12 @@ def escaped_expression(cls, characters, escaped_characters, quantifier='*?'):
             re_expressions.append(re.escape(character))
         return f"[{''.join(re_expressions)}]{quantifier}"
 
-    def profanity_expression(self, word):
+    def profanity_expression(self, word, character_substitutions):
         expression = ''
         separating_expression = self.escaped_expression(separating_characters, ['\s'])
         for character in word:
             try:
-                expression = expression + self.escaped_expression(standard_character_substitutions[character], [], '+?') + separating_expression
+                expression = expression + self.escaped_expression(character_substitutions[character], [], '+?') + separating_expression
             except KeyError:
                 expression = expression + self.escaped_expression(character, [], '+?') + separating_expression
 
@@ -28,6 +28,12 @@ def filter(self, text: str,
                only_first: bool=True,
                raise_on_match: bool= False):
         profanities = context['profanities']
+
+        try:
+            character_substitutions = context['character_substitutions']
+        except KeyError:
+            character_substitutions = standard_character_substitutions
+
         matches = []
         for profanity in profanities:
             if profanity in text:
@@ -44,7 +50,7 @@ def filter(self, text: str,
             alpha_num_word = re.sub('\W', '', profanity)
             if alpha_num_word is '':
                 continue
-            pattern = self.profanity_expression(alpha_num_word)
+            pattern = self.profanity_expression(alpha_num_word, character_substitutions)
             if re.search(pattern, text):
                 if raise_on_match:
                     raise ProfanityException()
@@ -56,3 +62,6 @@ def filter(self, text: str,
         return matches
 
 
+
+
+
diff --git a/naughty_words/preprocessors/__init__.py b/naughty_words/preprocessors/__init__.py
@@ -1 +1,2 @@
 from naughty_words.preprocessors.empty_preprocessor import *
+from naughty_words.preprocessors.substitutions_preprocessor import *
diff --git a/naughty_words/preprocessors/substitutions_preprocessor.py b/naughty_words/preprocessors/substitutions_preprocessor.py
@@ -0,0 +1,39 @@
+import re
+from naughty_words import Preprocessor
+from naughty_words.utils.confusables import standard_character_substitutions
+
+
+class SubstitutionsPreprocessor(Preprocessor):
+
+    def split(self, list):
+        return list[:len(list)//2], list[len(list)//2:]
+
+    def find_valid_subs(self, text, list_of_characters):
+        if re.search(r'{}'.format(list_of_characters), text):
+            if len(list_of_characters) > 1:
+                list_a, list_b = self.split(list_of_characters)
+                sub_list_a = self.find_valid_subs(text, list_a)
+                sub_list_b = self.find_valid_subs(text, list_b)
+
+                return sub_list_a + sub_list_b
+            else:
+                return list_of_characters
+        else:
+            return []
+
+    def process(self, text: str, context: dict):
+        try:
+            cur_substitutions = context['character_substitutions']
+        except KeyError:
+            cur_substitutions = standard_character_substitutions
+
+        final_subs = cur_substitutions.copy()
+        for key, value in cur_substitutions.items():
+            valid_subs = self.find_valid_subs(text, value)
+            if len(valid_subs) == 0:
+                final_subs.pop(key, None)
+            else:
+                final_subs[key] = valid_subs
+
+        context['character_substitutions'] = final_subs
+        return text, context
diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -1,7 +1,7 @@
 import pytest
 import time
 
-from naughty_words.defaults import has_profanity
+from naughty_words.defaults import has_profanity, get_all_profanity
 
 
 def test_default_filter_no_match():
@@ -11,3 +11,10 @@ def test_default_filter_no_match():
 def test_default_filter_has_match():
     assert has_profanity("You fucking know there's profanity here") is True
 
+
+def test_get_all_profanity():
+    assert get_all_profanity("My money's in that office, right? If she start giving me some bullshit about it ain't there, and we got to go someplace else and get it, I'm gonna shoot you in the head then and there. Then I'm gonna shoot that bitch in the kneecaps, find out where my goddamn money is. She gonna tell me too. Hey, look at me when I'm talking to you, motherfucker. You listen: we go in there, and that nigga Winston or anybody else is in there, you the first motherfucker to get shot. You understand?") == {'motherfuck', 'motherfucker', 'bullshit', 'bitchin', 'tit', 'shit', 'bitch', 'goddamn', 'damn', 'fucker', 'nigga',
+     'fuck', 'mother fucker', 'god damn', 'mother-fucker'}
+
+
+
Original file line number	Diff line number	Diff line change
		@@ -1 +1,2 @@
		from naughty_words.preprocessors.empty_preprocessor import *
		from naughty_words.preprocessors.substitutions_preprocessor import *