Skip to content

Commit

Permalink
Added a new preprocessor to filter, this really needs some continued …
Browse files Browse the repository at this point in the history
…optimization but that can be done at a later date
  • Loading branch information
ulmentflam committed Aug 11, 2017
1 parent 49cc754 commit f8e9668
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 7 deletions.
6 changes: 3 additions & 3 deletions naughty_words/defaults.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pkgutil
from naughty_words import NaughtyWords
from naughty_words.preprocessors import EmptyPreprocessor
from naughty_words.preprocessors import SubstitutionsPreprocessor
from naughty_words.filters import CommonSubstitutions

data = pkgutil.get_data(__name__, 'wordlists/profanities.txt')
Expand All @@ -15,7 +15,7 @@ def has_profanity(text, additional=None, blacklist=None, profanities=None):
profanities = list(set(profanities).update(set(additional)))
if blacklist:
profanities = list(set(profanities).difference(set(blacklist)))
naughty_words = NaughtyWords(preprocessors=[EmptyPreprocessor()],
naughty_words = NaughtyWords(preprocessors=[SubstitutionsPreprocessor()],
filters=[CommonSubstitutions()],
profanities=profanities)
return len(naughty_words.run_filters(text, only_first=True)) != 0
Expand All @@ -29,7 +29,7 @@ def get_all_profanity(text, additional=None, blacklist=None, profanities=None):
profanities = list(set(profanities).update(set(additional)))
if blacklist:
profanities = list(set(profanities).difference(set(blacklist)))
naughty_words = NaughtyWords(preprocessors=[EmptyPreprocessor()],
naughty_words = NaughtyWords(preprocessors=[SubstitutionsPreprocessor()],
filters=[CommonSubstitutions()],
profanities=profanities)
return naughty_words.run_filters(text, only_first=False)
15 changes: 12 additions & 3 deletions naughty_words/filters/common_substitutions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ def escaped_expression(cls, characters, escaped_characters, quantifier='*?'):
re_expressions.append(re.escape(character))
return f"[{''.join(re_expressions)}]{quantifier}"

def profanity_expression(self, word):
def profanity_expression(self, word, character_substitutions):
expression = ''
separating_expression = self.escaped_expression(separating_characters, ['\s'])
for character in word:
try:
expression = expression + self.escaped_expression(standard_character_substitutions[character], [], '+?') + separating_expression
expression = expression + self.escaped_expression(character_substitutions[character], [], '+?') + separating_expression
except KeyError:
expression = expression + self.escaped_expression(character, [], '+?') + separating_expression

Expand All @@ -28,6 +28,12 @@ def filter(self, text: str,
only_first: bool=True,
raise_on_match: bool= False):
profanities = context['profanities']

try:
character_substitutions = context['character_substitutions']
except KeyError:
character_substitutions = standard_character_substitutions

matches = []
for profanity in profanities:
if profanity in text:
Expand All @@ -44,7 +50,7 @@ def filter(self, text: str,
alpha_num_word = re.sub('\W', '', profanity)
if alpha_num_word is '':
continue
pattern = self.profanity_expression(alpha_num_word)
pattern = self.profanity_expression(alpha_num_word, character_substitutions)
if re.search(pattern, text):
if raise_on_match:
raise ProfanityException()
Expand All @@ -56,3 +62,6 @@ def filter(self, text: str,
return matches





1 change: 1 addition & 0 deletions naughty_words/preprocessors/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from naughty_words.preprocessors.empty_preprocessor import *
from naughty_words.preprocessors.substitutions_preprocessor import *
39 changes: 39 additions & 0 deletions naughty_words/preprocessors/substitutions_preprocessor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import re
from naughty_words import Preprocessor
from naughty_words.utils.confusables import standard_character_substitutions


class SubstitutionsPreprocessor(Preprocessor):
    """Preprocessor that prunes the character-substitution table to the text.

    For every entry of the substitution table it keeps only the substitute
    characters that actually occur in the text being processed, and stores
    the reduced table back into the context under
    ``'character_substitutions'``.
    """

    def split(self, list):
        """Return the first and second halves of ``list`` as a 2-tuple.

        For an odd length the second half receives the extra element.
        The parameter name shadows the builtin; it is kept unchanged so
        existing keyword callers keep working.
        """
        middle = len(list) // 2
        return list[:middle], list[middle:]

    def find_valid_subs(self, text, list_of_characters):
        """Return the entries of ``list_of_characters`` that occur in ``text``.

        The result is a new list that preserves the input order; it is empty
        when nothing matches or when ``list_of_characters`` is empty.
        """
        # A plain containment test is used instead of a regex probe.
        # Formatting the list straight into a pattern inserts its repr
        # (e.g. "['a', '@']"), which the regex engine reads as a character
        # class that also contains quotes, commas and spaces. That made any
        # apostrophe in the text match every entry, mis-parsed entries such
        # as '-' or ']', and raised re.error for an empty list.
        return [
            character
            for character in list_of_characters
            if character in text
        ]

    def process(self, text: str, context: dict):
        """Store a pruned substitution table in ``context``.

        The table is read from ``context['character_substitutions']`` when
        that key is present, otherwise ``standard_character_substitutions``
        is used. Keys whose substitutes never occur in ``text`` are left out
        of the new table; the source table itself is not modified.

        Returns:
            The unchanged ``text`` and the updated ``context`` as a tuple.
        """
        cur_substitutions = context.get(
            'character_substitutions', standard_character_substitutions
        )

        final_subs = {}
        for key, value in cur_substitutions.items():
            valid_subs = self.find_valid_subs(text, value)
            if valid_subs:
                final_subs[key] = valid_subs

        context['character_substitutions'] = final_subs
        return text, context
9 changes: 8 additions & 1 deletion tests/test_filters.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pytest
import time

from naughty_words.defaults import has_profanity
from naughty_words.defaults import has_profanity, get_all_profanity


def test_default_filter_no_match():
Expand All @@ -11,3 +11,10 @@ def test_default_filter_no_match():
def test_default_filter_has_match():
    """has_profanity returns True for a sentence containing a plain profanity."""
    sample = "You fucking know there's profanity here"
    detected = has_profanity(sample)
    assert detected is True


def test_get_all_profanity():
    """get_all_profanity returns the expected set of matches for a long passage."""
    passage = "My money's in that office, right? If she start giving me some bullshit about it ain't there, and we got to go someplace else and get it, I'm gonna shoot you in the head then and there. Then I'm gonna shoot that bitch in the kneecaps, find out where my goddamn money is. She gonna tell me too. Hey, look at me when I'm talking to you, motherfucker. You listen: we go in there, and that nigga Winston or anybody else is in there, you the first motherfucker to get shot. You understand?"
    expected = {
        'motherfuck', 'motherfucker', 'bullshit', 'bitchin', 'tit', 'shit',
        'bitch', 'goddamn', 'damn', 'fucker', 'nigga', 'fuck',
        'mother fucker', 'god damn', 'mother-fucker',
    }
    found = get_all_profanity(passage)
    assert found == expected



0 comments on commit f8e9668

Please sign in to comment.