-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added a new preprocessor to filter; this really needs some continued optimization, but that can be done at a later date.
- Loading branch information
1 parent
49cc754
commit f8e9668
Showing
5 changed files
with
63 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
from naughty_words.preprocessors.empty_preprocessor import * | ||
from naughty_words.preprocessors.substitutions_preprocessor import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import re | ||
from naughty_words import Preprocessor | ||
from naughty_words.utils.confusables import standard_character_substitutions | ||
|
||
|
||
class SubstitutionsPreprocessor(Preprocessor):
    """Narrow the character-substitution table to what the text contains.

    For every entry of the substitution table, only the substitute strings
    that actually occur in the text being processed are kept; entries with
    no occurring substitute are dropped altogether. The narrowed table is
    stored back in the context under ``'character_substitutions'``.
    """

    def split(self, list):
        """Return the first and second half of *list* as a 2-tuple.

        The first half receives ``len(list) // 2`` items, so for an odd
        length the second half is the longer one.

        NOTE: the parameter name shadows the ``list`` builtin; it is kept
        unchanged so existing keyword callers keep working. This helper is
        no longer used by ``find_valid_subs`` and is retained only for
        backward compatibility.
        """
        middle = len(list) // 2
        return list[:middle], list[middle:]

    def find_valid_subs(self, text, list_of_characters):
        """Return the entries of *list_of_characters* that occur in *text*.

        The previous implementation interpolated the list itself into a
        regular expression. That turned the list's ``repr`` into a character
        class which also contained quotes, commas and spaces (so almost any
        text "matched"), raised ``re.error`` for an empty list, and broke on
        candidates containing regex metacharacters. A plain substring test
        has none of these problems.

        Args:
            text: The string to search.
            list_of_characters: Candidate substitute strings (assumed to be
                ``str`` items -- confirm against the substitution table).

        Returns:
            A new list with the candidates found in *text*, in their
            original order; empty when none occur or no candidates are given.
        """
        return [
            candidate
            for candidate in list_of_characters
            if candidate in text
        ]

    def process(self, text: str, context: dict):
        """Filter the substitution table in *context* against *text*.

        Args:
            text: The text being preprocessed; it is returned unchanged.
            context: Shared context. ``context['character_substitutions']``
                is used when present, otherwise
                ``standard_character_substitutions`` is the starting table.

        Returns:
            The tuple ``(text, context)``, where
            ``context['character_substitutions']`` now holds only the keys
            having at least one substitute present in *text*, each mapped
            to the list of substitutes that were found.
        """
        try:
            cur_substitutions = context['character_substitutions']
        except KeyError:
            cur_substitutions = standard_character_substitutions

        # Work on a shallow copy so the source table (possibly the shared
        # module-level default) is never mutated while it is being iterated.
        final_subs = cur_substitutions.copy()
        for key, value in cur_substitutions.items():
            valid_subs = self.find_valid_subs(text, value)
            if valid_subs:
                final_subs[key] = valid_subs
            else:
                final_subs.pop(key, None)

        context['character_substitutions'] = final_subs
        return text, context
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters