This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Tokenizer respects filters when char_level is True #302

Open · wants to merge 8 commits into base: master
5 changes: 5 additions & 0 deletions keras_preprocessing/text.py
@@ -222,9 +222,14 @@ def fit_on_texts(self, texts):
                 a generator of strings (for memory-efficiency),
                 or a list of list of strings.
         """
+        filtered_characters = set(self.filters)
         for text in texts:
             self.document_count += 1
             if self.char_level or isinstance(text, list):
+                if not isinstance(text, list):
+                    text = "".join(char
+                                   for char in text
+                                   if char not in filtered_characters)
                 if self.lower:
                     if isinstance(text, list):
                         text = [text_elem.lower() for text_elem in text]
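For context, a minimal usage sketch of the behaviour this change enables (it mirrors the new test below and is not code from the PR itself): with char_level=True, characters listed in filters no longer end up in the tokenizer's vocabulary.

    from keras_preprocessing.text import Tokenizer

    # Characters listed in `filters` are stripped before character-level counting.
    tokenizer = Tokenizer(filters="e", char_level=True)
    tokenizer.fit_on_texts(["ae", "er"])

    # Prior to this patch 'e' would be counted; with it, 'e' is filtered out.
    assert "e" not in tokenizer.word_index
    assert "a" in tokenizer.word_index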
14 changes: 11 additions & 3 deletions tests/text_test.py
@@ -184,6 +184,15 @@ def test_tokenizer_oov_flag_and_num_words():
     assert trans_text == 'this <unk> <unk> <unk> <unk> <unk>'


+@pytest.mark.parametrize("x_train", ("ae", ["ae", "er"]))
+def test_tokenizer_filter_char_level(x_train):
+    """It does not tokenize filtered characters at the character level.
+    """
+    tokenizer = text.Tokenizer(filters="e", char_level=True)
+    tokenizer.fit_on_texts(x_train)
+    assert "e" not in tokenizer.word_index
+
+
 def test_sequences_to_texts_with_num_words_and_oov_token():
     x_train = ['This text has only known words this text']
     x_test = ['This text has some unknown words']
@@ -284,9 +293,8 @@ def test_tokenizer_lower_flag():
     char_tokenizer.fit_on_texts(texts)
     expected_word_counts = OrderedDict([('t', 11), ('h', 5), ('e', 6), (' ', 14),
                                         ('c', 2), ('a', 6), ('s', 2), ('o', 6),
-                                        ('n', 4), ('m', 1), ('.', 3), ('d', 3),
-                                        ('g', 5), ('l', 2), ('i', 2), ('v', 1),
-                                        ('r', 1)])
+                                        ('n', 4), ('m', 1), ('d', 3), ('g', 5),
+                                        ('l', 2), ('i', 2), ('v', 1), ('r', 1)])
     assert char_tokenizer.word_counts == expected_word_counts
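The ('.', 3) entry disappears from the expected counts because '.' belongs to the Tokenizer's default filters string, so after this patch it is stripped even at the character level. A small illustrative check of that side effect (an assumption based on the default filters, not part of the PR):

    from keras_preprocessing.text import Tokenizer

    # The default `filters` argument includes punctuation such as '.'.
    char_tokenizer = Tokenizer(char_level=True)
    char_tokenizer.fit_on_texts(['The cat sat on the mat.'])

    # With the patch applied, '.' no longer appears in the character counts.
    assert '.' not in char_tokenizer.word_counts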

