This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Tokenizer respects filters when char_level is True #302

Open · wants to merge 8 commits into base: master
5 changes: 5 additions & 0 deletions keras_preprocessing/text.py
@@ -222,9 +222,14 @@ def fit_on_texts(self, texts):
                 a generator of strings (for memory-efficiency),
                 or a list of list of strings.
         """
+        filtered_characters = set(self.filters)
         for text in texts:
             self.document_count += 1
             if self.char_level or isinstance(text, list):
+                if not isinstance(text, list):
+                    text = "".join(char
+                                   for char in text
+                                   if char not in filtered_characters)
                 if self.lower:
                     if isinstance(text, list):
                         text = [text_elem.lower() for text_elem in text]
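For context, a minimal usage sketch of the behaviour this change enables (it mirrors the new test below and is not code from the PR itself): with char_level=True, characters listed in filters no longer end up in the tokenizer's vocabulary.

    from keras_preprocessing.text import Tokenizer

    # Characters listed in `filters` are stripped before character-level counting.
    tokenizer = Tokenizer(filters="e", char_level=True)
    tokenizer.fit_on_texts(["ae", "er"])

    # Prior to this patch 'e' would be counted; with it, 'e' is filtered out.
    assert "e" not in tokenizer.word_index
    assert "a" in tokenizer.word_index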
14 changes: 11 additions & 3 deletions tests/text_test.py
@@ -184,6 +184,15 @@ def test_tokenizer_oov_flag_and_num_words():
     assert trans_text == 'this <unk> <unk> <unk> <unk> <unk>'


+@pytest.mark.parametrize("x_train", ("ae", ["ae", "er"]))
+def test_tokenizer_filter_char_level(x_train):
+    """It does not tokenize filtered characters at the character level.
+    """
+    tokenizer = text.Tokenizer(filters="e", char_level=True)
+    tokenizer.fit_on_texts(x_train)
+    assert "e" not in tokenizer.word_index
+
+
 def test_sequences_to_texts_with_num_words_and_oov_token():
     x_train = ['This text has only known words this text']
     x_test = ['This text has some unknown words']
@@ -284,9 +293,8 @@ def test_tokenizer_lower_flag():
     char_tokenizer.fit_on_texts(texts)
     expected_word_counts = OrderedDict([('t', 11), ('h', 5), ('e', 6), (' ', 14),
                                         ('c', 2), ('a', 6), ('s', 2), ('o', 6),
-                                        ('n', 4), ('m', 1), ('.', 3), ('d', 3),
-                                        ('g', 5), ('l', 2), ('i', 2), ('v', 1),
-                                        ('r', 1)])
+                                        ('n', 4), ('m', 1), ('d', 3), ('g', 5),
+                                        ('l', 2), ('i', 2), ('v', 1), ('r', 1)])
     assert char_tokenizer.word_counts == expected_word_counts
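The ('.', 3) entry disappears from the expected counts because '.' belongs to the Tokenizer's default filters string, so after this patch it is stripped even at the character level. A small illustrative check of that side effect (an assumption based on the default filters, not part of the PR):

    from keras_preprocessing.text import Tokenizer

    # The default `filters` argument includes punctuation such as '.'.
    char_tokenizer = Tokenizer(char_level=True)
    char_tokenizer.fit_on_texts(['The cat sat on the mat.'])

    # With the patch applied, '.' no longer appears in the character counts.
    assert '.' not in char_tokenizer.word_counts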

