diff --git a/src/datatrove/pipeline/filters/gopher_quality_filter.py b/src/datatrove/pipeline/filters/gopher_quality_filter.py index 9e5e68b6..4dc33384 100644 --- a/src/datatrove/pipeline/filters/gopher_quality_filter.py +++ b/src/datatrove/pipeline/filters/gopher_quality_filter.py @@ -119,7 +119,7 @@ def filter(self, doc: Document) -> bool | tuple[bool, str]: return False, "gopher_below_alpha_threshold" # stop word filter - if self.min_stop_words and sum(w in self.stop_words for w in words) < self.min_stop_words: + if self.min_stop_words and len(self.stop_words.intersection(set(words))) < self.min_stop_words: return False, "gopher_enough_stop_words" return True