From f8e78f536188b9d8047d5eb459b789b4b0f2172a Mon Sep 17 00:00:00 2001 From: guipenedo Date: Mon, 20 Jan 2025 16:49:37 +0100 Subject: [PATCH] fixes stopwords implementation... --- src/datatrove/pipeline/filters/gopher_quality_filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datatrove/pipeline/filters/gopher_quality_filter.py b/src/datatrove/pipeline/filters/gopher_quality_filter.py index 9e5e68b6..4dc33384 100644 --- a/src/datatrove/pipeline/filters/gopher_quality_filter.py +++ b/src/datatrove/pipeline/filters/gopher_quality_filter.py @@ -119,7 +119,7 @@ def filter(self, doc: Document) -> bool | tuple[bool, str]: return False, "gopher_below_alpha_threshold" # stop word filter - if self.min_stop_words and sum(w in self.stop_words for w in words) < self.min_stop_words: + if self.min_stop_words and len(self.stop_words.intersection(set(words))) < self.min_stop_words: return False, "gopher_enough_stop_words" return True