Skip to content

Commit

Permalink
changed trafilatura default args
Browse files: browse the repository at this point in the history
  • Loading branch information
garrethlee committed Dec 28, 2024
1 parent aae7e33 commit f816913
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion src/datatrove/pipeline/extractors/trafilatura.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -22,13 +22,15 @@ class Trafilatura(BaseExtractor):
def __init__(
self,
favour_precision: bool = True,
include_comments: bool = False,
include_images: bool = False,
timeout: float = 0.1,
deduplicate: bool = True,
**kwargs,
):
super().__init__(timeout)
self.favour_precision = favour_precision
self.include_comments = include_comments
self.include_images = include_images
self.deduplicate = deduplicate
self.kwargs = kwargs
Expand Down Expand Up @@ -65,7 +67,7 @@ def extract(self, text: str) -> str:
return extract(
text,
favor_precision=self.favour_precision,
- include_comments=False,
+ include_comments=self.include_comments,
deduplicate=self.deduplicate,
**self.kwargs,
)

0 comments on commit f816913

Please sign in to comment.