From c9f1c2b7bf6ecbf90012591a9ffe45090aed7fa3 Mon Sep 17 00:00:00 2001 From: garrethlee Date: Sat, 21 Dec 2024 23:35:09 +0000 Subject: [PATCH] style: fixed lint errors --- src/datatrove/pipeline/extractors/justext.py | 2 +- src/datatrove/pipeline/extractors/readabilipy.py | 4 ++-- src/datatrove/pipeline/extractors/resiliparse.py | 3 ++- src/datatrove/pipeline/extractors/trafilatura.py | 5 +++-- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/datatrove/pipeline/extractors/justext.py b/src/datatrove/pipeline/extractors/justext.py index 05e71914..43e0116a 100644 --- a/src/datatrove/pipeline/extractors/justext.py +++ b/src/datatrove/pipeline/extractors/justext.py @@ -56,7 +56,7 @@ def get_stoplist(lang: str = "English") -> list[str]: from justext import get_stoplist return get_stoplist(lang) - + def clean_html(self, html: str) -> str: """ diff --git a/src/datatrove/pipeline/extractors/readabilipy.py b/src/datatrove/pipeline/extractors/readabilipy.py index c5e345b7..8a0f32df 100644 --- a/src/datatrove/pipeline/extractors/readabilipy.py +++ b/src/datatrove/pipeline/extractors/readabilipy.py @@ -40,7 +40,7 @@ def clean_html(self, html: str) -> str: Returns: cleaned HTML """ - from readabilipy import simple_tree_from_html_string + from readabilipy import simple_tree_from_html_string result = simple_tree_from_html_string(html) return str(result) @@ -53,7 +53,7 @@ def extract(self, text: str) -> str: Returns: plaintext extracted text """ - from readabilipy.simple_json import plain_content, extract_text_blocks_as_plain_text + from readabilipy.simple_json import extract_text_blocks_as_plain_text, plain_content cleaned_html = self.clean_html(text) diff --git a/src/datatrove/pipeline/extractors/resiliparse.py b/src/datatrove/pipeline/extractors/resiliparse.py index 574d44de..008fe3d0 100644 --- a/src/datatrove/pipeline/extractors/resiliparse.py +++ b/src/datatrove/pipeline/extractors/resiliparse.py @@ -1,6 +1,7 @@ -from .base import BaseExtractor from datatrove.utils.logging import logger +from .base import BaseExtractor + class Resiliparse(BaseExtractor): """ diff --git a/src/datatrove/pipeline/extractors/trafilatura.py b/src/datatrove/pipeline/extractors/trafilatura.py index d02cecd1..12cf8efd 100644 --- a/src/datatrove/pipeline/extractors/trafilatura.py +++ b/src/datatrove/pipeline/extractors/trafilatura.py @@ -34,7 +34,7 @@ def __init__( self.kwargs = kwargs if self.include_images: raise NotImplementedError - + def clean_html(self, html: str) -> str: """ @@ -43,9 +43,10 @@ def clean_html(self, html: str) -> str: Returns: cleaned HTML """ - from trafilatura import bare_extraction from xml.etree import ElementTree + from trafilatura import bare_extraction + html_body = bare_extraction(html, favor_precision=self.favour_precision, **self.kwargs)['body'] cleaned_html = ElementTree.tostring(html_body, encoding = "unicode") return cleaned_html