style: fixed lint errors

huggingface · Dec 21, 2024 · c9f1c2b
1 parent 26bf413
commit c9f1c2b
Show file tree

Hide file tree

Showing 4 changed files with 8 additions and 6 deletions.
diff --git a/src/datatrove/pipeline/extractors/justext.py b/src/datatrove/pipeline/extractors/justext.py
@@ -56,7 +56,7 @@ def get_stoplist(lang: str = "English") -> list[str]:
         from justext import get_stoplist
 
         return get_stoplist(lang)
-    
+
     def clean_html(self, html: str) -> str:
         """
 

diff --git a/src/datatrove/pipeline/extractors/readabilipy.py b/src/datatrove/pipeline/extractors/readabilipy.py
@@ -40,7 +40,7 @@ def clean_html(self, html: str) -> str:
 
         Returns: cleaned HTML
         """
-        from readabilipy import simple_tree_from_html_string    
+        from readabilipy import simple_tree_from_html_string
 
         result = simple_tree_from_html_string(html)
         return str(result)
@@ -53,7 +53,7 @@ def extract(self, text: str) -> str:
 
         Returns: plaintext extracted text
         """
-        from readabilipy.simple_json import plain_content, extract_text_blocks_as_plain_text
+        from readabilipy.simple_json import extract_text_blocks_as_plain_text, plain_content
 
         cleaned_html = self.clean_html(text)
 

diff --git a/src/datatrove/pipeline/extractors/resiliparse.py b/src/datatrove/pipeline/extractors/resiliparse.py
@@ -1,6 +1,7 @@
-from .base import BaseExtractor
 from datatrove.utils.logging import logger
 
+from .base import BaseExtractor
+
 
 class Resiliparse(BaseExtractor):
     """

diff --git a/src/datatrove/pipeline/extractors/trafilatura.py b/src/datatrove/pipeline/extractors/trafilatura.py
@@ -34,7 +34,7 @@ def __init__(
         self.kwargs = kwargs
         if self.include_images:
             raise NotImplementedError
-        
+
     def clean_html(self, html: str) -> str:
         """
 
@@ -43,9 +43,10 @@ def clean_html(self, html: str) -> str:
 
         Returns: cleaned HTML
         """
-        from trafilatura import bare_extraction
         from xml.etree import ElementTree
 
+        from trafilatura import bare_extraction
+
         html_body = bare_extraction(html, favor_precision=self.favour_precision, **self.kwargs)['body']
         cleaned_html = ElementTree.tostring(html_body, encoding = "unicode")
         return cleaned_html
-Original file line number
+Diff line change
@@ Expand Up / @@ -56,7 +56,7 @@ def get_stoplist(lang: str = "English") -> list[str]: @@
             from justext import get_stoplist
             return get_stoplist(lang)
         def clean_html(self, html: str) -> str:
             """
@@ Expand Down @@