From c9f1c2b7bf6ecbf90012591a9ffe45090aed7fa3 Mon Sep 17 00:00:00 2001
From: garrethlee <garreth.lee@huggingface.co>
Date: Sat, 21 Dec 2024 23:35:09 +0000
Subject: [PATCH] style: fixed lint errors

---
 src/datatrove/pipeline/extractors/justext.py     | 2 +-
 src/datatrove/pipeline/extractors/readabilipy.py | 4 ++--
 src/datatrove/pipeline/extractors/resiliparse.py | 3 ++-
 src/datatrove/pipeline/extractors/trafilatura.py | 5 +++--
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/datatrove/pipeline/extractors/justext.py b/src/datatrove/pipeline/extractors/justext.py
index 05e71914..43e0116a 100644
--- a/src/datatrove/pipeline/extractors/justext.py
+++ b/src/datatrove/pipeline/extractors/justext.py
@@ -56,7 +56,7 @@ def get_stoplist(lang: str = "English") -> list[str]:
         from justext import get_stoplist
 
         return get_stoplist(lang)
-    
+
     def clean_html(self, html: str) -> str:
         """
 
diff --git a/src/datatrove/pipeline/extractors/readabilipy.py b/src/datatrove/pipeline/extractors/readabilipy.py
index c5e345b7..8a0f32df 100644
--- a/src/datatrove/pipeline/extractors/readabilipy.py
+++ b/src/datatrove/pipeline/extractors/readabilipy.py
@@ -40,7 +40,7 @@ def clean_html(self, html: str) -> str:
 
         Returns: cleaned HTML
         """
-        from readabilipy import simple_tree_from_html_string    
+        from readabilipy import simple_tree_from_html_string
 
         result = simple_tree_from_html_string(html)
         return str(result)
@@ -53,7 +53,7 @@ def extract(self, text: str) -> str:
 
         Returns: plaintext extracted text
         """
-        from readabilipy.simple_json import plain_content, extract_text_blocks_as_plain_text
+        from readabilipy.simple_json import extract_text_blocks_as_plain_text, plain_content
 
         cleaned_html = self.clean_html(text)
 
diff --git a/src/datatrove/pipeline/extractors/resiliparse.py b/src/datatrove/pipeline/extractors/resiliparse.py
index 574d44de..008fe3d0 100644
--- a/src/datatrove/pipeline/extractors/resiliparse.py
+++ b/src/datatrove/pipeline/extractors/resiliparse.py
@@ -1,6 +1,7 @@
-from .base import BaseExtractor
 from datatrove.utils.logging import logger
 
+from .base import BaseExtractor
+
 
 class Resiliparse(BaseExtractor):
     """
diff --git a/src/datatrove/pipeline/extractors/trafilatura.py b/src/datatrove/pipeline/extractors/trafilatura.py
index d02cecd1..12cf8efd 100644
--- a/src/datatrove/pipeline/extractors/trafilatura.py
+++ b/src/datatrove/pipeline/extractors/trafilatura.py
@@ -34,7 +34,7 @@ def __init__(
         self.kwargs = kwargs
         if self.include_images:
             raise NotImplementedError
-        
+
     def clean_html(self, html: str) -> str:
         """
 
@@ -43,9 +43,10 @@ def clean_html(self, html: str) -> str:
 
         Returns: cleaned HTML
         """
-        from trafilatura import bare_extraction
         from xml.etree import ElementTree
 
+        from trafilatura import bare_extraction
+
         html_body = bare_extraction(html, favor_precision=self.favour_precision, **self.kwargs)['body']
         cleaned_html = ElementTree.tostring(html_body, encoding = "unicode")
         return cleaned_html