Skip to content

Commit

Permalink
style: fixed lint errors
Browse files (browse the repository at this point in the history)
  • Loading branch information
garrethlee committed Dec 21, 2024
1 parent 26bf413 commit c9f1c2b
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 6 deletions.
2 changes: 1 addition & 1 deletion src/datatrove/pipeline/extractors/justext.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def get_stoplist(lang: str = "English") -> list[str]:
from justext import get_stoplist

return get_stoplist(lang)

def clean_html(self, html: str) -> str:
"""
Expand Down
4 changes: 2 additions & 2 deletions src/datatrove/pipeline/extractors/readabilipy.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def clean_html(self, html: str) -> str:
Returns: cleaned HTML
"""
- from readabilipy import simple_tree_from_html_string
+ from readabilipy import simple_tree_from_html_string

result = simple_tree_from_html_string(html)
return str(result)
Expand All @@ -53,7 +53,7 @@ def extract(self, text: str) -> str:
Returns: plaintext extracted text
"""
- from readabilipy.simple_json import plain_content, extract_text_blocks_as_plain_text
+ from readabilipy.simple_json import extract_text_blocks_as_plain_text, plain_content

cleaned_html = self.clean_html(text)

Expand Down
3 changes: 2 additions & 1 deletion src/datatrove/pipeline/extractors/resiliparse.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
- from .base import BaseExtractor
  from datatrove.utils.logging import logger
+
+ from .base import BaseExtractor


class Resiliparse(BaseExtractor):
"""
Expand Down
5 changes: 3 additions & 2 deletions src/datatrove/pipeline/extractors/trafilatura.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __init__(
self.kwargs = kwargs
if self.include_images:
raise NotImplementedError

def clean_html(self, html: str) -> str:
"""
Expand All @@ -43,9 +43,10 @@ def clean_html(self, html: str) -> str:
Returns: cleaned HTML
"""
- from trafilatura import bare_extraction
  from xml.etree import ElementTree
+
+ from trafilatura import bare_extraction

html_body = bare_extraction(html, favor_precision=self.favour_precision, **self.kwargs)['body']
cleaned_html = ElementTree.tostring(html_body, encoding = "unicode")
return cleaned_html
Expand Down

0 comments on commit c9f1c2b

Please sign in to comment.