Skip to content

Commit

Permalink
improved test case robustness
Browse files Browse the repository at this point in the history
  • Loading branch information
garrethlee committed Dec 22, 2024
1 parent 56a71ed commit cd18c59
Show file tree
Hide file tree
Showing 3 changed files with 171 additions and 23 deletions.
1 change: 0 additions & 1 deletion src/datatrove/pipeline/extractors/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from .inscriptis import Inscriptis
from .justext import Justext
from .modular import ReadabilityInscriptis
from .readabilipy import ReadabiliPy
from .readability import Readability
from .resiliparse import Resiliparse
Expand Down
5 changes: 4 additions & 1 deletion src/datatrove/pipeline/extractors/inscriptis.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ def __init__(
self.kwargs = kwargs
self.regex_excessive_lines = re.compile(r"(" + self.new_line_chars + "\n+)")

def clean_html(self, html: str) -> str:
    """Return ``html`` after cleaning it with this extractor's preprocessor.

    This is a thin hook around ``self.preprocessor.clean_html`` so that the
    cleaning step can be called (or overridden) independently of
    ``extract``.

    Args:
        html: Raw HTML markup to clean.

    Returns:
        Whatever ``self.preprocessor.clean_html(html)`` returns (annotated
        as ``str``).
    """
    # NOTE(review): ``self.preprocessor`` is presumably set up in
    # ``__init__``, which is not visible here - confirm.
    return self.preprocessor.clean_html(html)

def extract(self, text: str) -> str:
"""
Args:
Expand All @@ -50,7 +53,7 @@ def extract(self, text: str) -> str:
from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.model.config import ParserConfig

cleaned_html = self.preprocessor.clean_html(text)
cleaned_html = self.clean_html(text)

text = get_text(
html_content=cleaned_html,
Expand Down
Loading

0 comments on commit cd18c59

Please sign in to comment.