Add TrafilaturaExtractor class #431
base: main
Changes from all commits: 632df99, 51b1145, db8c5b2, 877d6ae, b85cb56, 4d447a1, b3fb664
@@ -17,6 +17,7 @@
import subprocess
import unicodedata
from abc import ABC, abstractmethod
from copy import deepcopy
from typing import Literal, Optional
from urllib.parse import urlparse

@@ -25,6 +26,8 @@
import pycld2 as cld2
from charset_normalizer import detect
from resiliparse.extract.html2text import extract_plain_text
from trafilatura import extract as extract_with_trafilatura
from trafilatura.settings import DEFAULT_CONFIG as TRAFILATURA_DEFAULT_CONFIG
from warcio.archiveiterator import ArchiveIterator

from nemo_curator.datasets import DocumentDataset

@@ -200,6 +203,98 @@ def extract_text(self, html, stop_words):
        return result

class TrafilaturaExtractor(HTMLExtractorAlgorithm):
Review comment: Can you add a class level docstring explaining what trafilatura is?
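A class-level docstring along these lines could address this; the wording below is an illustrative sketch, not part of the diff:

    """
    Extract text from HTML using Trafilatura (https://trafilatura.readthedocs.io/),
    a Python library that pulls the main text content (and optionally comments)
    out of web pages while discarding boilerplate such as navigation menus,
    headers, footers, and ads. Extraction thresholds (minimum sizes, duplicate
    checks, repetition limits) are configurable through Trafilatura's settings.
    """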
    def __init__(
        self,
        required_stopword_density=0.32,
        min_extracted_size=250,
        min_extracted_comm_size=1,
        min_output_size=1,
        min_output_comm_size=1,
        max_tree_size=None,
        min_duplcheck_size=100,
        max_repetitions=2,
        **extract_kwargs,
    ):
        """
        Initialize the Trafilatura text extraction algorithm with the specified parameters.

        Args:
            required_stopword_density: Proportion of stopwords required to preserve an extracted paragraph.
                Studies on stopword lists and their distribution in various text corpora often
                suggest that around 30-40% of a typical English text consists of stopwords.
            min_extracted_size: Acceptable size in characters (used to trigger fallbacks).
                Defaults to 250. See the Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
            min_extracted_comm_size: Works the same as min_extracted_size, but for comment extraction.
                Defaults to 1. See the Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
            min_output_size: Absolute acceptable minimum for main text output.
                Defaults to 1. See the Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
            min_output_comm_size: Works the same as min_output_size, but for comment extraction.
                Defaults to 1. See the Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
            max_tree_size: Used to discard documents with too many elements. Defaults to None.
            min_duplcheck_size: Minimum size in characters to run deduplication on.
                Defaults to 100. See the Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
            max_repetitions: Maximum number of duplicates allowed.
                Defaults to 2. See the Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
            extract_kwargs: Additional keyword arguments for the Trafilatura extract function.
                See the API documentation at https://trafilatura.readthedocs.io/en/latest/usage-python.html#choice-of-html-elements
                for a list of possible parameters.

        """
        self.required_stopword_density = required_stopword_density
        self.min_extracted_size = min_extracted_size
        self.min_extracted_comm_size = min_extracted_comm_size
        self.min_output_size = min_output_size
        self.min_output_comm_size = min_output_comm_size
        self.max_tree_size = max_tree_size
        self.min_duplcheck_size = min_duplcheck_size
        self.max_repetitions = max_repetitions
        self.extract_kwargs = extract_kwargs
    def extract_text(self, html, stop_words):
        trafilatura_config = deepcopy(TRAFILATURA_DEFAULT_CONFIG)
        trafilatura_config["DEFAULT"]["MIN_EXTRACTED_SIZE"] = str(
            self.min_extracted_size
        )
        trafilatura_config["DEFAULT"]["MIN_EXTRACTED_COMM_SIZE"] = str(
            self.min_extracted_comm_size
        )
        trafilatura_config["DEFAULT"]["MIN_OUTPUT_SIZE"] = str(self.min_output_size)
        trafilatura_config["DEFAULT"]["MIN_OUTPUT_COMM_SIZE"] = str(
            self.min_output_comm_size
        )
        if self.max_tree_size:
            trafilatura_config["DEFAULT"]["MAX_TREE_SIZE"] = str(self.max_tree_size)
        trafilatura_config["DEFAULT"]["MIN_DUPLCHECK_SIZE"] = str(
            self.min_duplcheck_size
        )
        trafilatura_config["DEFAULT"]["MAX_REPETITIONS"] = str(self.max_repetitions)

        text = extract_with_trafilatura(
            html, config=trafilatura_config, **self.extract_kwargs
        )

        if text is not None:
            paragraphs = list(filter(None, text.split("\n")))
            result = []
            for paragraph in paragraphs:
                words = paragraph.split()
                length = len(words)
                if length == 0:
                    continue
                stopwords = [word for word in words if word in stop_words]
                stopword_density = len(stopwords) / length

                if stopword_density >= self.required_stopword_density:
                    result.append(paragraph)
        else:
            return None

        if len(result) == 0:
            return None
        return result

def get_stop_list_dict(languages=[]):

    # Name mapping for language names from CLD2 (values)
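As a rough usage sketch of the new extractor on its own (the sample HTML and the import of get_stop_list_dict from nemo_curator.download.commoncrawl are assumptions based on this diff and the tests, not part of the PR):

    from nemo_curator.download import TrafilaturaExtractor
    from nemo_curator.download.commoncrawl import get_stop_list_dict

    # A tiny made-up page; real Common Crawl records are full HTML documents.
    html = (
        "<html><body><p>This is a sample paragraph where most of the words are "
        "common English stopwords such as the, and, of, to, in, that, it.</p></body></html>"
    )

    # Lower min_extracted_size because the default of 250 characters would reject such a tiny page.
    algorithm = TrafilaturaExtractor(min_extracted_size=10)
    stop_words = get_stop_list_dict()  # maps language names (e.g., "ENGLISH") to stopword lists

    paragraphs = algorithm.extract_text(html, stop_words["ENGLISH"])
    # extract_text returns a list of paragraphs meeting required_stopword_density,
    # or None when Trafilatura extracts nothing or every paragraph falls below the threshold.
    print(paragraphs)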
@@ -387,7 +482,8 @@ def download_common_crawl(
        end_snapshot (str): Identifier for the latest snapshot to process, which must be chronologically after start_snapshot.
        output_type (Literal["jsonl", "parquet"]): The file format for the extracted output. Must be either "jsonl" or "parquet".
            • This is not used for the output file, but is used to check if an extracted output already exists.
        algorithm: The text extraction algorithm instance (e.g., JusTextExtractor or ResiliparseExtractor) to use for HTML processing.
        algorithm: The text extraction algorithm instance to use for HTML processing.
            • This can be a JusTextExtractor (default), ResiliparseExtractor, or TrafilaturaExtractor object.
        news (bool): When True, indicates that URLs should be retrieved from the CC-NEWS dataset.
            • This also means snapshot identifiers should follow the 'YYYY-MM' format.
        aws (bool): If True, downloads are sourced from Common Crawl's S3 bucket using s5cmd;
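Combined with the downloader whose docstring is updated above, a call might look roughly like this; the positional arguments, output path, and snapshot identifiers are illustrative guesses based on the docstring fragment, not the verified signature:

    from nemo_curator.download import TrafilaturaExtractor, download_common_crawl

    # Illustrative values; see the download_common_crawl docstring for the full parameter list.
    dataset = download_common_crawl(
        "/output/common_crawl",  # assumed output directory argument
        "2021-04",               # start_snapshot
        "2021-10",               # end_snapshot (must be chronologically after start_snapshot)
        output_type="jsonl",
        algorithm=TrafilaturaExtractor(),  # instead of the default JusTextExtractor
    )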
@@ -19,7 +19,11 @@

import pytest

from nemo_curator.download import ResiliparseExtractor, download_and_extract
from nemo_curator.download import (
    ResiliparseExtractor,
    TrafilaturaExtractor,
    download_and_extract,
)
from nemo_curator.download.arxiv import ArxivDownloader, ArxivExtractor, ArxivIterator
from nemo_curator.download.commoncrawl import (
    CommonCrawlWARCDownloader,
@@ -54,68 +58,73 @@ def fake_run_success(cmd, stdout, stderr):
    return FakeCompletedProcess()

@pytest.fixture
def html_string():
    # Modified from https://github.com/chatnoir-eu/chatnoir-resiliparse/blob/abdf1966fb3cefe3e0790e510ab5cb1446f99a79/tests/resiliparse/extract/test_html2text.py
    html = """<!doctype html>
<head>
<title>My Title</title>
<meta charset="utf-8">
<style>* { margin: 0; }</style>
</head>
<body>
<section id="wrapper">
<nav>
<ul>
<li>Nav 1</li>
<li>
<p>Nav 2</p>
<ul>
<li><p>Nav 3</p></li>
</ul>
</li>
</ul>
</nav>
<main>
This is a sample paragraph. In it we write words.
These are stopwords: because did than has near we almost while what still.
<a href="#foo" hidden>bar</a>

<p>
This paragraph doesn't have many stopwords. Remove it.
<br>Let's keep this paragraph: either came does last new took taken making became from.
</p>

<button aria-hidden="true">Click here</button>
<input type="hidden" value="foo">
<input type="text" value="Some text" placeholder="Insert text">
<input type="text" placeholder="Insert text">
<img src="" alt="Some image">
<object data="" class="some-class hidden">Cannot display object</object>
</main>
<script language="vbscript" type="text/vbscript">MsgBox("Hello World!")</script>
<noscript>Sorry, your browser doesn't support VB Script!</noscript>
<div><div><div><footer id="global-footer">
Copyright (C) 2021 Foo Bar
</footer></div></div></div>
</section>
</body>
</html>"""
    return html

class TestDownload:
    def test_imports(self):
        from nemo_curator.download import (
            JusTextExtractor,
            ResiliparseExtractor,
            TrafilaturaExtractor,
            download_arxiv,
            download_common_crawl,
            download_wikipedia,
        )

        assert True
    def test_resiliparse_extract_text(self):
        # Modified from https://github.com/chatnoir-eu/chatnoir-resiliparse/blob/abdf1966fb3cefe3e0790e510ab5cb1446f99a79/tests/resiliparse/extract/test_html2text.py
        html = """<!doctype html>
<head>
<title>My Title</title>
<meta charset="utf-8">
<style>* { margin: 0; }</style>
</head>
<body>
<section id="wrapper">
<nav>
<ul>
<li>Nav 1</li>
<li>
<p>Nav 2</p>
<ul>
<li><p>Nav 3</p></li>
</ul>
</li>
</ul>
</nav>
<main>
This is a sample paragraph. In it we write words.
These are stopwords: because did than has near we almost while what still.
<a href="#foo" hidden>bar</a>

<p>
This paragraph doesn't have many stopwords. Remove it.
<br>Let's keep this paragraph: either came does last new took taken making became from.
</p>

<button aria-hidden="true">Click here</button>
<input type="hidden" value="foo">
<input type="text" value="Some text" placeholder="Insert text">
<input type="text" placeholder="Insert text">
<img src="" alt="Some image">
<object data="" class="some-class hidden">Cannot display object</object>
</main>
<script language="vbscript" type="text/vbscript">MsgBox("Hello World!")</script>
<noscript>Sorry, your browser doesn't support VB Script!</noscript>
<div><div><div><footer id="global-footer">
Copyright (C) 2021 Foo Bar
</footer></div></div></div>
</section>
</body>
</html>"""

    def test_resiliparse_extract_text(self, html_string):
        algorithm = ResiliparseExtractor()
        stop_words = get_stop_list_dict()
        result = algorithm.extract_text(html, stop_words["ENGLISH"])
        result = algorithm.extract_text(html_string, stop_words["ENGLISH"])

        expected = [
            "This is a sample paragraph. In it we write words. These are stopwords: because did than has near we almost while what still.",
@@ -124,6 +133,22 @@ def test_resiliparse_extract_text(self):

        assert result == expected

    def test_trafilatura_extract_text(self, html_string):
        algorithm = TrafilaturaExtractor(
            min_extracted_size=10,
            min_duplcheck_size=10,
            max_repetitions=1,
            deduplicate=True,
        )
        stop_words = get_stop_list_dict()
        result = algorithm.extract_text(html_string, stop_words["ENGLISH"])

        expected = [
            "Let's keep this paragraph: either came does last new took taken making became from.",
        ]
+146
to
+148
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Trafilatura has a really bad bug where it is returning the string twice. I can double check all of my logic, but in the case that this is a Trafilatura-specific issue, I am debating whether I should add our own exact deduplication code into this. (Trafilatura actually has their own Possible related issue: adbar/trafilatura#634 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have not scoped to see what conditions cause this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another issue: adbar/trafilatura#768 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Solved by adding support for these extraction and deduplication parameters: https://trafilatura.readthedocs.io/en/latest/settings.html. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems like an issue customers might run into. Have you looked in papers and such for good default values? If Trafilatura's defaults aren't good (or aren't used by most researchers), it could be good to substitute our own. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I can look into it. At the very least, I think setting |

        assert result == expected

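For context on the deduplication thread above: Trafilatura's duplicate filtering is driven by the deduplicate flag of its extract() function (which TrafilaturaExtractor forwards via **extract_kwargs) together with the MIN_DUPLCHECK_SIZE and MAX_REPETITIONS settings. A standalone sketch with made-up threshold values:

    from copy import deepcopy

    from trafilatura import extract
    from trafilatura.settings import DEFAULT_CONFIG

    html = "<html><body><p>Repeated text. Repeated text. Repeated text.</p></body></html>"

    # Copy the defaults so the shared configuration object is not mutated.
    config = deepcopy(DEFAULT_CONFIG)
    config["DEFAULT"]["MIN_DUPLCHECK_SIZE"] = "10"  # also run the duplicate check on short segments
    config["DEFAULT"]["MAX_REPETITIONS"] = "1"      # tolerate at most one repetition of a segment

    # deduplicate=True enables Trafilatura's own duplicate filtering during extraction.
    text = extract(html, config=config, deduplicate=True)
    print(text)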
    def test_incorrect_snapshot_order(self):
        with pytest.raises(ValueError):
            end_snapshot = "2021-04"
Review comment: Why should the user see the Trafilatura documentation?

Reply: I think this was a lingering comment meant to explain what the different Trafilatura-specific parameters mean. I will remove it in favor of the class docstring suggested below.