diff --git a/docs/user-guide/api/download.rst b/docs/user-guide/api/download.rst
index e4182a587..56c0a90c1 100644
--- a/docs/user-guide/api/download.rst
+++ b/docs/user-guide/api/download.rst
@@ -55,6 +55,9 @@ Common Crawl
 .. autoclass:: nemo_curator.download.ResiliparseExtractor
     :members:

+.. autoclass:: nemo_curator.download.TrafilaturaExtractor
+    :members:
+
 ------------------------------
 Wikipedia
 ------------------------------
diff --git a/docs/user-guide/download.rst b/docs/user-guide/download.rst
index 72af8e612..c05d9e6f4 100644
--- a/docs/user-guide/download.rst
+++ b/docs/user-guide/download.rst
@@ -18,7 +18,7 @@ the extraction step to limit the amount of documents that undergo this heavy com
 NeMo Curator provides example utilities for downloading and extracting Common Crawl, ArXiv, and Wikipedia data.
 In addition, it provides a flexible interface to extend the utility to other datasets.
 Our Common Crawl example demonstrates how to process a crawl by downloading the data from S3, doing preliminary language filtering with pyCLD2,
-and extracting the relevant text with jusText or Resiliparse to output :code:`.jsonl` files.
+and extracting the relevant text with jusText, Resiliparse, or Trafilatura to output :code:`.jsonl` files.

 NeMo Curator currently does not provide out-of-the-box support for web-crawling or web-scraping.
 It provides utilities for downloading and extracting data from the preexisting online sources given above.
@@ -88,6 +88,7 @@ You can choose to modify the HTML text extraction algorithm used in ``download_c
     from nemo_curator import get_client
     from nemo_curator.download import (
         ResiliparseExtractor,
+        TrafilaturaExtractor,
         download_common_crawl,
     )
     from nemo_curator.datasets import DocumentDataset
@@ -106,8 +107,10 @@ You can choose to modify the HTML text extraction algorithm used in ``download_c
         output_type = "jsonl"
         os.makedirs(output_folder, exist_ok=True)

-        # Change the extraction algorithm to use ResiliparseExtractor
+        # Change the extraction algorithm to Resiliparse
         extraction_algorithm = ResiliparseExtractor()
+        # Alternatively, change the extraction algorithm to Trafilatura
+        # extraction_algorithm = TrafilaturaExtractor()

         # Download and extract the Common Crawl data using the Resiliparse extraction algorithm.
         # The function returns a DocumentDataset that contains the extracted documents.
@@ -128,7 +131,7 @@ You can choose to modify the HTML text extraction algorithm used in ``download_c
     if __name__ == "__main__":
         main()

-Above, we changed the extraction algorithm from the default ``JusTextExtractor``.
+Above, we changed the extraction algorithm from the default ``JusTextExtractor``. **Note:** Please see the Trafilatura documentation `here `_ for more information.

 The return value ``common_crawl`` will be in NeMo Curator's standard ``DocumentDataset`` format. Check out the function's docstring for more parameters you can use.

@@ -136,7 +139,7 @@ NeMo Curator's Common Crawl extraction process looks like this under the hood:

   1. Decode the HTML within the record from binary to text.
   2. If the HTML can be properly decoded, then with `pyCLD2 `_, perform language detection on the input HTML.
-  3. Finally, the extract the relevant text with `jusText `_ or `Resiliparse `_ from the HTML and write it out as a single string within the 'text' field of a json entry within a `.jsonl` file.
+  3. Finally, extract the relevant text with `jusText `_, `Resiliparse `_, or `Trafilatura `_ from the HTML and write it out as a single string within the 'text' field of a json entry within a `.jsonl` file.

 * ``download_wikipedia`` will download and extract the latest wikipedia dump. Files are downloaded using ``wget``. Wikipedia might download slower than the other datasets. This is because they limit the number of downloads that can occur per-ip address.

   .. code-block:: python
diff --git a/nemo_curator/download/__init__.py b/nemo_curator/download/__init__.py
index cfa1811e4..1de9a080e 100644
--- a/nemo_curator/download/__init__.py
+++ b/nemo_curator/download/__init__.py
@@ -20,6 +20,7 @@
     CommonCrawlWARCIterator,
     JusTextExtractor,
     ResiliparseExtractor,
+    TrafilaturaExtractor,
     download_common_crawl,
 )
 from .doc_builder import (
@@ -54,6 +55,7 @@
     "CommonCrawlWARCDownloaderExtractOnly",
     "JusTextExtractor",
     "ResiliparseExtractor",
+    "TrafilaturaExtractor",
     "download_wikipedia",
     "WikipediaDownloader",
     "WikipediaIterator",
diff --git a/nemo_curator/download/commoncrawl.py b/nemo_curator/download/commoncrawl.py
index c651d4d61..6e03628a9 100644
--- a/nemo_curator/download/commoncrawl.py
+++ b/nemo_curator/download/commoncrawl.py
@@ -17,6 +17,7 @@
 import subprocess
 import unicodedata
 from abc import ABC, abstractmethod
+from copy import deepcopy
 from typing import Literal, Optional
 from urllib.parse import urlparse
@@ -25,6 +26,8 @@
 import pycld2 as cld2
 from charset_normalizer import detect
 from resiliparse.extract.html2text import extract_plain_text
+from trafilatura import extract as extract_with_trafilatura
+from trafilatura.settings import DEFAULT_CONFIG as TRAFILATURA_DEFAULT_CONFIG
 from warcio.archiveiterator import ArchiveIterator

 from nemo_curator.datasets import DocumentDataset
@@ -200,6 +203,98 @@ def extract_text(self, html, stop_words):
         return result

+class TrafilaturaExtractor(HTMLExtractorAlgorithm):
+    def __init__(
+        self,
+        required_stopword_density=0.32,
+        min_extracted_size=250,
+        min_extracted_comm_size=1,
+        min_output_size=1,
+        min_output_comm_size=1,
+        max_tree_size=None,
+        min_duplcheck_size=100,
+        max_repetitions=2,
+        **extract_kwargs,
+    ):
+        """
+        Initialize the Trafilatura text extraction algorithm with specified parameters.
+
+        Args:
+            required_stopword_density: Proportion of stopwords required to preserve an extracted paragraph.
+                Studies on stopword lists and their distribution in various text corpora often
+                suggest that around 30-40% of a typical English text consists of stopwords.
+            min_extracted_size: Acceptable size in characters (used to trigger fallbacks).
+                Defaults to 250. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
+            min_extracted_comm_size: Works the same as min_extracted_size for comment extraction.
+                Defaults to 1. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
+            min_output_size: Absolute acceptable minimum for main text output.
+                Defaults to 1. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
+            min_output_comm_size: Works the same as min_output_size for comment extraction.
+                Defaults to 1. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
+            max_tree_size: Used to discard documents with too many elements. Defaults to None.
+            min_duplcheck_size: Minimum size in characters to run deduplication on.
+                Defaults to 100. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
+            max_repetitions: Maximum number of duplicates allowed.
+                Defaults to 2. See Trafilatura documentation: https://trafilatura.readthedocs.io/en/latest/settings.html.
+            extract_kwargs: Additional keyword arguments for the Trafilatura extract function.
+                See API documentation https://trafilatura.readthedocs.io/en/latest/usage-python.html#choice-of-html-elements
+                for a list of possible parameters.
+
+        """
+        self.required_stopword_density = required_stopword_density
+        self.min_extracted_size = min_extracted_size
+        self.min_extracted_comm_size = min_extracted_comm_size
+        self.min_output_size = min_output_size
+        self.min_output_comm_size = min_output_comm_size
+        self.max_tree_size = max_tree_size
+        self.min_duplcheck_size = min_duplcheck_size
+        self.max_repetitions = max_repetitions
+        self.extract_kwargs = extract_kwargs
+
+    def extract_text(self, html, stop_words):
+        trafilatura_config = deepcopy(TRAFILATURA_DEFAULT_CONFIG)
+        trafilatura_config["DEFAULT"]["MIN_EXTRACTED_SIZE"] = str(
+            self.min_extracted_size
+        )
+        trafilatura_config["DEFAULT"]["MIN_EXTRACTED_COMM_SIZE"] = str(
+            self.min_extracted_comm_size
+        )
+        trafilatura_config["DEFAULT"]["MIN_OUTPUT_SIZE"] = str(self.min_output_size)
+        trafilatura_config["DEFAULT"]["MIN_OUTPUT_COMM_SIZE"] = str(
+            self.min_output_comm_size
+        )
+        if self.max_tree_size:
+            trafilatura_config["DEFAULT"]["MAX_TREE_SIZE"] = str(self.max_tree_size)
+        trafilatura_config["DEFAULT"]["MIN_DUPLCHECK_SIZE"] = str(
+            self.min_duplcheck_size
+        )
+        trafilatura_config["DEFAULT"]["MAX_REPETITIONS"] = str(self.max_repetitions)
+
+        text = extract_with_trafilatura(
+            html, config=trafilatura_config, **self.extract_kwargs
+        )
+
+        if text is not None:
+            paragraphs = list(filter(None, text.split("\n")))
+            result = []
+            for paragraph in paragraphs:
+                words = paragraph.split()
+                length = len(words)
+                if length == 0:
+                    continue
+                stopwords = [word for word in words if word in stop_words]
+                stopword_density = len(stopwords) / length
+
+                if stopword_density >= self.required_stopword_density:
+                    result.append(paragraph)
+        else:
+            return None
+
+        if len(result) == 0:
+            return None
+        return result
+
+
 def get_stop_list_dict(languages=[]):

     # Name mapping for language names from CLD2 (values)
@@ -387,7 +482,8 @@ def download_common_crawl(
         end_snapshot (str): Identifier for the latest snapshot to process, which must be chronologically after start_snapshot.
         output_type (Literal["jsonl", "parquet"]): The file format for the extracted output. Must be either "jsonl" or "parquet".
             • This is not used for the output file, but is used to check if an extracted output already exists.
-        algorithm: The text extraction algorithm instance (e.g., JusTextExtractor or ResiliparseExtractor) to use for HTML processing.
+        algorithm: The text extraction algorithm instance to use for HTML processing.
+            • This can be a JusTextExtractor (default), ResiliparseExtractor, or TrafilaturaExtractor object.
         news (bool): When True, indicates that URLs should be retrieved from the CC-NEWS dataset.
             • This also means snapshot identifiers should follow the 'YYYY-MM' format.
aws (bool): If True, downloads are sourced from Common Crawl's S3 bucket using s5cmd; diff --git a/pyproject.toml b/pyproject.toml index 14d368acb..77f90b633 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,6 +66,7 @@ dependencies = [ "resiliparse", "sentencepiece", "spacy>=3.6.0, <3.8.0", + "trafilatura", "transformers>=4.48.0", "unidic-lite==1.0.8", "usaddress==0.5.10", diff --git a/tests/test_download.py b/tests/test_download.py index b3389e92f..51eb4e312 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -19,7 +19,11 @@ import pytest -from nemo_curator.download import ResiliparseExtractor, download_and_extract +from nemo_curator.download import ( + ResiliparseExtractor, + TrafilaturaExtractor, + download_and_extract, +) from nemo_curator.download.arxiv import ArxivDownloader, ArxivExtractor, ArxivIterator from nemo_curator.download.commoncrawl import ( CommonCrawlWARCDownloader, @@ -54,11 +58,62 @@ def fake_run_success(cmd, stdout, stderr): return FakeCompletedProcess() +@pytest.fixture +def html_string(): + # Modified from https://github.com/chatnoir-eu/chatnoir-resiliparse/blob/abdf1966fb3cefe3e0790e510ab5cb1446f99a79/tests/resiliparse/extract/test_html2text.py + html = """ + + My Title + + + + +
+ +
+ This is a sample paragraph. In it we write words. + These are stopwords: because did than has near we almost while what still. + + +

+ This paragraph doesn't have many stopwords. Remove it. +
Let's keep this paragraph: either came does last new took taken making became from. +

+ + + + + + Some image + +
+ + +
+ Copyright (C) 2021 Foo Bar +
+
+ + """ + return html + + class TestDownload: def test_imports(self): from nemo_curator.download import ( JusTextExtractor, ResiliparseExtractor, + TrafilaturaExtractor, download_arxiv, download_common_crawl, download_wikipedia, @@ -66,56 +121,10 @@ def test_imports(self): assert True - def test_resiliparse_extract_text(self): - # Modified from https://github.com/chatnoir-eu/chatnoir-resiliparse/blob/abdf1966fb3cefe3e0790e510ab5cb1446f99a79/tests/resiliparse/extract/test_html2text.py - html = """ - - My Title - - - - -
- -
- This is a sample paragraph. In it we write words. - These are stopwords: because did than has near we almost while what still. - - -

- This paragraph doesn't have many stopwords. Remove it. -
Let's keep this paragraph: either came does last new took taken making became from. -

- - - - - - Some image - -
- - -
-
- - """ - + def test_resiliparse_extract_text(self, html_string): algorithm = ResiliparseExtractor() stop_words = get_stop_list_dict() - result = algorithm.extract_text(html, stop_words["ENGLISH"]) + result = algorithm.extract_text(html_string, stop_words["ENGLISH"]) expected = [ "This is a sample paragraph. In it we write words. These are stopwords: because did than has near we almost while what still.", @@ -124,6 +133,22 @@ def test_resiliparse_extract_text(self): assert result == expected + def test_trafilatura_extract_text(self, html_string): + algorithm = TrafilaturaExtractor( + min_extracted_size=10, + min_duplcheck_size=10, + max_repetitions=1, + deduplicate=True, + ) + stop_words = get_stop_list_dict() + result = algorithm.extract_text(html_string, stop_words["ENGLISH"]) + + expected = [ + "Let's keep this paragraph: either came does last new took taken making became from.", + ] + + assert result == expected + def test_incorrect_snapshot_order(self): with pytest.raises(ValueError): end_snapshot = "2021-04"
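
The ``TrafilaturaExtractor`` added above is a drop-in alternative to ``JusTextExtractor`` and ``ResiliparseExtractor``: it can be handed to ``download_common_crawl`` through the ``algorithm`` argument, or applied directly to an HTML string via ``extract_text``, as the new unit test does. The sketch below is a minimal usage example, not part of this diff; it assumes the ``download_common_crawl(output_folder, start_snapshot, end_snapshot, output_type=..., algorithm=...)`` call shape used by the documentation example edited above, and the output path, snapshot identifiers, and HTML snippet are illustrative placeholders.

.. code-block:: python

    import os

    from nemo_curator.download import TrafilaturaExtractor, download_common_crawl
    from nemo_curator.download.commoncrawl import get_stop_list_dict

    # Configure the extractor. Keyword arguments that are not explicit parameters
    # (e.g. deduplicate, as in the new test) are forwarded to trafilatura.extract
    # through **extract_kwargs.
    extraction_algorithm = TrafilaturaExtractor(
        required_stopword_density=0.32,
        min_extracted_size=250,
        deduplicate=True,
    )

    # Option 1: extract from a single HTML string, as tests/test_download.py does.
    # extract_text returns the list of paragraphs that pass the stopword-density
    # check, or None when Trafilatura finds no usable main content.
    stop_words = get_stop_list_dict()
    paragraphs = extraction_algorithm.extract_text(
        "<html><body><p>Replace this placeholder with a real page.</p></body></html>",
        stop_words["ENGLISH"],
    )
    print(paragraphs)

    # Option 2: hand the extractor to the Common Crawl pipeline. In the full docs
    # example a Dask client is created first via nemo_curator.get_client(); that
    # setup is omitted here.
    output_folder = "/extracted/output/folder"  # placeholder path
    os.makedirs(output_folder, exist_ok=True)
    common_crawl = download_common_crawl(
        output_folder,
        "2020-50",  # start snapshot, as in the docs example
        "2021-04",  # end snapshot
        output_type="jsonl",
        algorithm=extraction_algorithm,
    )
    common_crawl.to_json(output_path=output_folder, write_to_filename=True)

The returned ``common_crawl`` is a ``DocumentDataset`` containing the extracted documents, so writing it back out with ``to_json`` follows the same pattern as the existing documentation example.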