Fix issues with download and extract (#541)
* Make text field optional in extractor and add tests

Signed-off-by: Ryan Wolf <[email protected]>

* Update documentation on common crawl download

Signed-off-by: Ryan Wolf <[email protected]>

* Add record limit and adjust cli args

Signed-off-by: Ryan Wolf <[email protected]>

* Skip flakey tests

Signed-off-by: Ryan Wolf <[email protected]>

* Update docs

Signed-off-by: Ryan Wolf <[email protected]>

* Clarify typing on output types

Signed-off-by: Ryan Wolf <[email protected]>

* Remove skipped tests

Signed-off-by: Ryan Wolf <[email protected]>

---------

Signed-off-by: Ryan Wolf <[email protected]>
ryantwolf authored Feb 18, 2025
1 parent 7042c2c commit 908e0f1
Showing 7 changed files with 656 additions and 180 deletions.
92 changes: 76 additions & 16 deletions docs/user-guide/download.rst
@@ -37,10 +37,42 @@ By "extraction", we typically mean the process of converting a data format from
Otherwise, the HTTPS endpoints will be used with ``wget``. Here is a small example of how to use it:

.. code-block:: python

    import os

    from nemo_curator import get_client
    from nemo_curator.download import download_common_crawl
    from nemo_curator.datasets import DocumentDataset


    def main():
        # Initialize a distributed Dask client
        client = get_client(cluster_type="cpu")

        # Parameters for downloading Common Crawl data.
        # - output_folder: directory for temporary download/extraction files
        # - start_snapshot and end_snapshot define the range to fetch
        # - output_type: specifies file format for the extracted data (e.g., "jsonl")
        output_folder = "/extracted/output/folder"
        start_snapshot = "2020-50"
        end_snapshot = "2021-04"
        output_type = "jsonl"
        os.makedirs(output_folder, exist_ok=True)

        # Download and extract the Common Crawl data.
        # The function returns a DocumentDataset that contains the extracted documents.
        # Note: The output folder and output type are passed here to store intermediate files
        # and check if the data has already been downloaded. They should match the final location
        # and format of the extracted data.
        common_crawl_dataset = download_common_crawl(
            output_folder, start_snapshot, end_snapshot, output_type=output_type
        )

        # Write the extracted dataset to JSON format.
        # The 'to_json' method will write one JSON document per line,
        # preserving the original shard information if write_to_filename is True.
        common_crawl_dataset.to_json(output_path=output_folder, write_to_filename=True)
        print("Extracted dataset saved to:", output_folder)


    if __name__ == "__main__":
        main()
* ``"/extracted/output/folder"`` is the path to on your local filesystem where the final extracted files will be placed.
* ``"2020-50"`` is the first common crawl snapshot that will be included in the download. **Note:** Not every year and week has a snapshot. Ensure that your range includes at least one valid Common Crawl snapshot. A list of valid Common Crawl snapshots can be found `here <https://data.commoncrawl.org/>`_.
@@ -50,21 +82,49 @@ By "extraction", we typically mean the process of converting a data format from
You can choose to modify the HTML text extraction algorithm used in ``download_common_crawl``. See an example below.

.. code-block:: python

    import os

    from nemo_curator import get_client
    from nemo_curator.download import (
        ResiliparseExtractor,
        download_common_crawl,
    )
    from nemo_curator.datasets import DocumentDataset


    def main():
        # Initialize a distributed Dask client
        client = get_client(cluster_type="cpu")

        # Parameters for downloading Common Crawl data.
        # - output_folder: directory for temporary download/extraction files
        # - start_snapshot and end_snapshot define the range to fetch
        # - output_type: specifies file format for the extracted data (e.g., "jsonl")
        output_folder = "/extracted/output/folder"
        start_snapshot = "2020-50"
        end_snapshot = "2021-04"
        output_type = "jsonl"
        os.makedirs(output_folder, exist_ok=True)

        # Change the extraction algorithm to use ResiliparseExtractor
        extraction_algorithm = ResiliparseExtractor()

        # Download and extract the Common Crawl data using the Resiliparse extraction algorithm.
        # The function returns a DocumentDataset that contains the extracted documents.
        common_crawl_dataset = download_common_crawl(
            output_folder,
            start_snapshot,
            end_snapshot,
            output_type=output_type,
            algorithm=extraction_algorithm,
        )

        # Write the extracted dataset to JSON format.
        # The 'to_json' method writes one JSON document per line,
        # preserving the original shard information if write_to_filename is True.
        common_crawl_dataset.to_json(output_path=output_folder, write_to_filename=True)
        print("Extracted dataset saved to:", output_folder)


    if __name__ == "__main__":
        main()
Above, we changed the extraction algorithm from the default ``JusTextExtractor``.
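
For comparison, here is a minimal sketch that passes the default extractor explicitly; it assumes ``JusTextExtractor`` can be imported from ``nemo_curator.download`` alongside ``ResiliparseExtractor``, and that passing ``algorithm=JusTextExtractor()`` behaves the same as omitting the argument:

.. code-block:: python

    from nemo_curator.download import JusTextExtractor, download_common_crawl

    # Explicitly pass the default jusText-based extraction algorithm.
    common_crawl_dataset = download_common_crawl(
        "/extracted/output/folder",
        "2020-50",
        "2021-04",
        output_type="jsonl",
        algorithm=JusTextExtractor(),
    )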

55 changes: 38 additions & 17 deletions nemo_curator/download/arxiv.py
@@ -18,6 +18,7 @@
import subprocess
import tarfile
import tempfile
+from typing import Literal, Optional

from nemo_curator.datasets import DocumentDataset
from nemo_curator.download.doc_builder import (
@@ -218,12 +219,12 @@ def extract(self, content):
                for file_content in content
            )
        except Exception:
-            return {}, None
+            return None

        # Don't return meta
        if cleaned_latex_file_str is not None:
            if len(cleaned_latex_file_str) > 0:
-                return {}, cleaned_latex_file_str
+                return {"text": cleaned_latex_file_str}

    def _clean_tex_file(self, file_content, arg_macros, non_arg_macros):
        r"""function takes a tex file as input and returns a cleaned version. The
@@ -365,25 +366,44 @@ def _build_non_arg_macros_dict(self, file_content):

def download_arxiv(
    output_path: str,
-    output_type: str = "jsonl",
-    raw_download_dir=None,
-    keep_raw_download=False,
-    force_download=False,
-    url_limit=None,
+    output_type: Literal["jsonl", "parquet"] = "jsonl",
+    raw_download_dir: Optional[str] = None,
+    keep_raw_download: bool = False,
+    force_download: bool = False,
+    url_limit: Optional[int] = None,
+    record_limit: Optional[int] = None,
) -> DocumentDataset:
"""
Downloads Arxiv tar files and extracts them
Download Arxiv tar files and extract the contained LaTeX projects.
This function obtains a list of Arxiv tar file URLs (via get_arxiv_urls), downloads the tar files,
and then extracts the contained LaTeX source files. The resulting documents (after extraction) are
assembled into a DocumentDataset.
Args:
output_path: The path to the root directory of the files
output_type: The file type to save the data as.
raw_download_dir: Path to store the raw download files for intermediate processing.
If None, they are stored in a folder named "downloads" under output_path.
keep_raw_download: If True, keeps the compressed WARC files that have not been extracted.
force_download: If False, will skip processing all files in output_paths that already exist and
directly read from them instead.
url_limit: The maximum number of raw files to download from the snapshot. If None, all
files from the range of snapshots are downloaded.
output_path (str):
The root directory where both the final extracted files and the raw download subdirectory will be stored.
The extracted files (in the format specified by output_type) are eventually saved in this directory.
output_type (Literal["jsonl", "parquet"], optional):
The file format/extension used for saving the extracted documents (e.g., "jsonl" or "parquet").
Default is "jsonl". This is not used for the output file, but is used to check if an extracted output already exists and read it if so.
raw_download_dir (Optional[str], optional):
The directory where the raw downloaded tar files will be kept. If None, a folder named "downloads"
under output_path is used.
keep_raw_download (bool, optional):
If True, the raw tar files (before extraction) are not removed after processing. Default is False.
force_download (bool, optional):
If False, then if an output file already exists for a given URL, re-downloading and re-extraction will be skipped.
Default is False.
url_limit (Optional[int], optional):
Limits the maximum number of Arxiv tar file URLs to download and process.
If None, all available URLs (from get_arxiv_urls) are processed.
record_limit (Optional[int], optional):
Limits the maximum number of records to extract from each tar file.
If None, all available records are extracted.
Returns:
DocumentDataset:
A dataset object containing the extracted documents.
"""
arxiv_urls = get_arxiv_urls()
if url_limit:
@@ -416,6 +436,7 @@ def download_arxiv(
        keep_raw_download=keep_raw_download,
        force_download=force_download,
        filename_col="file_name",
+        record_limit=record_limit,
    )

    return dataset
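
For context, a brief usage sketch of the updated signature (the output path here is hypothetical, and this assumes download_arxiv is exported from nemo_curator.download as in the user guide):

    from nemo_curator import get_client
    from nemo_curator.download import download_arxiv

    client = get_client(cluster_type="cpu")
    # url_limit caps how many Arxiv tar files are downloaded;
    # record_limit caps how many records are extracted from each tar file.
    arxiv_dataset = download_arxiv(
        "/extracted/arxiv",  # hypothetical output directory
        output_type="jsonl",
        url_limit=1,
        record_limit=100,
    )
    arxiv_dataset.to_json(output_path="/extracted/arxiv", write_to_filename=True)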
68 changes: 38 additions & 30 deletions nemo_curator/download/commoncrawl.py
@@ -17,6 +17,7 @@
import subprocess
import unicodedata
from abc import ABC, abstractmethod
+from typing import Literal, Optional
from urllib.parse import urlparse

import justext
@@ -352,48 +353,54 @@ def extract(self, content):
        if text is not None:
            if len(text) > 0:
                text = "\n\n".join(text)
-                meta = {"language": lang}
-                return meta, text
+                meta = {"language": lang, "text": text}
+                return meta
        else:
-            return None, None
+            return None


def download_common_crawl(
    output_path: str,
    start_snapshot: str,
    end_snapshot: str,
-    output_type: str = "jsonl",
+    output_type: Literal["jsonl", "parquet"] = "jsonl",
    algorithm=JusTextExtractor(),
-    news=False,
-    aws=False,
-    raw_download_dir=None,
-    keep_raw_download=False,
-    force_download=False,
-    url_limit=None,
+    news: bool = False,
+    aws: bool = False,
+    raw_download_dir: Optional[str] = None,
+    keep_raw_download: bool = False,
+    force_download: bool = False,
+    url_limit: Optional[int] = None,
+    record_limit: Optional[int] = None,
) -> DocumentDataset:
"""
Downloads Common Crawl WARC snapshots and extracts them using jusText or Resiliparse
Downloads Common Crawl WARC snapshots and extracts text content using a specified extraction algorithm.
Args:
output_path: The path to the root directory of the files
start_snapshot: The first common crawl snapshot to include. Snapshots must be
specified by YYYY-WeekNumber (e.g., '2020-50' or '2021-04'). For the CC-NEWS dataset,
(specified with news=True flag) this changes to Year-Month (YYYY-MM).
end_snapshot: The last common crawl snapshot to include. Must be chronologically
after the starting snapshot.
output_type: The file type to save the data as.
algorithm: A JusTextExtractor or ResiliparseExtractor object.
news: If True, gets WARC URLs for the CC-NEWS dataset instead of the CC-MAIN datasets.
Also assumes that the format for the start and end snapshots is 'YYYY-MM' (Year-Month).
aws: Whether to download from Common Crawl's S3 bucket. If True, uses s5cmd to download.
If False, uses wget.
raw_download_dir: Path to store the raw download files for intermediate processing.
If None, they are stored in a folder named "downloads" under output_path.
keep_raw_download: If True, keeps the compressed WARC files that have not been extracted.
force_download: If False, will skip processing all files in output_paths that already exist and
directly read from them instead.
url_limit: The maximum number of raw files to download from the snapshot. If None, all
files from the range of snapshots are downloaded.
output_path (str): The root directory used for managing download and extraction.
• Raw WARC files are stored in a "downloads" subdirectory under this path.
• This path is also checked for existing extraction results; if found, extraction can be skipped.
• Note: This function returns a DocumentDataset, and writing the extracted data to disk is the caller's responsibility.
start_snapshot (str): Identifier for the earliest snapshot to process.
• For CC-MAIN datasets, use the format 'YYYY-WeekNumber' (e.g., '2020-50' or '2021-04').
• For CC-NEWS datasets (when news=True), use the 'YYYY-MM' (Year-Month) format.
end_snapshot (str): Identifier for the latest snapshot to process, which must be chronologically after start_snapshot.
output_type (Literal["jsonl", "parquet"]): The file format for the extracted output. Must be either "jsonl" or "parquet".
• This is not used for the output file, but is used to check if an extracted output already exists.
algorithm: The text extraction algorithm instance (e.g., JusTextExtractor or ResiliparseExtractor) to use for HTML processing.
news (bool): When True, indicates that URLs should be retrieved from the CC-NEWS dataset.
• This also means snapshot identifiers should follow the 'YYYY-MM' format.
aws (bool): If True, downloads are sourced from Common Crawl's S3 bucket using s5cmd;
• If False, wget is used to fetch the files via HTTPS.
raw_download_dir: Optional; the directory to temporarily store raw WARC files.
• If not provided, defaults to a "downloads" folder within output_path.
keep_raw_download (bool): If True, retains the downloaded raw WARC files after extraction.
• If False, these raw files may be removed following extraction.
force_download (bool): If False, skips re-downloading or re-extracting snapshots if outputs already exist in output_path.
url_limit: Optional; the maximum number of WARC files to download from the snapshot range.
• If None, all available files within the specified snapshots are downloaded.
record_limit: Optional; the maximum number of records to extract from each WARC file.
• If None, all available records are extracted.
"""
common_crawl_urls = get_common_crawl_urls(
starting_snapshot=start_snapshot, ending_snapshot=end_snapshot, news=news
@@ -443,6 +450,7 @@ def download_common_crawl(
        keep_raw_download=keep_raw_download,
        force_download=force_download,
        filename_col="file_name",
+        record_limit=record_limit,
    )

    return dataset
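
For context, a brief usage sketch that exercises the new record_limit parameter of the signature above (paths and snapshot range are taken from the documentation example; the imports are assumed to match the user guide):

    from nemo_curator import get_client
    from nemo_curator.download import ResiliparseExtractor, download_common_crawl

    client = get_client(cluster_type="cpu")
    # record_limit caps how many records are extracted from each WARC file,
    # which is useful for a quick smoke test of a snapshot range.
    common_crawl_dataset = download_common_crawl(
        "/extracted/output/folder",
        "2020-50",
        "2021-04",
        output_type="jsonl",
        algorithm=ResiliparseExtractor(),
        url_limit=1,
        record_limit=1000,
    )
    common_crawl_dataset.to_json(output_path="/extracted/output/folder", write_to_filename=True)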
