diff --git a/nemo_curator/download/arxiv.py b/nemo_curator/download/arxiv.py
index 3f92a629..e7e5327a 100644
--- a/nemo_curator/download/arxiv.py
+++ b/nemo_curator/download/arxiv.py
@@ -18,7 +18,7 @@
 import subprocess
 import tarfile
 import tempfile
-from typing import Optional
+from typing import Literal, Optional
 
 from nemo_curator.datasets import DocumentDataset
 from nemo_curator.download.doc_builder import (
@@ -366,7 +366,7 @@ def _build_non_arg_macros_dict(self, file_content):
 
 def download_arxiv(
     output_path: str,
-    output_type: str = "jsonl",
+    output_type: Literal["jsonl", "parquet"] = "jsonl",
     raw_download_dir: Optional[str] = None,
     keep_raw_download: bool = False,
     force_download: bool = False,
@@ -384,7 +384,7 @@ def download_arxiv(
         output_path (str): The root directory where both the final extracted files and the raw download
             subdirectory will be stored. The extracted files (in the format specified by output_type)
             are eventually saved in this directory.
-        output_type (str, optional):
+        output_type (Literal["jsonl", "parquet"], optional):
             The file format/extension used for saving the extracted documents (e.g., "jsonl" or "parquet"). Default is "jsonl".
             This is not used for the output file, but is used to check if an extracted output already exists and read it if so.
         raw_download_dir (Optional[str], optional):
diff --git a/nemo_curator/download/commoncrawl.py b/nemo_curator/download/commoncrawl.py
index a4299946..c651d4d6 100644
--- a/nemo_curator/download/commoncrawl.py
+++ b/nemo_curator/download/commoncrawl.py
@@ -17,7 +17,7 @@
 import subprocess
 import unicodedata
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import Literal, Optional
 from urllib.parse import urlparse
 
 import justext
@@ -363,7 +363,7 @@ def download_common_crawl(
     output_path: str,
     start_snapshot: str,
     end_snapshot: str,
-    output_type: str = "jsonl",
+    output_type: Literal["jsonl", "parquet"] = "jsonl",
     algorithm=JusTextExtractor(),
     news: bool = False,
     aws: bool = False,
@@ -385,8 +385,8 @@ def download_common_crawl(
           • For CC-MAIN datasets, use the format 'YYYY-WeekNumber' (e.g., '2020-50' or '2021-04').
           • For CC-NEWS datasets (when news=True), use the 'YYYY-MM' (Year-Month) format.
         end_snapshot (str): Identifier for the latest snapshot to process, which must be chronologically after start_snapshot.
-        output_type (str): The file format for the extracted output (e.g., "jsonl").
-          • This is not used for the output file, but is used to check if an extracted output already exists and read it if so.
+        output_type (Literal["jsonl", "parquet"]): The file format for the extracted output. Must be either "jsonl" or "parquet".
+          • This is not used for the output file, but is used to check if an extracted output already exists.
         algorithm: The text extraction algorithm instance (e.g., JusTextExtractor or ResiliparseExtractor) to use for HTML processing.
         news (bool): When True, indicates that URLs should be retrieved from the CC-NEWS dataset.
           • This also means snapshot identifiers should follow the 'YYYY-MM' format.
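To exercise the re-annotated entry points, a minimal sketch follows. It assumes the usual top-level exports from nemo_curator.download; the output paths, snapshot identifiers, and url_limit value are illustrative placeholders, not values from this patch. The only change in behavior is that static checkers now accept only "jsonl" or "parquet" for output_type.

    from nemo_curator.download import download_arxiv, download_common_crawl

    # output_type is constrained by the Literal annotation; "jsonl" and
    # "parquet" are accepted, anything else is flagged by mypy/pyright.
    arxiv_dataset = download_arxiv(
        output_path="/data/arxiv",       # placeholder path
        output_type="jsonl",
        url_limit=1,                     # keep the sketch small
    )

    cc_dataset = download_common_crawl(
        output_path="/data/commoncrawl", # placeholder path
        start_snapshot="2020-50",
        end_snapshot="2021-04",
        output_type="parquet",
        url_limit=1,
    )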
diff --git a/nemo_curator/download/doc_builder.py b/nemo_curator/download/doc_builder.py
index 33fe1a95..9d3e849b 100644
--- a/nemo_curator/download/doc_builder.py
+++ b/nemo_curator/download/doc_builder.py
@@ -15,7 +15,7 @@
 import importlib
 import os
 from abc import ABC, abstractmethod
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import dask.dataframe as dd
 import pandas as pd
@@ -105,7 +105,7 @@ def _download_and_extract_single_partition(
     downloader: DocumentDownloader,
     iterator: DocumentIterator,
     extractor: DocumentExtractor,
-    output_type: str,
+    output_type: Literal["jsonl", "parquet"],
     keep_raw_download: bool,
     force_download: bool,
     input_meta: Union[str, dict] = None,
@@ -123,10 +123,10 @@ def _download_and_extract_single_partition(
         downloader (DocumentDownloader): An object to download the content from the URL.
         iterator (DocumentIterator): An object to iterate over records in the downloaded file.
         extractor (DocumentExtractor): An object to extract the desired content from each record.
-        output_type (str): A string specifying the output file type (e.g., "jsonl").
+        output_type (Literal["jsonl", "parquet"]): The output file format/extension. Must be either "jsonl" or "parquet". Defaults to "jsonl". This parameter is only used to verify whether an extracted output already exists.
         keep_raw_download (bool): If False, deletes the raw download file after extraction.
         force_download (bool): If False and output_path exists, skips downloading and extraction.
-        input_meta (Union[str, dict], optional): Metadata describing the input file’s structure.
+        input_meta (Union[str, dict], optional): Metadata describing the input file's structure.
         filename_col (str, optional): Name of the column to store the filename within the result DataFrame.
         record_limit (int, optional): Limit the number of records to extract from each file.
     Returns:
@@ -176,7 +176,7 @@ def download_and_extract(
     iterator: DocumentIterator,
     extractor: DocumentExtractor,
     output_format: dict,
-    output_type: str = "jsonl",
+    output_type: Literal["jsonl", "parquet"] = "jsonl",
     keep_raw_download: bool = False,
     force_download: bool = False,
     input_meta: Union[str, dict] = None,
@@ -211,8 +211,8 @@ def download_and_extract(
         output_format (dict):
             A dictionary mapping column names to the data types for the
             extracted records.
-        output_type (str, optional):
-            The output file format/extension (e.g., "jsonl" or "parquet").
+        output_type (Literal["jsonl", "parquet"], optional):
+            The output file format/extension. Must be either "jsonl" or "parquet". Defaults to "jsonl". This parameter is only used to verify whether an extracted output already exists.
         keep_raw_download (bool, optional):
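The same constraint now applies to the generic download_and_extract helper. A minimal sketch, assuming my_downloader, my_iterator, and my_extractor are pre-built DocumentDownloader, DocumentIterator, and DocumentExtractor instances (hypothetical names, not defined in this patch), and with placeholder URL and path values:

    from nemo_curator.download.doc_builder import download_and_extract

    # One output path is expected per input URL; the output_format dict maps
    # column names in the extracted records to their data types.
    dataset = download_and_extract(
        urls=["https://example.com/dump-00.warc.gz"],
        output_paths=["/data/extracted/dump-00.jsonl"],
        downloader=my_downloader,
        iterator=my_iterator,
        extractor=my_extractor,
        output_format={"text": str, "id": str},
        output_type="jsonl",  # any value other than "jsonl"/"parquet" is now rejected by type checkers
    )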
diff --git a/nemo_curator/download/wikipedia.py b/nemo_curator/download/wikipedia.py
index 4ef9c7e1..e494ed38 100644
--- a/nemo_curator/download/wikipedia.py
+++ b/nemo_curator/download/wikipedia.py
@@ -18,7 +18,7 @@
 import re
 import subprocess
 import xml.etree.cElementTree as etree
-from typing import Optional
+from typing import Literal, Optional
 from urllib.parse import quote, urlparse
 
 import mwparserfromhell
@@ -750,7 +750,7 @@ def download_wikipedia(
     output_path: str,
     language: str = "en",
     dump_date: Optional[str] = None,
-    output_type: str = "jsonl",
+    output_type: Literal["jsonl", "parquet"] = "jsonl",
     raw_download_dir: Optional[str] = None,
     keep_raw_download: bool = False,
     force_download: bool = False,
@@ -771,7 +771,7 @@ def download_wikipedia(
         language (str, optional): The language code for the Wikipedia dump to download. Default is "en".
         dump_date (Optional[str], optional): The dump date in "YYYYMMDD" format.
             If None, the latest available dump is downloaded.
-        output_type (str, optional): The file format/extension for saving the extracted documents (e.g., "jsonl").
+        output_type (Literal["jsonl", "parquet"], optional): The file format/extension for saving the extracted documents (e.g., "jsonl"). Defaults to "jsonl".
            This is not used for the output file, but is used to check if an extracted output already exists and read it if so.
         raw_download_dir (Optional[str], optional):
            Directory used for temporary storage of raw bz2 dump files.
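download_wikipedia picks up the same annotation. An illustrative call, again assuming the top-level export from nemo_curator.download and using placeholder path and dump date:

    from nemo_curator.download import download_wikipedia

    wiki_dataset = download_wikipedia(
        output_path="/data/wikipedia",   # placeholder path
        language="en",
        dump_date="20240401",            # placeholder; None downloads the latest dump
        output_type="parquet",           # checked against the Literal annotation
        url_limit=1,
    )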