Clarify typing on output types
Signed-off-by: Ryan Wolf <[email protected]>
ryantwolf committed Feb 13, 2025
1 parent 8ef224c · commit 2bfa3b6
Showing 4 changed files with 17 additions and 17 deletions.
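The change replaces plain `str` annotations with `typing.Literal`, so static type checkers such as mypy or pyright can reject unsupported formats at analysis time instead of letting a typo slip through to runtime. A minimal sketch of the effect (the `save` function below is an illustrative stand-in, not code from this commit):

```python
from typing import Literal

def save(output_type: Literal["jsonl", "parquet"] = "jsonl") -> None:
    """Stand-in with the same annotation pattern as the changed functions."""

save("jsonl")     # accepted
save("parquet")   # accepted
save("csv")       # flagged by mypy/pyright: not a valid literal value
```

Note that `Literal` is purely static: an invalid string still passes at runtime unless the function validates it explicitly.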
6 changes: 3 additions & 3 deletions nemo_curator/download/arxiv.py

@@ -18,7 +18,7 @@
 import subprocess
 import tarfile
 import tempfile
-from typing import Optional
+from typing import Literal, Optional

 from nemo_curator.datasets import DocumentDataset
 from nemo_curator.download.doc_builder import (
@@ -366,7 +366,7 @@ def _build_non_arg_macros_dict(self, file_content):

 def download_arxiv(
     output_path: str,
-    output_type: str = "jsonl",
+    output_type: Literal["jsonl", "parquet"] = "jsonl",
     raw_download_dir: Optional[str] = None,
     keep_raw_download: bool = False,
     force_download: bool = False,
@@ -384,7 +384,7 @@ def download_arxiv(
         output_path (str):
             The root directory where both the final extracted files and the raw download subdirectory will be stored.
             The extracted files (in the format specified by output_type) are eventually saved in this directory.
-        output_type (str, optional):
+        output_type (Literal["jsonl", "parquet"], optional):
             The file format/extension used for saving the extracted documents (e.g., "jsonl" or "parquet").
             Default is "jsonl". This is not used for the output file, but is used to check if an extracted output already exists and read it if so.
         raw_download_dir (Optional[str], optional):
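A hedged usage sketch of the newly annotated signature (the output path is an illustrative placeholder, and the import assumes the package's public re-export of `download_arxiv`):

```python
from nemo_curator.download import download_arxiv

# "/data/arxiv" is a placeholder path, not a value from this commit.
dataset = download_arxiv(
    output_path="/data/arxiv",
    output_type="parquet",  # checked statically; "csv" would now be flagged
)
```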
8 changes: 4 additions & 4 deletions nemo_curator/download/commoncrawl.py

@@ -17,7 +17,7 @@
 import subprocess
 import unicodedata
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import Literal, Optional
 from urllib.parse import urlparse

 import justext
@@ -363,7 +363,7 @@ def download_common_crawl(
     output_path: str,
     start_snapshot: str,
     end_snapshot: str,
-    output_type: str = "jsonl",
+    output_type: Literal["jsonl", "parquet"] = "jsonl",
     algorithm=JusTextExtractor(),
     news: bool = False,
     aws: bool = False,
@@ -385,8 +385,8 @@ def download_common_crawl(
             • For CC-MAIN datasets, use the format 'YYYY-WeekNumber' (e.g., '2020-50' or '2021-04').
             • For CC-NEWS datasets (when news=True), use the 'YYYY-MM' (Year-Month) format.
         end_snapshot (str): Identifier for the latest snapshot to process, which must be chronologically after start_snapshot.
-        output_type (str): The file format for the extracted output (e.g., "jsonl").
-            • This is not used for the output file, but is used to check if an extracted output already exists and read it if so.
+        output_type (Literal["jsonl", "parquet"]): The file format for the extracted output. Must be either "jsonl" or "parquet".
+            • This is not used for the output file, but is used to check if an extracted output already exists.
         algorithm: The text extraction algorithm instance (e.g., JusTextExtractor or ResiliparseExtractor) to use for HTML processing.
         news (bool): When True, indicates that URLs should be retrieved from the CC-NEWS dataset.
             • This also means snapshot identifiers should follow the 'YYYY-MM' format.
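A hedged usage sketch using the snapshot formats named in the docstring (the path and snapshot range are illustrative placeholders; the import assumes the package's public re-export):

```python
from nemo_curator.download import download_common_crawl

dataset = download_common_crawl(
    output_path="/data/common_crawl",  # placeholder path
    start_snapshot="2020-50",          # CC-MAIN 'YYYY-WeekNumber' format
    end_snapshot="2021-04",
    output_type="jsonl",               # only "jsonl" or "parquet" type-check
)
```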
14 changes: 7 additions & 7 deletions nemo_curator/download/doc_builder.py

@@ -15,7 +15,7 @@
 import importlib
 import os
 from abc import ABC, abstractmethod
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union

 import dask.dataframe as dd
 import pandas as pd
@@ -105,7 +105,7 @@ def _download_and_extract_single_partition(
     downloader: DocumentDownloader,
     iterator: DocumentIterator,
     extractor: DocumentExtractor,
-    output_type: str,
+    output_type: Literal["jsonl", "parquet"],
     keep_raw_download: bool,
     force_download: bool,
     input_meta: Union[str, dict] = None,
@@ -123,10 +123,10 @@ def _download_and_extract_single_partition(
         downloader (DocumentDownloader): An object to download the content from the URL.
         iterator (DocumentIterator): An object to iterate over records in the downloaded file.
         extractor (DocumentExtractor): An object to extract the desired content from each record.
-        output_type (str): A string specifying the output file type (e.g., "jsonl").
+        output_type (Literal["jsonl", "parquet"]): The output file format/extension. Must be either "jsonl" or "parquet". Defaults to "jsonl". This parameter is only used to verify whether an extracted output already exists.
         keep_raw_download (bool): If False, deletes the raw download file after extraction.
         force_download (bool): If False and output_path exists, skips downloading and extraction.
-        input_meta (Union[str, dict], optional): Metadata describing the input files structure.
+        input_meta (Union[str, dict], optional): Metadata describing the input file's structure.
         filename_col (str, optional): Name of the column to store the filename within the result DataFrame.
         record_limit (int, optional): Limit the number of records to extract from each file.
     Returns:
@@ -176,7 +176,7 @@ def download_and_extract(
     iterator: DocumentIterator,
     extractor: DocumentExtractor,
     output_format: dict,
-    output_type: str = "jsonl",
+    output_type: Literal["jsonl", "parquet"] = "jsonl",
     keep_raw_download: bool = False,
     force_download: bool = False,
     input_meta: Union[str, dict] = None,
@@ -211,8 +211,8 @@ def download_and_extract(
         output_format (dict):
            A dictionary mapping column names to the data types for the
            extracted records.
-        output_type (str, optional):
-            The output file format/extension (e.g., "jsonl" or "parquet").
+        output_type (Literal["jsonl", "parquet"], optional):
+            The output file format/extension. Must be either "jsonl" or "parquet".
             Defaults to "jsonl". This parameter is only used to verify whether
             an extracted output already exists.
         keep_raw_download (bool, optional):
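Because `Literal` is erased at runtime, an invalid string would still reach these functions unless it is validated explicitly. One hedged pattern (not part of this commit) derives the runtime check from the annotation so the two cannot drift apart:

```python
from typing import Literal, get_args

OutputType = Literal["jsonl", "parquet"]
VALID_OUTPUT_TYPES = get_args(OutputType)  # ("jsonl", "parquet")

def check_output_type(output_type: str) -> None:
    # Literal is erased at runtime, so this guard does what the
    # annotation alone cannot: reject bad values when the code runs.
    if output_type not in VALID_OUTPUT_TYPES:
        raise ValueError(
            f"output_type must be one of {VALID_OUTPUT_TYPES}, got {output_type!r}"
        )
```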
6 changes: 3 additions & 3 deletions nemo_curator/download/wikipedia.py

@@ -18,7 +18,7 @@
 import re
 import subprocess
 import xml.etree.cElementTree as etree
-from typing import Optional
+from typing import Literal, Optional
 from urllib.parse import quote, urlparse

 import mwparserfromhell
@@ -750,7 +750,7 @@ def download_wikipedia(
     output_path: str,
     language: str = "en",
     dump_date: Optional[str] = None,
-    output_type: str = "jsonl",
+    output_type: Literal["jsonl", "parquet"] = "jsonl",
     raw_download_dir: Optional[str] = None,
     keep_raw_download: bool = False,
     force_download: bool = False,
@@ -771,7 +771,7 @@ def download_wikipedia(
         language (str, optional): The language code for the Wikipedia dump to download. Default is "en".
         dump_date (Optional[str], optional): The dump date in "YYYYMMDD" format. If None, the latest
             available dump is downloaded.
-        output_type (str, optional): The file format/extension for saving the extracted documents (e.g., "jsonl").
+        output_type (Literal["jsonl", "parquet"], optional): The file format/extension for saving the extracted documents (e.g., "jsonl").
             Defaults to "jsonl". This is not used for the output file, but is used to check if an extracted output
             already exists and read it if so.
         raw_download_dir (Optional[str], optional): Directory used for temporary storage of raw bz2 dump files.
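A hedged usage sketch (the path and dump date are illustrative placeholders; the import assumes the package's public re-export):

```python
from nemo_curator.download import download_wikipedia

dataset = download_wikipedia(
    output_path="/data/wikipedia",  # placeholder path
    language="en",
    dump_date="20250201",           # "YYYYMMDD"; None downloads the latest dump
    output_type="jsonl",
)
```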
