Clarify typing on output types
Signed-off-by: Ryan Wolf <[email protected]>
ryantwolf committed Feb 13, 2025
1 parent 8ef224c · commit 2bfa3b6
Showing 4 changed files with 17 additions and 17 deletions.
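The change replaces plain `str` annotations with `typing.Literal`, so static type checkers such as mypy or pyright can reject unsupported formats at analysis time instead of letting a typo slip through to runtime. A minimal sketch of the effect (the `save` function below is an illustrative stand-in, not code from this commit):

```python
from typing import Literal

def save(output_type: Literal["jsonl", "parquet"] = "jsonl") -> None:
    """Stand-in with the same annotation pattern as the changed functions."""

save("jsonl")     # accepted
save("parquet")   # accepted
save("csv")       # flagged by mypy/pyright: not a valid literal value
```

Note that `Literal` is purely static: an invalid string still passes at runtime unless the function validates it explicitly.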
6 changes: 3 additions & 3 deletions nemo_curator/download/arxiv.py

@@ -18,7 +18,7 @@
 import subprocess
 import tarfile
 import tempfile
-from typing import Optional
+from typing import Literal, Optional

 from nemo_curator.datasets import DocumentDataset
 from nemo_curator.download.doc_builder import (
@@ -366,7 +366,7 @@ def _build_non_arg_macros_dict(self, file_content):

 def download_arxiv(
     output_path: str,
-    output_type: str = "jsonl",
+    output_type: Literal["jsonl", "parquet"] = "jsonl",
     raw_download_dir: Optional[str] = None,
     keep_raw_download: bool = False,
     force_download: bool = False,
@@ -384,7 +384,7 @@ def download_arxiv(
         output_path (str):
             The root directory where both the final extracted files and the raw download subdirectory will be stored.
             The extracted files (in the format specified by output_type) are eventually saved in this directory.
-        output_type (str, optional):
+        output_type (Literal["jsonl", "parquet"], optional):
             The file format/extension used for saving the extracted documents (e.g., "jsonl" or "parquet").
             Default is "jsonl". This is not used for the output file, but is used to check if an extracted output already exists and read it if so.
         raw_download_dir (Optional[str], optional):
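A hedged usage sketch of the newly annotated signature (the output path is an illustrative placeholder, and the import assumes the package's public re-export of `download_arxiv`):

```python
from nemo_curator.download import download_arxiv

# "/data/arxiv" is a placeholder path, not a value from this commit.
dataset = download_arxiv(
    output_path="/data/arxiv",
    output_type="parquet",  # checked statically; "csv" would now be flagged
)
```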
8 changes: 4 additions & 4 deletions nemo_curator/download/commoncrawl.py

@@ -17,7 +17,7 @@
 import subprocess
 import unicodedata
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import Literal, Optional
 from urllib.parse import urlparse

 import justext
@@ -363,7 +363,7 @@ def download_common_crawl(
     output_path: str,
     start_snapshot: str,
     end_snapshot: str,
-    output_type: str = "jsonl",
+    output_type: Literal["jsonl", "parquet"] = "jsonl",
     algorithm=JusTextExtractor(),
     news: bool = False,
     aws: bool = False,
@@ -385,8 +385,8 @@ def download_common_crawl(
             • For CC-MAIN datasets, use the format 'YYYY-WeekNumber' (e.g., '2020-50' or '2021-04').
             • For CC-NEWS datasets (when news=True), use the 'YYYY-MM' (Year-Month) format.
         end_snapshot (str): Identifier for the latest snapshot to process, which must be chronologically after start_snapshot.
-        output_type (str): The file format for the extracted output (e.g., "jsonl").
-            • This is not used for the output file, but is used to check if an extracted output already exists and read it if so.
+        output_type (Literal["jsonl", "parquet"]): The file format for the extracted output. Must be either "jsonl" or "parquet".
+            • This is not used for the output file, but is used to check if an extracted output already exists.
         algorithm: The text extraction algorithm instance (e.g., JusTextExtractor or ResiliparseExtractor) to use for HTML processing.
         news (bool): When True, indicates that URLs should be retrieved from the CC-NEWS dataset.
             • This also means snapshot identifiers should follow the 'YYYY-MM' format.
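A hedged usage sketch using the snapshot formats named in the docstring (the path and snapshot range are illustrative placeholders; the import assumes the package's public re-export):

```python
from nemo_curator.download import download_common_crawl

dataset = download_common_crawl(
    output_path="/data/common_crawl",  # placeholder path
    start_snapshot="2020-50",          # CC-MAIN 'YYYY-WeekNumber' format
    end_snapshot="2021-04",
    output_type="jsonl",               # only "jsonl" or "parquet" type-check
)
```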
14 changes: 7 additions & 7 deletions nemo_curator/download/doc_builder.py

@@ -15,7 +15,7 @@
 import importlib
 import os
 from abc import ABC, abstractmethod
-from typing import List, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union

 import dask.dataframe as dd
 import pandas as pd
@@ -105,7 +105,7 @@ def _download_and_extract_single_partition(
     downloader: DocumentDownloader,
     iterator: DocumentIterator,
     extractor: DocumentExtractor,
-    output_type: str,
+    output_type: Literal["jsonl", "parquet"],
     keep_raw_download: bool,
     force_download: bool,
     input_meta: Union[str, dict] = None,
@@ -123,10 +123,10 @@ def _download_and_extract_single_partition(
         downloader (DocumentDownloader): An object to download the content from the URL.
         iterator (DocumentIterator): An object to iterate over records in the downloaded file.
         extractor (DocumentExtractor): An object to extract the desired content from each record.
-        output_type (str): A string specifying the output file type (e.g., "jsonl").
+        output_type (Literal["jsonl", "parquet"]): The output file format/extension. Must be either "jsonl" or "parquet". Defaults to "jsonl". This parameter is only used to verify whether an extracted output already exists.
         keep_raw_download (bool): If False, deletes the raw download file after extraction.
         force_download (bool): If False and output_path exists, skips downloading and extraction.
-        input_meta (Union[str, dict], optional): Metadata describing the input files structure.
+        input_meta (Union[str, dict], optional): Metadata describing the input file's structure.
         filename_col (str, optional): Name of the column to store the filename within the result DataFrame.
         record_limit (int, optional): Limit the number of records to extract from each file.
     Returns:
@@ -176,7 +176,7 @@ def download_and_extract(
     iterator: DocumentIterator,
     extractor: DocumentExtractor,
     output_format: dict,
-    output_type: str = "jsonl",
+    output_type: Literal["jsonl", "parquet"] = "jsonl",
     keep_raw_download: bool = False,
     force_download: bool = False,
     input_meta: Union[str, dict] = None,
@@ -211,8 +211,8 @@ def download_and_extract(
         output_format (dict):
            A dictionary mapping column names to the data types for the
            extracted records.
-        output_type (str, optional):
-            The output file format/extension (e.g., "jsonl" or "parquet").
+        output_type (Literal["jsonl", "parquet"], optional):
+            The output file format/extension. Must be either "jsonl" or "parquet".
             Defaults to "jsonl". This parameter is only used to verify whether
             an extracted output already exists.
         keep_raw_download (bool, optional):
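Because `Literal` is erased at runtime, an invalid string would still reach these functions unless it is validated explicitly. One hedged pattern (not part of this commit) derives the runtime check from the annotation so the two cannot drift apart:

```python
from typing import Literal, get_args

OutputType = Literal["jsonl", "parquet"]
VALID_OUTPUT_TYPES = get_args(OutputType)  # ("jsonl", "parquet")

def check_output_type(output_type: str) -> None:
    # Literal is erased at runtime, so this guard does what the
    # annotation alone cannot: reject bad values when the code runs.
    if output_type not in VALID_OUTPUT_TYPES:
        raise ValueError(
            f"output_type must be one of {VALID_OUTPUT_TYPES}, got {output_type!r}"
        )
```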
6 changes: 3 additions & 3 deletions nemo_curator/download/wikipedia.py

@@ -18,7 +18,7 @@
 import re
 import subprocess
 import xml.etree.cElementTree as etree
-from typing import Optional
+from typing import Literal, Optional
 from urllib.parse import quote, urlparse

 import mwparserfromhell
@@ -750,7 +750,7 @@ def download_wikipedia(
     output_path: str,
     language: str = "en",
     dump_date: Optional[str] = None,
-    output_type: str = "jsonl",
+    output_type: Literal["jsonl", "parquet"] = "jsonl",
     raw_download_dir: Optional[str] = None,
     keep_raw_download: bool = False,
     force_download: bool = False,
@@ -771,7 +771,7 @@ def download_wikipedia(
         language (str, optional): The language code for the Wikipedia dump to download. Default is "en".
         dump_date (Optional[str], optional): The dump date in "YYYYMMDD" format. If None, the latest
             available dump is downloaded.
-        output_type (str, optional): The file format/extension for saving the extracted documents (e.g., "jsonl").
+        output_type (Literal["jsonl", "parquet"], optional): The file format/extension for saving the extracted documents (e.g., "jsonl").
             Defaults to "jsonl". This is not used for the output file, but is used to check if an extracted output
             already exists and read it if so.
         raw_download_dir (Optional[str], optional): Directory used for temporary storage of raw bz2 dump files.
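A hedged usage sketch (the path and dump date are illustrative placeholders; the import assumes the package's public re-export):

```python
from nemo_curator.download import download_wikipedia

dataset = download_wikipedia(
    output_path="/data/wikipedia",  # placeholder path
    language="en",
    dump_date="20250201",           # "YYYYMMDD"; None downloads the latest dump
    output_type="jsonl",
)
```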
