From 48de110a0fc11f8a699cd44d01ff8b6005a62722 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Tue, 18 Feb 2025 10:00:18 -0800
Subject: [PATCH 1/4] Fix code blocks in documentation

Signed-off-by: Sarah Yurick
---
 docs/user-guide/download.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/user-guide/download.rst b/docs/user-guide/download.rst
index d2b50e0c..a6074bc3 100644
--- a/docs/user-guide/download.rst
+++ b/docs/user-guide/download.rst
@@ -37,6 +37,7 @@ By "extraction", we typically mean the process of converting a data format from
 Otherwise, the HTTPS endpoints will be used with ``wget``. Here is a small example of how to use it:
 
     .. code-block:: python
+
         import os
         from nemo_curator import get_client
         from nemo_curator.download import download_common_crawl
@@ -82,6 +83,7 @@ By "extraction", we typically mean the process of converting a data format from
 You can choose to modify the HTML text extraction algorithm used in ``download_common_crawl``. See an example below.
 
     .. code-block:: python
+
         import os
         from nemo_curator import get_client
         from nemo_curator.download import (

From ca72c86b878b74075320eb0d9a898d4ce920c4ba Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Tue, 18 Feb 2025 10:13:33 -0800
Subject: [PATCH 2/4] spacing

Signed-off-by: Sarah Yurick
---
 docs/user-guide/download.rst | 166 +++++++++++++++++------------------
 1 file changed, 83 insertions(+), 83 deletions(-)

diff --git a/docs/user-guide/download.rst b/docs/user-guide/download.rst
index a6074bc3..56654a44 100644
--- a/docs/user-guide/download.rst
+++ b/docs/user-guide/download.rst
@@ -36,44 +36,44 @@ By "extraction", we typically mean the process of converting a data format from
 Common crawl has an S3 bucket and a direct HTTPS endpoint. If you want to use the S3 bucket, ensure you have properly set up your credentials with `s5cmd `_.
 Otherwise, the HTTPS endpoints will be used with ``wget``. Here is a small example of how to use it:
 
-    .. code-block:: python
-
-        import os
-        from nemo_curator import get_client
-        from nemo_curator.download import download_common_crawl
-        from nemo_curator.datasets import DocumentDataset
-
-        def main():
-            # Initialize a distributed Dask client
-            client = get_client(cluster_type="cpu")
-
-            # Parameters for downloading Common Crawl data.
-            # - output_folder: directory for temporary download/extraction files
-            # - start_snapshot and end_snapshot define the range to fetch
-            # - output_type: specifies file format for the extracted data (e.g., "jsonl")
-            output_folder = "/extracted/output/folder"
-            start_snapshot = "2020-50"
-            end_snapshot = "2021-04"
-            output_type = "jsonl"
-            os.makedirs(output_folder, exist_ok=True)
-
-            # Download and extract the Common Crawl data.
-            # The function returns a DocumentDataset that contains the extracted documents.
-            # Note: The output folder and output type are passed here to store intermediate files
-            # and check if the data has already been downloaded. They should match the final location
-            # and format of the extracted data.
-            common_crawl_dataset = download_common_crawl(
-                output_folder, start_snapshot, end_snapshot, output_type=output_type
-            )
-
-            # Write the extracted dataset to JSON format.
-            # The 'to_json' method will write one JSON document per line,
-            # preserving the original shard information if write_to_filename is True.
-            common_crawl_dataset.to_json(output_path=output_folder, write_to_filename=True)
-            print("Extracted dataset saved to:", output_folder)
-
-        if __name__ == "__main__":
-            main()
+.. code-block:: python
+
+    import os
+    from nemo_curator import get_client
+    from nemo_curator.download import download_common_crawl
+    from nemo_curator.datasets import DocumentDataset
+
+    def main():
+        # Initialize a distributed Dask client
+        client = get_client(cluster_type="cpu")
+
+        # Parameters for downloading Common Crawl data.
+        # - output_folder: directory for temporary download/extraction files
+        # - start_snapshot and end_snapshot define the range to fetch
+        # - output_type: specifies file format for the extracted data (e.g., "jsonl")
+        output_folder = "/extracted/output/folder"
+        start_snapshot = "2020-50"
+        end_snapshot = "2021-04"
+        output_type = "jsonl"
+        os.makedirs(output_folder, exist_ok=True)
+
+        # Download and extract the Common Crawl data.
+        # The function returns a DocumentDataset that contains the extracted documents.
+        # Note: The output folder and output type are passed here to store intermediate files
+        # and check if the data has already been downloaded. They should match the final location
+        # and format of the extracted data.
+        common_crawl_dataset = download_common_crawl(
+            output_folder, start_snapshot, end_snapshot, output_type=output_type
+        )
+
+        # Write the extracted dataset to JSON format.
+        # The 'to_json' method will write one JSON document per line,
+        # preserving the original shard information if write_to_filename is True.
+        common_crawl_dataset.to_json(output_path=output_folder, write_to_filename=True)
+        print("Extracted dataset saved to:", output_folder)
+
+    if __name__ == "__main__":
+        main()
 
     * ``"/extracted/output/folder"`` is the path to on your local filesystem where the final extracted files will be placed.
     * ``"2020-50"`` is the first common crawl snapshot that will be included in the download. **Note:** Not every year and week has a snapshot. Ensure that your range includes at least one valid Common Crawl snapshot. A list of valid Common Crawl snapshots can be found `here `_.
@@ -82,51 +82,51 @@ By "extraction", we typically mean the process of converting a data format from
 
 You can choose to modify the HTML text extraction algorithm used in ``download_common_crawl``. See an example below.
 
-    .. code-block:: python
-
-        import os
-        from nemo_curator import get_client
-        from nemo_curator.download import (
-            ResiliparseExtractor,
-            download_common_crawl,
-        )
-        from nemo_curator.datasets import DocumentDataset
-
-        def main():
-            # Initialize a distributed Dask client
-            client = get_client(cluster_type="cpu")
-
-            # Parameters for downloading Common Crawl data.
-            # - output_folder: directory for temporary download/extraction files
-            # - start_snapshot and end_snapshot define the range to fetch
-            # - output_type: specifies file format for the extracted data (e.g., "jsonl")
-            output_folder = "/extracted/output/folder"
-            start_snapshot = "2020-50"
-            end_snapshot = "2021-04"
-            output_type = "jsonl"
-            os.makedirs(output_folder, exist_ok=True)
-
-            # Change the extraction algorithm to use ResiliparseExtractor
-            extraction_algorithm = ResiliparseExtractor()
-
-            # Download and extract the Common Crawl data using the Resiliparse extraction algorithm.
-            # The function returns a DocumentDataset that contains the extracted documents.
-            common_crawl_dataset = download_common_crawl(
-                output_folder,
-                start_snapshot,
-                end_snapshot,
-                output_type=output_type,
-                algorithm=extraction_algorithm,
-            )
-
-            # Write the extracted dataset to JSON format.
-            # The 'to_json' method writes one JSON document per line,
-            # preserving the original shard information if write_to_filename is True.
-            common_crawl_dataset.to_json(output_path=output_folder, write_to_filename=True)
-            print("Extracted dataset saved to:", output_folder)
-
-        if __name__ == "__main__":
-            main()
+.. code-block:: python
+
+    import os
+    from nemo_curator import get_client
+    from nemo_curator.download import (
+        ResiliparseExtractor,
+        download_common_crawl,
+    )
+    from nemo_curator.datasets import DocumentDataset
+
+    def main():
+        # Initialize a distributed Dask client
+        client = get_client(cluster_type="cpu")
+
+        # Parameters for downloading Common Crawl data.
+        # - output_folder: directory for temporary download/extraction files
+        # - start_snapshot and end_snapshot define the range to fetch
+        # - output_type: specifies file format for the extracted data (e.g., "jsonl")
+        output_folder = "/extracted/output/folder"
+        start_snapshot = "2020-50"
+        end_snapshot = "2021-04"
+        output_type = "jsonl"
+        os.makedirs(output_folder, exist_ok=True)
+
+        # Change the extraction algorithm to use ResiliparseExtractor
+        extraction_algorithm = ResiliparseExtractor()
+
+        # Download and extract the Common Crawl data using the Resiliparse extraction algorithm.
+        # The function returns a DocumentDataset that contains the extracted documents.
+        common_crawl_dataset = download_common_crawl(
+            output_folder,
+            start_snapshot,
+            end_snapshot,
+            output_type=output_type,
+            algorithm=extraction_algorithm,
+        )
+
+        # Write the extracted dataset to JSON format.
+        # The 'to_json' method writes one JSON document per line,
+        # preserving the original shard information if write_to_filename is True.
+        common_crawl_dataset.to_json(output_path=output_folder, write_to_filename=True)
+        print("Extracted dataset saved to:", output_folder)
+
+    if __name__ == "__main__":
+        main()
 
     Above, we changed the extraction algorithm from the default ``JusTextExtractor``.
 

From 5b0e417272d185f47ea6e6e29c2fe9504de9a8b6 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Tue, 18 Feb 2025 10:16:00 -0800
Subject: [PATCH 3/4] more spacing

Signed-off-by: Sarah Yurick
---
 docs/user-guide/download.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/user-guide/download.rst b/docs/user-guide/download.rst
index 56654a44..b47f6ce5 100644
--- a/docs/user-guide/download.rst
+++ b/docs/user-guide/download.rst
@@ -75,6 +75,7 @@ By "extraction", we typically mean the process of converting a data format from
     if __name__ == "__main__":
         main()
 
+
     * ``"/extracted/output/folder"`` is the path to on your local filesystem where the final extracted files will be placed.
     * ``"2020-50"`` is the first common crawl snapshot that will be included in the download. **Note:** Not every year and week has a snapshot. Ensure that your range includes at least one valid Common Crawl snapshot. A list of valid Common Crawl snapshots can be found `here `_.
     * ``"2021-04"`` is the last common crawl snapshot that will be included in the download.
@@ -128,6 +129,7 @@ You can choose to modify the HTML text extraction algorithm used in ``download_c
     if __name__ == "__main__":
         main()
 
+
     Above, we changed the extraction algorithm from the default ``JusTextExtractor``.
 
     The return value ``common_crawl`` will be in NeMo Curator's standard ``DocumentDataset`` format. Check out the function's docstring for more parameters you can use.
From 07a7a78fd6ae78969f91fe7c8bfe4c5187e9a8db Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Tue, 18 Feb 2025 10:18:26 -0800
Subject: [PATCH 4/4] more spacing

Signed-off-by: Sarah Yurick
---
 docs/user-guide/download.rst | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/docs/user-guide/download.rst b/docs/user-guide/download.rst
index b47f6ce5..72af8e61 100644
--- a/docs/user-guide/download.rst
+++ b/docs/user-guide/download.rst
@@ -75,11 +75,10 @@ By "extraction", we typically mean the process of converting a data format from
     if __name__ == "__main__":
         main()
 
-
-    * ``"/extracted/output/folder"`` is the path to on your local filesystem where the final extracted files will be placed.
-    * ``"2020-50"`` is the first common crawl snapshot that will be included in the download. **Note:** Not every year and week has a snapshot. Ensure that your range includes at least one valid Common Crawl snapshot. A list of valid Common Crawl snapshots can be found `here `_.
-    * ``"2021-04"`` is the last common crawl snapshot that will be included in the download.
-    * ``output_type="jsonl"`` is the file format that will be used for storing the data on disk. Currently ``"jsonl"`` and ``"parquet"`` are supported.
+* ``"/extracted/output/folder"`` is the path on your local filesystem where the final extracted files will be placed.
+* ``"2020-50"`` is the first common crawl snapshot that will be included in the download. **Note:** Not every year and week has a snapshot. Ensure that your range includes at least one valid Common Crawl snapshot. A list of valid Common Crawl snapshots can be found `here `_.
+* ``"2021-04"`` is the last common crawl snapshot that will be included in the download.
+* ``output_type="jsonl"`` is the file format that will be used for storing the data on disk. Currently ``"jsonl"`` and ``"parquet"`` are supported.
 
 You can choose to modify the HTML text extraction algorithm used in ``download_common_crawl``. See an example below.
 
@@ -129,12 +128,11 @@ You can choose to modify the HTML text extraction algorithm used in ``download_c
     if __name__ == "__main__":
         main()
 
+Above, we changed the extraction algorithm from the default ``JusTextExtractor``.
 
-    Above, we changed the extraction algorithm from the default ``JusTextExtractor``.
-
-    The return value ``common_crawl`` will be in NeMo Curator's standard ``DocumentDataset`` format. Check out the function's docstring for more parameters you can use.
+The return value ``common_crawl_dataset`` will be in NeMo Curator's standard ``DocumentDataset`` format. Check out the function's docstring for more parameters you can use.
 
-    NeMo Curator's Common Crawl extraction process looks like this under the hood:
+NeMo Curator's Common Crawl extraction process looks like this under the hood:
 
 1. Decode the HTML within the record from binary to text.
 2. If the HTML can be properly decoded, then with `pyCLD2 `_, perform language detection on the input HTML.
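
For reference, steps 1 and 2 above can be sketched directly with the ``pycld2`` package referenced in the documentation. This is only an illustrative sketch of the decode-and-detect flow, not NeMo Curator's actual implementation; the ``raw_html`` sample bytes and the UTF-8 assumption are stand-ins.

.. code-block:: python

    import pycld2 as cld2

    # Stand-in for the binary HTML payload of a single Common Crawl record.
    raw_html = b"<html><body><p>Common Crawl stores petabytes of web data.</p></body></html>"

    # Step 1: decode the record from binary to text. Real records may declare
    # other encodings; UTF-8 with replacement characters is assumed here.
    html = raw_html.decode("utf-8", errors="replace")

    # Step 2: run pyCLD2 language detection on the decoded text.
    # detect() returns (is_reliable, bytes_found, details), where details
    # lists the top detected languages, e.g. ("ENGLISH", "en", 95, 1157.0).
    is_reliable, bytes_found, details = cld2.detect(html)
    print(is_reliable, details[0][0])

In the actual pipeline, this decoding and language detection happens inside the download-and-extract step shown earlier, so you normally do not call ``pycld2`` yourself.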