diff --git a/.github/workflows/gpuci.yml b/.github/workflows/gpuci.yml index 6f14b414..26699e3d 100644 --- a/.github/workflows/gpuci.yml +++ b/.github/workflows/gpuci.yml @@ -7,12 +7,12 @@ on: pull_request: branches: # We can run gpuCI on any PR targeting these branches - - 'main' - - '[rv][0-9].[0-9].[0-9]' - - '[rv][0-9].[0-9].[0-9]rc[0-9]' + - "main" + - "[rv][0-9].[0-9].[0-9]" + - "[rv][0-9].[0-9].[0-9]rc[0-9]" # PR has to be labeled with "gpuCI" label # If new commits are added, the "gpuCI" label has to be removed and re-added to rerun gpuCI - types: [ labeled ] + types: [labeled] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -40,50 +40,52 @@ jobs: # This is the tag on our Azure runner found in Actions -> Runners -> Self-hosted runners # It has 2 A100 GPUs runs-on: self-hosted-azure + # Unit tests shouldn't take longer than 30minutes + timeout-minutes: 30 # "run-gpu-tests" job is run if the "gpuci" label is added to the PR if: ${{ github.event.label.name == 'gpuci' || github.ref == 'refs/heads/main' }} steps: # If something went wrong during the last cleanup, this step ensures any existing container is removed - - name: Remove existing container if it exists - run: | - if [ "$(docker ps -aq -f name=nemo-curator-container)" ]; then - docker rm -f nemo-curator-container - fi + - name: Remove existing container if it exists + run: | + if [ "$(docker ps -aq -f name=nemo-curator-container)" ]; then + docker rm -f nemo-curator-container + fi - # This runs the container which was pushed by build-container, which we call "nemo-curator-container" - # `--gpus all` ensures that all of the GPUs from our self-hosted-azure runner are available in the container - # We use "github.run_id" to identify the PR with the commits we want to run the PyTests with - # `bash -c "sleep infinity"` keeps the container running indefinitely without exiting - - name: Run Docker container - run: | - docker run --gpus all --name nemo-curator-container -d nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} bash -c "sleep infinity" + # This runs the container which was pushed by build-container, which we call "nemo-curator-container" + # `--gpus all` ensures that all of the GPUs from our self-hosted-azure runner are available in the container + # We use "github.run_id" to identify the PR with the commits we want to run the PyTests with + # `bash -c "sleep infinity"` keeps the container running indefinitely without exiting + - name: Run Docker container + run: | + docker run --gpus all --name nemo-curator-container -d nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} bash -c "sleep infinity" - # Expect `whoami` to be "azureuser" - # Expect `nvidia-smi` to show our 2 A100 GPUs - - name: Check GPUs - run: | - whoami - docker exec nemo-curator-container nvidia-smi + # Expect `whoami` to be "azureuser" + # Expect `nvidia-smi` to show our 2 A100 GPUs + - name: Check GPUs + run: | + whoami + docker exec nemo-curator-container nvidia-smi - # In the virtual environment (called "curator") we created in the container, - # list all of our packages. Useful for debugging - - name: Verify installations - run: | - docker exec nemo-curator-container pip list + # In the virtual environment (called "curator") we created in the container, + # list all of our packages. 
Useful for debugging + - name: Verify installations + run: | + docker exec nemo-curator-container pip list - # In the virtual environment (called "curator") we created in the container, - # run our PyTests marked with `@pytest.mark.gpu` - # We specify the `rootdir` to help locate the "pyproject.toml" file (which is in the root directory of the repository), - # and then the directory where the PyTests are located - - name: Run PyTests with GPU mark - run: | - docker exec nemo-curator-container pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests + # In the virtual environment (called "curator") we created in the container, + # run our PyTests marked with `@pytest.mark.gpu` + # We specify the `rootdir` to help locate the "pyproject.toml" file (which is in the root directory of the repository), + # and then the directory where the PyTests are located + - name: Run PyTests with GPU mark + run: | + docker exec nemo-curator-container pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests - # After running `docker stop`, the container remains in an exited state - # It is still present on our system and could be restarted with `docker start` - # Thus, we use `docker rm` to permanently removed it from the system - - name: Cleanup - if: always() - run: | - docker stop nemo-curator-container && docker rm nemo-curator-container + # After running `docker stop`, the container remains in an exited state + # It is still present on our system and could be restarted with `docker start` + # Thus, we use `docker rm` to permanently removed it from the system + - name: Cleanup + if: always() + run: | + docker stop nemo-curator-container && docker rm nemo-curator-container diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml index 69861dcc..fb86f4da 100644 --- a/.github/workflows/release-freeze.yml +++ b/.github/workflows/release-freeze.yml @@ -1,4 +1,4 @@ -name: 'Code freeze' +name: "Code freeze" on: workflow_dispatch: @@ -9,14 +9,20 @@ on: options: - major - minor + freeze-commit: + type: string + description: Commit SHA to use for cut-off + required: false + default: main jobs: code-freeze: - uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_code_freeze.yml@v0.12.0 + uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_code_freeze.yml@v0.21.6 with: library-name: NeMo Curator python-package: nemo_curator release-type: ${{ inputs.release-type }} - + freeze-commit: ${{ inputs.freeze-commit }} secrets: SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }} + SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }} diff --git a/README.md b/README.md index 77b32836..e5fc03c0 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ This section explains how to install NeMo Curator and use the Python library, Py Before installing NeMo Curator, ensure that the following requirements are met: - Python 3.10 or higher + - packaging >= 22.0 - Ubuntu 22.04/20.04 - NVIDIA GPU (optional) - Volta™ or higher ([compute capability 7.0+](https://developer.nvidia.com/cuda-gpus)) @@ -187,7 +188,11 @@ The following figure shows that the use of different data curation modules imple drawing

-In terms of scalability and compute performance, using the combination of RAPIDS and Dask fuzzy deduplication enabled us to deduplicate the 1.1 Trillion token Red Pajama dataset in 1.8 hours with 64 NVIDIA A100 Tensor Core GPUs. +In terms of scalability and compute performance, using the combination of RAPIDS and Dask fuzzy deduplication enabled us to deduplicate the 1.96 Trillion token subset of the RedPajama V2 dataset in 0.5 hours with 32 NVIDIA H100 GPUs. + +Processing Time | Comparison to Alternative Libraries +:-------------------------:|:---------------------------------------: +![](./docs/user-guide/assets/readme/fuzzy-dedup-processing-time.png) | ![](./docs/user-guide/assets/readme/fuzzy-dedup-processing-optimization-16x.png) Additionally, using the CPU-based modules, the following table shows the time required and resulting data size reduction for each processing step [Common Crawl snapshot from November/December of 2020](https://commoncrawl.org/2020/12/nov-dec-2020-crawl-archive-now-available/) using 30 CPU nodes (with hardware similar to the `c5.24xlarge` [Amazon AWS C5 instance](https://aws.amazon.com/ec2/instance-types/c5/)). diff --git a/conftest.py b/conftest.py index 451ae5af..7aa3adb0 100644 --- a/conftest.py +++ b/conftest.py @@ -1,4 +1,11 @@ import pytest +from dask.distributed import Client + +from nemo_curator.utils.import_utils import gpu_only_import, gpu_only_import_from + +cudf = gpu_only_import("cudf") +dask_cudf = gpu_only_import("dask_cudf") +LocalCUDACluster = gpu_only_import_from("dask_cuda", "LocalCUDACluster") def pytest_addoption(parser): @@ -13,3 +20,16 @@ def pytest_collection_modifyitems(config, items): for item in items: if "gpu" in item.keywords: item.add_marker(skip_gpu) + + +@pytest.fixture(autouse=True, scope="session") +def gpu_client(request): + if not request.config.getoption("--cpu"): + with LocalCUDACluster(n_workers=1) as cluster, Client(cluster) as client: + request.session.client = client + request.session.cluster = cluster + yield client + client.close() + cluster.close() + else: + yield None diff --git a/docs/user-guide/api/classifiers.rst b/docs/user-guide/api/classifiers.rst index 8d5da2ea..1dad8e23 100644 --- a/docs/user-guide/api/classifiers.rst +++ b/docs/user-guide/api/classifiers.rst @@ -14,6 +14,12 @@ Classifiers .. autoclass:: nemo_curator.classifiers.FineWebEduClassifier :members: +.. autoclass:: nemo_curator.classifiers.FineWebMixtralEduClassifier + :members: + +.. autoclass:: nemo_curator.classifiers.FineWebNemotronEduClassifier + :members: + .. autoclass:: nemo_curator.classifiers.AegisClassifier :members: diff --git a/docs/user-guide/api/filters.rst b/docs/user-guide/api/filters.rst index 55b78ed7..24678b73 100644 --- a/docs/user-guide/api/filters.rst +++ b/docs/user-guide/api/filters.rst @@ -152,6 +152,14 @@ Heuristic Filters :members: :member-order: bysource +.. autoclass:: nemo_curator.filters.TokenCountFilter + :members: + :member-order: bysource + +.. autoclass:: nemo_curator.filters.SubstringFilter + :members: + :member-order: bysource + ------------------------------ Code Filters ------------------------------ diff --git a/docs/user-guide/api/misc.rst b/docs/user-guide/api/misc.rst index b4785f02..9872cb85 100644 --- a/docs/user-guide/api/misc.rst +++ b/docs/user-guide/api/misc.rst @@ -15,3 +15,9 @@ Miscellaneous .. autoclass:: nemo_curator.Shuffle :members: + +.. autoclass:: nemo_curator.DocumentSplitter + :members: + +.. 
autoclass:: nemo_curator.DocumentJoiner + :members: diff --git a/docs/user-guide/api/modifiers.rst b/docs/user-guide/api/modifiers.rst index 6e5f506e..252803a2 100644 --- a/docs/user-guide/api/modifiers.rst +++ b/docs/user-guide/api/modifiers.rst @@ -32,3 +32,22 @@ Modifiers .. autoclass:: nemo_curator.modifiers.PiiModifier :members: + +.. autoclass:: nemo_curator.modifiers.LineRemover + :members: + +.. autoclass:: nemo_curator.modifiers.MarkdownRemover + :members: + +.. autoclass:: nemo_curator.modifiers.NewlineNormalizer + :members: + +.. autoclass:: nemo_curator.modifiers.UrlRemover + :members: + +.. autoclass:: nemo_curator.modifiers.Slicer + :members: + +.. autoclass:: nemo_curator.modifiers.QuotationRemover + :members: + diff --git a/docs/user-guide/api/synthetic.rst b/docs/user-guide/api/synthetic.rst index 685656b4..4e13e64b 100644 --- a/docs/user-guide/api/synthetic.rst +++ b/docs/user-guide/api/synthetic.rst @@ -8,6 +8,18 @@ Synthetic Data .. autoclass:: nemo_curator.synthetic.AsyncNemotronGenerator :members: +.. autoclass:: nemo_curator.synthetic.NemotronCCGenerator + :members: + +.. autoclass:: nemo_curator.synthetic.NemotronCCDiverseQAPostprocessor + :members: + +.. autoclass:: nemo_curator.synthetic.NemotronCCKnowledgeListPostprocessor + :members: + +.. autoclass:: nemo_curator.synthetic.AsyncNemotronGenerator + :members: + .. autoclass:: nemo_curator.synthetic.NemotronFormatter :members: diff --git a/docs/user-guide/assets/readme/fuzzy-dedup-processing-optimization-16x.png b/docs/user-guide/assets/readme/fuzzy-dedup-processing-optimization-16x.png new file mode 100644 index 00000000..093e3771 Binary files /dev/null and b/docs/user-guide/assets/readme/fuzzy-dedup-processing-optimization-16x.png differ diff --git a/docs/user-guide/assets/readme/fuzzy-dedup-processing-time.png b/docs/user-guide/assets/readme/fuzzy-dedup-processing-time.png new file mode 100644 index 00000000..eb0b33c5 Binary files /dev/null and b/docs/user-guide/assets/readme/fuzzy-dedup-processing-time.png differ diff --git a/docs/user-guide/cpuvsgpu.rst b/docs/user-guide/cpuvsgpu.rst index bdc3e483..096ba28c 100644 --- a/docs/user-guide/cpuvsgpu.rst +++ b/docs/user-guide/cpuvsgpu.rst @@ -71,6 +71,7 @@ The following NeMo Curator modules are GPU based. * Quality Classification * AEGIS and Instruction Data Guard Safety Models * FineWeb Educational Content Classification + * FineWeb Mixtral and FineWeb Nemotron-4 Educational Models * Content Type Classification * Prompt Task and Complexity Classification diff --git a/docs/user-guide/distributeddataclassification.rst b/docs/user-guide/distributeddataclassification.rst index 389e8ef1..d8021de2 100644 --- a/docs/user-guide/distributeddataclassification.rst +++ b/docs/user-guide/distributeddataclassification.rst @@ -31,6 +31,10 @@ Here, we summarize why each is useful for training an LLM: - The **FineWeb Educational Content Classifier** focuses on identifying and prioritizing educational material within datasets. This classifier is especially useful for training LLMs on specialized educational content, which can improve their performance on knowledge-intensive tasks. Models trained on high-quality educational content demonstrate enhanced capabilities on academic benchmarks such as MMLU and ARC, showcasing the classifier's impact on improving the knowledge-intensive task performance of LLMs. +- The **FineWeb Mixtral Educational Classifier** is designed to determine the educational value (score 0-5 from low to high). 
It is similar to the FineWeb-Edu classifier and was trained on the same text samples, but using annotations from Mixtral 8x22B-Instruct. + +- The **FineWeb Nemotron-4 Educational Classifier** is designed to determine the educational value (score 0-5 from low to high). It is similar to the FineWeb-Edu classifier and was trained on the same text samples, but using annotations from Nemotron-4-340B-Instruct. + +- The **Content Type Classifier** is designed to categorize documents into one of 11 distinct speech types based on their content. It analyzes and understands the nuances of textual information, enabling accurate classification across a diverse range of content types. - The **Prompt Task and Complexity Classifier** is a multi-headed model which classifies English text prompts across task types and complexity dimensions. @@ -236,6 +240,92 @@ For example, to create a dataset with only highly educational content (scores 4 high_edu_dataset = result_dataset[result_dataset["fineweb-edu-score-int"] >= 4] high_edu_dataset.to_json("high_educational_content/") +FineWeb Mixtral Edu Classifier +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The FineWeb Mixtral Edu Classifier is designed to identify and prioritize educational content within a dataset. +It is similar to the FineWeb-Edu classifier and was trained on the same text samples, but using annotations from Mixtral 8x22B-Instruct. +In contrast, the original FineWeb-Edu classifier was trained using annotations from Llama 3 70B-Instruct. +This classifier was used as part of a classifier ensemble in the creation of the `Nemotron-CC dataset `_. +These datasets can be used to train LLMs with a focus on educational content, potentially improving their performance on knowledge-intensive tasks. + +To use the FineWeb Mixtral Edu Classifier, you can follow this example: + +.. code-block:: python + + from nemo_curator.classifiers import FineWebMixtralEduClassifier + + files = get_all_files_paths_under("web_documents/") + input_dataset = DocumentDataset.read_json(files, backend="cudf") + + classifier = FineWebMixtralEduClassifier( + batch_size=256, + text_field="text", + pred_column="fineweb-mixtral-edu-score", + int_column="fineweb-mixtral-edu-score-int", + quality_label_column="fineweb-mixtral-edu-score-label", + ) + result_dataset = classifier(dataset=input_dataset) + + result_dataset.to_json("educational_content/") + +This classifier uses a model based on the `Snowflake Arctic-embed-m `_ embedding model with a linear regression layer on top. +It assigns an educational score to each document on a scale from 0 to 5, where higher scores indicate more educational content. + +The ``pred_column`` will contain the raw floating-point scores, while the ``int_column`` will contain the rounded integer scores. +The ``quality_label_column`` identifies text as high quality if it scores higher than 2.5 and low quality otherwise. +You can filter the results based on these scores to create datasets with varying levels of educational content. + +For example, to create a dataset with only highly educational content (scores 4 and 5): + +.. code-block:: python + + high_edu_dataset = result_dataset[result_dataset["fineweb-mixtral-edu-score-int"] >= 4] + high_edu_dataset.to_json("high_educational_content/") + +FineWeb Nemotron-4 Edu Classifier +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The FineWeb Nemotron-4 Edu Classifier is designed to identify and prioritize educational content within a dataset.
+It is similar to the FineWeb-Edu classifier and was trained on the same text samples, but using annotations from Nemotron-4-340B-Instruct. +In contrast, the original FineWeb-Edu classifier was trained using annotations from Llama 3 70B-Instruct. +This classifier was used as part of a classifier ensemble in the creation of the `Nemotron-CC dataset `_. +These datasets can be used to train LLMs with a focus on educational content, potentially improving their performance on knowledge-intensive tasks. + +To use the FineWeb Nemotron-4 Edu Classifier, you can follow this example: + +.. code-block:: python + + from nemo_curator.classifiers import FineWebNemotronEduClassifier + + files = get_all_files_paths_under("web_documents/") + input_dataset = DocumentDataset.read_json(files, backend="cudf") + + classifier = FineWebNemotronEduClassifier( + batch_size=256, + text_field="text", + pred_column="fineweb-nemotron-edu-score", + int_column="fineweb-nemotron-edu-score-int", + quality_label_column="fineweb-nemotron-edu-score-label", + ) + result_dataset = classifier(dataset=input_dataset) + + result_dataset.to_json("educational_content/") + +This classifier uses a model based on the `Snowflake Arctic-embed-m `_ embedding model with a linear regression layer on top. +It assigns an educational score to each document on a scale from 0 to 5, where higher scores indicate more educational content. + +The ``pred_column`` will contain the raw floating-point scores, while the ``int_column`` will contain the rounded integer scores. +The ``quality_label_column`` identifies text as high quality if it scores higher than 2.5 and low quality otherwise. +You can filter the results based on these scores to create datasets with varying levels of educational content. + +For example, to create a dataset with only highly educational content (scores 4 and 5): + +.. code-block:: python + + high_edu_dataset = result_dataset[result_dataset["fineweb-nemotron-edu-score-int"] >= 4] + high_edu_dataset.to_json("high_educational_content/") + Content Type Classifier DeBERTa ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/user-guide/gpudeduplication.rst b/docs/user-guide/gpudeduplication.rst index 38adbfd6..c5cfd4d1 100644 --- a/docs/user-guide/gpudeduplication.rst +++ b/docs/user-guide/gpudeduplication.rst @@ -63,14 +63,19 @@ After ensuring your dataset has a unique ID field (or creating one with the code from nemo_curator.datasets import DocumentDataset # Initialize the deduplication object - ExactDups = ExactDuplicates(id_field="my_id", text_field="text") + exact_duplicates = ExactDuplicates( + id_field="my_id", + text_field="text", + perform_removal=True, + cache_dir="/path/to/dedup_outputs", # Recommended to specify a cache_dir if perform_removal=True + ) dataset = DocumentDataset.read_parquet( input_files="/path/to/parquet/data", backend="cudf", # or "pandas" for CPU ) - - duplicate_docs = ExactDups(dataset) + # Users who have specified perform_removal=False can split as following + duplicate_docs = exact_duplicates.identify_duplicates(dataset) """ Sample output: @@ -82,9 +87,14 @@ After ensuring your dataset has a unique ID field (or creating one with the code 107 doc_prefix-52271 0f763a2937d57b9d96bf9f220e55f2bd """ + deduplicated_dataset = exact_duplicates.remove(dataset, duplicate_docs) + + # Users who have specified perform_removal=True can get the output deduplicated dataset directly as follows + # deduplicated_dataset = exact_duplicates(dataset) + + .. 
tip:: - A more comprehensive example, including how to remove documents from a corpus using the list of - duplicate IDs generated from the exact deduplication step above, can be found in `examples/exact_deduplication.py `_. + A more comprehensive example, can be found in `examples/exact_deduplication.py `_. """""""""""" CLI Utility @@ -187,6 +197,7 @@ Python API cache_dir="/path/to/dedup_outputs", # must be cleared between runs id_field="my_id", text_field="text", + perform_removal=False, # dictates if deduplicated dataset or IDs of duplicates are returned seed=42, char_ngrams=24, num_buckets=20, @@ -203,6 +214,7 @@ Python API cache_dir: /path/to/dedup_outputs id_field: my_id text_field: text + perform_removal: False seed: 42 char_ngrams: 24 num_buckets: 20 @@ -226,14 +238,15 @@ Python API from nemo_curator.datasets import DocumentDataset # Initialize the deduplication object - FuzzyDups = FuzzyDuplicates(config=config, logger="./") + fuzzy_duplicates = FuzzyDuplicates(config=config, logger="./") dataset = DocumentDataset.read_json( input_files="/path/to/jsonl/data", backend="cudf", # FuzzyDuplicates only supports datasets with the cuDF backend. ) - duplicate_docs = FuzzyDups(dataset) + # Users who have specified perform_removal=False can split as following + duplicate_docs = fuzzy_duplicates.identify_duplicates(dataset) """ Sample output: my_id group @@ -244,10 +257,15 @@ Python API 4 doc_prefix-42050 154 """ + deduplicated_dataset = fuzzy_duplicates.remove(dataset, duplicate_docs) + + # Users who have specified perform_removal=True can get the output deduplicated dataset directly as follows + # deduplicated_dataset = fuzzy_duplicates(dataset) + + .. tip:: - - A more comprehensive example for the above, including how to remove documents from a corpus using the list of - duplicate IDs generated from fuzzy deduplication, can be found in `examples/fuzzy_deduplication.py `_. + - A comprehensive example can be found in `examples/fuzzy_deduplication.py `_. - The default values of ``num_buckets`` and ``hashes_per_bucket`` are set to find documents with an approximately Jaccard similarity of 0.8 or above. - Higher ``buckets_per_shuffle`` values can lead to better performance but might lead to out of memory errors. - Setting the ``false_positive_check`` flag to ``False`` is ideal for optimal performance. diff --git a/docs/user-guide/image/gettingstarted.rst b/docs/user-guide/image/gettingstarted.rst index 49248bc7..075b3040 100644 --- a/docs/user-guide/image/gettingstarted.rst +++ b/docs/user-guide/image/gettingstarted.rst @@ -13,6 +13,7 @@ Install NeMo Curator To install the image curation modules of NeMo Curator, ensure you meet the following requirements: * Python 3.10 or higher + * packaging >= 22.0 * Ubuntu 22.04/20.04 * NVIDIA GPU * Volta™ or higher (compute capability 7.0+) diff --git a/docs/user-guide/syntheticdata.rst b/docs/user-guide/syntheticdata.rst index d082ae5f..43c017c2 100644 --- a/docs/user-guide/syntheticdata.rst +++ b/docs/user-guide/syntheticdata.rst @@ -15,6 +15,7 @@ Furthermore, NeMo Curator can also interface with `NeMo's Export and Deploy `_. +It also now supports the pipelines used in generating `Nemotron-CC `_. Additionally, you can seamlessly integrate filtering and deduplication steps in your synthetic data pipeline with the other modules available in NeMo Curator. 
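A minimal sketch of such an integration follows; it assumes the generated records have already been collected into a pandas ``DataFrame`` with a ``text`` column, and the word-count threshold is purely illustrative rather than a value prescribed by the library:

.. code-block:: python

    import pandas as pd

    from nemo_curator import ScoreFilter
    from nemo_curator.datasets import DocumentDataset
    from nemo_curator.filters import WordCountFilter

    # Hypothetical synthetic records produced by one of the pipelines described below
    synthetic_df = pd.DataFrame({"text": ["The moon is bright. It shines at night."]})
    synthetic_dataset = DocumentDataset.from_pandas(synthetic_df)

    # Drop generations that are too short to be useful (threshold is illustrative)
    filter_step = ScoreFilter(WordCountFilter(min_words=5), text_field="text")
    filtered_dataset = filter_step(synthetic_dataset)

    filtered_dataset.to_json("filtered_synthetic_data/")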
Connect to an LLM Service @@ -690,6 +691,295 @@ All of the code so far has been sending requests to the LLM service synchronousl As you can see, the asynchronous modules have the same interface as the synchronous modules. The only exception is that a ``max_concurrent_requests`` parameter can be supplied to the constructor of ``AsyncNemotronGenerator`` as a form of rate limiting if your service is rate limited. +Customize the Nemotron-CC Pipeline +----------------------------------- + +Nemotron-CC is an open, large, high-quality English Common Crawl dataset that enables pretraining highly accurate LLMs over both short and long token horizons. + +You can use the Nemotron-CC pipeline collection to rewrite reference documents into different formats and styles. For example, you can rephrase short sentences with simple diction into technical, scholarly prose (like Wikipedia) or distill wandering paragraphs into condensed bulleted lists. + +NeMo Curator provides two versions of each pipeline: + +* **Synchronous**: ``nemo_curator.synthetic.NemotronCCGenerator`` +* **Asynchronous**: ``nemo_curator.synthetic.AsyncNemotronCCGenerator`` + +Rewrite to Wikipedia Style +########################## + +Use the ``NemotronCCGenerator.rewrite_to_wikipedia_style`` method to rewrite a document into a style that is similar to Wikipedia in terms of line spacing, punctuation, and style. + +.. code-block:: python + + from openai import OpenAI + from nemo_curator import OpenAIClient + from nemo_curator.synthetic import NemotronCCGenerator + + openai_client = OpenAI( + base_url="https://integrate.api.nvidia.com/v1", + api_key="" + ) + client = OpenAIClient(openai_client) + generator = NemotronCCGenerator(client) + + document = "The moon is bright. It shines at night." + model = "nv-mistralai/mistral-nemo-12b-instruct" + model_kwargs = { + "temperature": 0.5, + "top_p": 0.9, + "max_tokens": 512, + } + + responses = generator.rewrite_to_wikipedia_style( + document=document, model=model, model_kwargs=model_kwargs + ) + + print(responses[0]) + # Output: + # The lunar surface has a high albedo, which means it reflects a significant amount of sunlight. + + +Generate Diverse QA Pairs +######################### + +Use the ``NemotronCCGenerator.generate_diverse_qa`` method to generate a list of diverse QA pairs from a document. + +.. code-block:: python + + from openai import OpenAI + from nemo_curator import OpenAIClient + from nemo_curator.synthetic import NemotronCCGenerator + + openai_client = OpenAI( + base_url="https://integrate.api.nvidia.com/v1", + api_key="" + ) + client = OpenAIClient(openai_client) + generator = NemotronCCGenerator(client) + + document = "The moon is bright. It shines at night." + model = "nv-mistralai/mistral-nemo-12b-instruct" + model_kwargs = { + "temperature": 0.5, + "top_p": 0.9, + "max_tokens": 600, + } + + responses = generator.generate_diverse_qa( + document=document, model=model, model_kwargs=model_kwargs + ) + + print(responses[0]) + # Output: + # Question: What is the moon made of? + # Answer: The moon is made of rock and dust. + + +Postprocessor +^^^^^^^^^^^^^ + +You can optionally use the ``NemotronCCDiverseQAPostprocessor`` class to reformat the output. + +.. 
code-block:: python + + import pandas as pd + from openai import OpenAI + from nemo_curator import OpenAIClient + from nemo_curator.datasets import DocumentDataset + from nemo_curator.synthetic import NemotronCCGenerator, NemotronCCDiverseQAPostprocessor + + openai_client = OpenAI( + base_url="https://integrate.api.nvidia.com/v1", + api_key="" + ) + client = OpenAIClient(openai_client) + generator = NemotronCCGenerator(client) + + document = "The moon is bright. It shines at night." + model = "nv-mistralai/mistral-nemo-12b-instruct" + model_kwargs = { + "temperature": 0.5, + "top_p": 0.9, + "max_tokens": 600, + } + responses = generator.generate_diverse_qa(document=document, model=model, model_kwargs=model_kwargs) + postprocessor = NemotronCCDiverseQAPostprocessor(text_field="text", response_field="diverse_qa_response") + dataset = DocumentDataset.from_pandas(pd.DataFrame({"text": document, "diverse_qa_response": responses})) + + # This postprocessor will sample a random number of QA pairs up to max_num_pairs. + # If a tokenizer is provided, the number of QA pairs will be sampled from at least + # 1 and at most floor(max_num_pairs * num_tokens / 150). + # Otherwise, the number of QA pairs will be sampled randomly strictly up to max_num_pairs. + # The generated QA pairs are shuffled and then appended to the original text. + cleaned_dataset = postprocessor(dataset) + + first_entry = cleaned_dataset.df.head(1) + print(first_entry["diverse_qa_response"]) + # Output: + # The moon is bright. It shines at night. Question: What is the moon made of? Answer: The moon is made of rock and dust. + + +Generate Knowledge List +####################### + +Use the ``NemotronCCGenerator.generate_knowledge_list`` method to generate a list of knowledge from a document. + +.. code-block:: python + + from openai import OpenAI + from nemo_curator import OpenAIClient + from nemo_curator.synthetic import NemotronCCGenerator + + openai_client = OpenAI( + base_url="https://integrate.api.nvidia.com/v1", + api_key="" + ) + client = OpenAIClient(openai_client) + generator = NemotronCCGenerator(client) + + document = "The moon is bright. It shines at night." + model = "nv-mistralai/mistral-nemo-12b-instruct" + model_kwargs = { + "temperature": 0.5, + "top_p": 0.9, + "max_tokens": 600, + } + + responses = generator.generate_knowledge_list( + document=document, model=model, model_kwargs=model_kwargs + ) + + print(responses[0]) + # Output: + # - The moon is made of rock and dust. + # - The moon is the only natural satellite of the Earth. + # ... + +Postprocessor +^^^^^^^^^^^^^ + +You can optionally use the ``NemotronCCKnowledgeListPostprocessor`` class to reformat the output. + +.. code-block:: python + + import pandas as pd + from openai import OpenAI + + from nemo_curator import OpenAIClient + from nemo_curator.datasets import DocumentDataset + from nemo_curator.synthetic import NemotronCCGenerator, NemotronCCKnowledgeListPostprocessor + + openai_client = OpenAI( + base_url="https://integrate.api.nvidia.com/v1", + api_key="" + ) + client = OpenAIClient(openai_client) + generator = NemotronCCGenerator(client) + + document = "The moon is bright. It shines at night." + model = "nv-mistralai/mistral-nemo-12b-instruct" + model_kwargs = { + "temperature": 0.5, + "top_p": 0.9, + "max_tokens": 600, + } + + responses = generator.generate_knowledge_list( + document=document, model=model, model_kwargs=model_kwargs + ) + + print(responses[0]) + # Output: + # - The moon is made of rock and dust. 
+ # - The moon is the only natural satellite of the Earth. + # ... + + postprocessor = NemotronCCKnowledgeListPostprocessor(text_field="knowledge_list_response") + dataset = DocumentDataset.from_pandas(pd.DataFrame({"knowledge_list_response": responses})) + + # This postprocessor removes formatting artifacts + # such as bullet point prefixes ("- ") and extra indentation from each line, + # ensuring that the final output is a clean, uniformly formatted list of knowledge items. + # The processing includes skipping any initial non-bullet line and merging related lines + # to reconstruct multi-line questions or answers. + cleaned_dataset = postprocessor(dataset) + + first_entry = cleaned_dataset.df.head(1) + print(first_entry["knowledge_list_response"]) + # Output: + # The moon is made of rock and dust. + # The moon is the only natural satellite of the Earth. + +Distill Document +################# + +Use the ``NemotronCCGenerator.distill`` method to make a document more concise. + +.. code-block:: python + + from openai import OpenAI + from nemo_curator import OpenAIClient + from nemo_curator.synthetic import NemotronCCGenerator + + openai_client = OpenAI( + base_url="https://integrate.api.nvidia.com/v1", + api_key="" + ) + client = OpenAIClient(openai_client) + generator = NemotronCCGenerator(client) + + document = "The moon is bright. It shines at night." + model = "nv-mistralai/mistral-nemo-12b-instruct" + model_kwargs = { + "temperature": 0.5, + "top_p": 0.9, + "max_tokens": 1600, + } + + responses = generator.distill( + document=document, model=model, model_kwargs=model_kwargs + ) + + print(responses[0]) + # Output: + # The moon is bright at night. + + +Extract Knowledge +################ + +Use the ``NemotronCCGenerator.extract_knowledge`` method to extract knowledge from a document. + +.. code-block:: python + + from openai import OpenAI + from nemo_curator import OpenAIClient + from nemo_curator.synthetic import NemotronCCGenerator + + openai_client = OpenAI( + base_url="https://integrate.api.nvidia.com/v1", + api_key="" + ) + client = OpenAIClient(openai_client) + generator = NemotronCCGenerator(client) + + document = ("The moon is bright. It shines at night. I love the moon. I first saw it up" + " close through a telescope in 1999 at a sleepover.") + model = "nv-mistralai/mistral-nemo-12b-instruct" + model_kwargs = { + "temperature": 0.5, + "top_p": 0.9, + "max_tokens": 1400, + } + + responses = generator.extract_knowledge( + document=document, model=model, model_kwargs=model_kwargs + ) + + print(responses[0]) + # Output: + # The moon is a reflective body visible from the Earth at night. + + Combine Synthetic Data Generation with other NeMo Curator Modules ----------------------------------------------------------------- Synthetic data generation, unlike the rest of NeMo Curator, operates independently of Dask. 
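As a rough sketch of how generated responses can be handed back to the Dask-based modules, the snippet below wraps them in a ``DocumentDataset`` and removes exact duplicates using the ``perform_removal=True`` workflow; the ``id`` and ``text`` column names and the sample strings are assumptions for illustration:

.. code-block:: python

    import pandas as pd

    from nemo_curator.datasets import DocumentDataset
    from nemo_curator.modules import ExactDuplicates

    # Hypothetical generated responses; in practice these come from the generator calls above
    records = pd.DataFrame(
        {
            "id": ["gen-0", "gen-1", "gen-2"],
            "text": [
                "The moon is bright at night.",
                "The moon is bright at night.",
                "The moon reflects sunlight.",
            ],
        }
    )
    synthetic_dataset = DocumentDataset.from_pandas(records)

    # perform_removal=True returns the dataset with duplicates already removed
    # (specifying a cache_dir is recommended for larger datasets)
    exact_duplicates = ExactDuplicates(
        id_field="id",
        text_field="text",
        perform_removal=True,
    )
    deduplicated_dataset = exact_duplicates(synthetic_dataset)
    deduplicated_dataset.to_json("deduplicated_synthetic_data/")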
diff --git a/examples/classifiers/README.md b/examples/classifiers/README.md index fad2a691..c086491b 100644 --- a/examples/classifiers/README.md +++ b/examples/classifiers/README.md @@ -8,6 +8,8 @@ The Python scripts in this directory demonstrate how to run classification on yo - AEGIS Safety Models - Instruction Data Guard Model - FineWeb Educational Content Classifier +- FineWeb Mixtral Educational Classifier +- FineWeb Nemotron-4 Educational Classifier - Content Type Classifier - Prompt Task and Complexity Classifier diff --git a/examples/classifiers/fineweb_mixtral_edu_example.py b/examples/classifiers/fineweb_mixtral_edu_example.py new file mode 100644 index 00000000..c38b4eb3 --- /dev/null +++ b/examples/classifiers/fineweb_mixtral_edu_example.py @@ -0,0 +1,64 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time + +from nemo_curator.classifiers import FineWebMixtralEduClassifier +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.script_utils import ArgumentHelper + + +def main(args): + global_st = time.time() + + # Input can be a string or list + input_file_path = "/path/to/data" + output_file_path = "./" + + client_args = ArgumentHelper.parse_client_args(args) + client_args["cluster_type"] = "gpu" + client = get_client(**client_args) + + input_dataset = DocumentDataset.read_json( + input_file_path, backend="cudf", add_filename=True + ) + + fineweb_mixtral_edu_classifier = FineWebMixtralEduClassifier() + result_dataset = fineweb_mixtral_edu_classifier(dataset=input_dataset) + result_dataset.to_json(output_path=output_file_path, write_to_filename=True) + + global_et = time.time() + print( + f"Total time taken for FineWeb Mixtral Edu Classifier inference: {global_et-global_st} s", + flush=True, + ) + + client.close() + + +def attach_args( + parser=argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ), +): + argumentHelper = ArgumentHelper(parser) + argumentHelper.add_distributed_classifier_cluster_args() + + return argumentHelper.parser + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/examples/classifiers/fineweb_nemotron_edu_example.py b/examples/classifiers/fineweb_nemotron_edu_example.py new file mode 100644 index 00000000..3073ac30 --- /dev/null +++ b/examples/classifiers/fineweb_nemotron_edu_example.py @@ -0,0 +1,64 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time + +from nemo_curator.classifiers import FineWebNemotronEduClassifier +from nemo_curator.datasets import DocumentDataset +from nemo_curator.utils.distributed_utils import get_client +from nemo_curator.utils.script_utils import ArgumentHelper + + +def main(args): + global_st = time.time() + + # Input can be a string or list + input_file_path = "/path/to/data" + output_file_path = "./" + + client_args = ArgumentHelper.parse_client_args(args) + client_args["cluster_type"] = "gpu" + client = get_client(**client_args) + + input_dataset = DocumentDataset.read_json( + input_file_path, backend="cudf", add_filename=True + ) + + fineweb_nemotron_edu_classifier = FineWebNemotronEduClassifier() + result_dataset = fineweb_nemotron_edu_classifier(dataset=input_dataset) + result_dataset.to_json(output_path=output_file_path, write_to_filename=True) + + global_et = time.time() + print( + f"Total time taken for FineWeb Nemotron-4 Edu Classifier inference: {global_et-global_st} s", + flush=True, + ) + + client.close() + + +def attach_args( + parser=argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ), +): + argumentHelper = ArgumentHelper(parser) + argumentHelper.add_distributed_classifier_cluster_args() + + return argumentHelper.parser + + +if __name__ == "__main__": + main(attach_args().parse_args()) diff --git a/examples/exact_deduplication.py b/examples/exact_deduplication.py index abb44407..130240a0 100644 --- a/examples/exact_deduplication.py +++ b/examples/exact_deduplication.py @@ -17,8 +17,7 @@ from nemo_curator.datasets import DocumentDataset from nemo_curator.modules import ExactDuplicates -from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk -from nemo_curator.utils.file_utils import get_all_files_paths_under +from nemo_curator.utils.distributed_utils import get_client, write_to_disk from nemo_curator.utils.script_utils import ArgumentHelper @@ -40,34 +39,33 @@ def main(args): client.run(pre_imports) t0 = time.time() - input_dataset = DocumentDataset.read_json(dataset_dir, backend=backend) + input_dataset = DocumentDataset.read_json( + dataset_dir, backend=backend, blocksize="1GiB", files_per_partition=None + ) exact_dup = ExactDuplicates( logger=log_dir, id_field=id_field, text_field=text_field, + # Decides whether output of the module is deduplicated dataset or duplicates + # If true, you should set cache_dir for performance improvement + perform_removal=False, # cache_dir=output_dir # Optionally write the output to disk ) - duplicates = exact_dup(dataset=input_dataset) + # When perform_removal=False, it will only call .identify_duplicates() and return the list of duplicate IDs. + # When perform_removal=True, then exact_dup outputs the dataset with the duplicates removed. + # It will behave by calling .identify_duplicates() and .remove() in sequence. + duplicates = exact_dup( + dataset=input_dataset + ) # or exact_dup.identify_duplicates(input_dataset) # If caching, result is a path to the output dataset. if isinstance(duplicates, str): duplicates = DocumentDataset.read_parquet(duplicates, backend=backend) - # It's easy to apply dataframe operations to the dataset by using the underlying df. 
- - # By default all duplicate id's are included in the result - # keep 1 document from each group of duplcates and mark the others to remove - # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.duplicated.html - docs_to_remove = duplicates.df.map_partitions( - lambda x: x[x._hashes.duplicated(keep="first")] - ) - - # When there are few duplicates we can compute the results to a list and use `isin`. - result = input_dataset.df[ - ~input_dataset.df[id_field].isin(docs_to_remove[id_field].compute()) - ] + # Remove the duplicates from the input dataset and write to Parquet + result = exact_dup.remove(input_dataset, duplicates) write_to_disk(result, output_dir, output_type="parquet") print(time.time() - t0) diff --git a/examples/fuzzy_deduplication.py b/examples/fuzzy_deduplication.py index 42663488..da2482bf 100644 --- a/examples/fuzzy_deduplication.py +++ b/examples/fuzzy_deduplication.py @@ -68,6 +68,8 @@ def main(args): cache_dir=cache_dir, id_field=id_field, text_field=text_field, + # Decides whether output of the module is a deduplicated dataset or the IDs of the duplicates + perform_removal=False, seed=42, char_ngrams=24, num_buckets=20, @@ -77,24 +79,20 @@ def main(args): false_positive_check=False, ) fuzzy_dup = FuzzyDuplicates(logger=log_dir, config=fuzzy_dedup_config) - duplicates = fuzzy_dup(dataset=input_dataset) + + # When perform_removal=False, it will only call .identify_duplicates() and return the list of duplicate IDs. + # When perform_removal=True, then exact_dup outputs the dataset with the duplicates removed. + # It will behave by calling .identify_duplicates() and .remove() in sequence. + duplicates = fuzzy_dup( + dataset=input_dataset + ) # or fuzzy_dup.identify_duplicates(input_dataset) if duplicates is None: print("No duplicates found") print(f"Time taken:{time.time() - t0}s") return - # By default all duplicate id's and the group they belong to are included in the result - # keep 1 document from each group of duplcates and mark the others to remove - # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.duplicated.html - docs_to_remove = duplicates.df.map_partitions( - lambda x: x[x.group.duplicated(keep="first")] - ) - - # When there are few duplicates we can compute the results to a list and use `isin`. - result = input_dataset.df[ - ~input_dataset.df[id_field].isin(docs_to_remove[id_field].compute()) - ] + result = fuzzy_dup.remove(input_dataset, duplicates) write_to_disk(result, output_dir, output_type=filetype) print(f"Time taken:{time.time() - t0}s") diff --git a/nemo_curator/classifiers/__init__.py b/nemo_curator/classifiers/__init__.py index 16275e45..b01b5894 100644 --- a/nemo_curator/classifiers/__init__.py +++ b/nemo_curator/classifiers/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -18,7 +18,11 @@ from .aegis import AegisClassifier, InstructionDataGuardClassifier from .content_type import ContentTypeClassifier from .domain import DomainClassifier, MultilingualDomainClassifier -from .fineweb_edu import FineWebEduClassifier +from .fineweb_edu import ( + FineWebEduClassifier, + FineWebMixtralEduClassifier, + FineWebNemotronEduClassifier, +) from .prompt_task_complexity import PromptTaskComplexityClassifier from .quality import QualityClassifier @@ -29,6 +33,8 @@ "AegisClassifier", "InstructionDataGuardClassifier", "FineWebEduClassifier", + "FineWebMixtralEduClassifier", + "FineWebNemotronEduClassifier", "ContentTypeClassifier", "PromptTaskComplexityClassifier", ] diff --git a/nemo_curator/classifiers/fineweb_edu.py b/nemo_curator/classifiers/fineweb_edu.py index 01799c7a..572c0d74 100644 --- a/nemo_curator/classifiers/fineweb_edu.py +++ b/nemo_curator/classifiers/fineweb_edu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ import torch from crossfit import op from crossfit.backend.torch.hf.model import HFModel -from transformers import AutoConfig, AutoModelForSequenceClassification +from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer from nemo_curator.classifiers.base import ( DistributedDataClassifier, @@ -27,6 +27,8 @@ from nemo_curator.datasets import DocumentDataset FINEWEB_EDU_IDENTIFIER = "HuggingFaceFW/fineweb-edu-classifier" +FINEWEB_MIXTRAL_IDENTIFIER = "nvidia/nemocurator-fineweb-mixtral-edu-classifier" +FINEWEB_NEMOTRON_IDENTIFIER = "nvidia/nemocurator-fineweb-nemotron-4-edu-classifier" class FinewebEduModel(HFModel): @@ -63,48 +65,44 @@ def custom_forward(*args, **kwargs): model.forward = custom_forward return model + def load_tokenizer(self): + return AutoTokenizer.from_pretrained(self.path_or_name) + def load_config(self): return AutoConfig.from_pretrained(self.path_or_name) -class FineWebEduClassifier(DistributedDataClassifier): +class _FineWebBaseClassifier(DistributedDataClassifier): """ - FineWebEduClassifier is a specialized classifier designed for educational content assessment, - utilizing the Hugging Face FineWeb EDU Classifier model (https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier). - This classifier is optimized for running on multi-node, multi-GPU setups to enable fast and efficient inference on large text datasets. - - Attributes: - batch_size (int): The number of samples per batch for inference. Defaults to 256. - text_field (str): The column name containing the text data to be classified. Defaults to "text". - pred_column (str): The column name where prediction scores will be stored. Defaults to "fineweb-edu-score". - int_column (str): The column name where integer-rounded prediction scores will be stored. Defaults to "fineweb-edu-score-int". - max_chars (int): The maximum number of characters in each document to consider for classification. If -1, the entire document is considered. Defaults to -1. - device_type (str): The type of device to use for inference, either "cuda" or "cpu". Defaults to "cuda". - autocast (bool): Whether to use mixed precision for faster inference. Defaults to True. - max_mem_gb (int, optional): The maximum amount of memory in GB to allocate for the model. If None, - it defaults to the available GPU memory minus 4 GB. 
- + Parent class for FineWebEduClassifier, FineWebMixtralEduClassifier, and FineWebNemotronEduClassifier, + since their implementations are almost identical. """ def __init__( self, - batch_size: int = 256, + fineweb_identifier: str, + pred_column: str, + int_column: str, + quality_label_column: Optional[str], + batch_size: int = 1024, text_field: str = "text", - pred_column: str = "fineweb-edu-score", - int_column="fineweb-edu-score-int", max_chars: int = -1, device_type: str = "cuda", autocast: bool = True, max_mem_gb: Optional[int] = None, ): + self.fineweb_identifier = fineweb_identifier + model = FinewebEduModel( - path_or_name=FINEWEB_EDU_IDENTIFIER, + path_or_name=fineweb_identifier, autocast=autocast, max_mem_gb=max_mem_gb, ) self.text_field = text_field self.int_column = int_column + self.quality_label_column = quality_label_column + super().__init__( model=model, filter_by=None, # No filtering as its a numeric score @@ -118,14 +116,20 @@ def __init__( ) def _run_classifier(self, dataset: DocumentDataset) -> DocumentDataset: - print("Starting Fineweb EDU classifier inference", flush=True) + if self.fineweb_identifier == FINEWEB_EDU_IDENTIFIER: + print("Starting FineWeb-Edu Classifier inference", flush=True) + elif self.fineweb_identifier == FINEWEB_MIXTRAL_IDENTIFIER: + print("Starting FineWeb Mixtral Edu Classifier inference", flush=True) + elif self.fineweb_identifier == FINEWEB_NEMOTRON_IDENTIFIER: + print("Starting FineWeb Nemotron-4 Edu Classifier inference", flush=True) + ddf = dataset.df pipe = op.Sequential( op.Tokenizer( self.model, cols=[self.text_field], - tokenizer_type="sentencepiece", + tokenizer_type="default", max_length=self.model.max_seq_length(), ), op.Predictor( @@ -137,6 +141,7 @@ def _run_classifier(self, dataset: DocumentDataset) -> DocumentDataset: keep_cols=ddf.columns.tolist(), ) ddf = pipe(ddf) + ddf[self.pred_column] = ddf[self.pred_column].where( ddf[self.pred_column] >= 0, 0 ) @@ -144,4 +149,150 @@ def _run_classifier(self, dataset: DocumentDataset) -> DocumentDataset: ddf[self.pred_column] <= 5, 5 ) ddf[self.int_column] = ddf[self.pred_column].round().astype(int) + + if self.quality_label_column is not None: + ddf[self.quality_label_column] = "high_quality" + # If the score is less than 2.5, label it as low quality + ddf[self.quality_label_column] = ddf[self.quality_label_column].mask( + ddf[self.pred_column] < 2.5, "low_quality" + ) + return DocumentDataset(ddf) + + +class FineWebEduClassifier(_FineWebBaseClassifier): + """ + FineWebEduClassifier is a specialized classifier designed for educational content assessment, + utilizing the Hugging Face FineWeb EDU Classifier model (https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier). + This classifier is optimized for running on multi-node, multi-GPU setups to enable fast and efficient inference on large text datasets. + + Attributes: + batch_size (int): The number of samples per batch for inference. Defaults to 256. + text_field (str): The column name containing the text data to be classified. Defaults to "text". + pred_column (str): The column name where prediction scores will be stored. Defaults to "fineweb-edu-score". + int_column (str): The column name where integer-rounded prediction scores will be stored. Defaults to "fineweb-edu-score-int". + max_chars (int): The maximum number of characters in each document to consider for classification. If -1, the entire document is considered. Defaults to -1. + device_type (str): The type of device to use for inference, either "cuda" or "cpu". 
Defaults to "cuda". + autocast (bool): Whether to use mixed precision for faster inference. Defaults to True. + max_mem_gb (int, optional): The maximum amount of memory in GB to allocate for the model. If None, + it defaults to the available GPU memory minus 4 GB. + + """ + + def __init__( + self, + batch_size: int = 256, + text_field: str = "text", + pred_column: str = "fineweb-edu-score", + int_column="fineweb-edu-score-int", + max_chars: int = -1, + device_type: str = "cuda", + autocast: bool = True, + max_mem_gb: Optional[int] = None, + ): + super().__init__( + fineweb_identifier=FINEWEB_EDU_IDENTIFIER, + batch_size=batch_size, + text_field=text_field, + pred_column=pred_column, + int_column=int_column, + quality_label_column=None, + max_chars=max_chars, + device_type=device_type, + autocast=autocast, + max_mem_gb=max_mem_gb, + ) + + +class FineWebMixtralEduClassifier(_FineWebBaseClassifier): + """ + FineWebMixtralEduClassifier is a specialized classifier designed for educational content assessment, + utilizing the NemoCurator FineWeb Mixtral Edu Classifier model (https://huggingface.co/nvidia/nemocurator-fineweb-mixtral-edu-classifier). + It is similar to the FineWeb-Edu classifier and was trained on the same text samples, but using annotations from Mixtral 8x22B-Instruct. + This classifier is optimized for running on multi-node, multi-GPU setups to enable fast and efficient inference on large text datasets. + + Attributes: + batch_size (int): The number of samples per batch for inference. Defaults to 256. + text_field (str): The column name containing the text data to be classified. Defaults to "text". + pred_column (str): The column name where prediction scores will be stored. Defaults to "fineweb-mixtral-edu-score". + int_column (str): The column name where integer-rounded prediction scores will be stored. Defaults to "fineweb-mixtral-edu-score-int". + quality_label_column (str): The column name where a score of >= 2.5 is labeled "high_quality" and otherwise labeled "low_quality". Defaults to "fineweb-mixtral-edu-score-label". + max_chars (int): The maximum number of characters in each document to consider for classification. If -1, the entire document is considered. Defaults to -1. + device_type (str): The type of device to use for inference, either "cuda" or "cpu". Defaults to "cuda". + autocast (bool): Whether to use mixed precision for faster inference. Defaults to True. + max_mem_gb (int, optional): The maximum amount of memory in GB to allocate for the model. If None, + it defaults to the available GPU memory minus 4 GB. 
+ + """ + + def __init__( + self, + batch_size: int = 1024, + text_field: str = "text", + pred_column: str = "fineweb-mixtral-edu-score", + int_column: str = "fineweb-mixtral-edu-score-int", + quality_label_column: str = "fineweb-mixtral-edu-score-label", + max_chars: int = -1, + device_type: str = "cuda", + autocast: bool = True, + max_mem_gb: Optional[int] = None, + ): + super().__init__( + fineweb_identifier=FINEWEB_MIXTRAL_IDENTIFIER, + batch_size=batch_size, + text_field=text_field, + pred_column=pred_column, + int_column=int_column, + quality_label_column=quality_label_column, + max_chars=max_chars, + device_type=device_type, + autocast=autocast, + max_mem_gb=max_mem_gb, + ) + + +class FineWebNemotronEduClassifier(_FineWebBaseClassifier): + """ + FineWebNemotronEduClassifier is a specialized classifier designed for educational content assessment, + utilizing the NemoCurator FineWeb Nemotron-4 Edu Classifier model (https://huggingface.co/nvidia/nemocurator-fineweb-nemotron-4-edu-classifier). + It is similar to the FineWeb-Edu classifier and was trained on the same text samples, but using annotations from Nemotron-4-340B-Instruct. + This classifier is optimized for running on multi-node, multi-GPU setups to enable fast and efficient inference on large text datasets. + + Attributes: + batch_size (int): The number of samples per batch for inference. Defaults to 256. + text_field (str): The column name containing the text data to be classified. Defaults to "text". + pred_column (str): The column name where prediction scores will be stored. Defaults to "fineweb-nemotron-edu-score". + int_column (str): The column name where integer-rounded prediction scores will be stored. Defaults to "fineweb-nemotron-edu-score-int". + quality_label_column (str): The column name where a score of >= 2.5 is labeled "high_quality" and otherwise labeled "low_quality". Defaults to "fineweb-nemotron-edu-score-label". + max_chars (int): The maximum number of characters in each document to consider for classification. If -1, the entire document is considered. Defaults to -1. + device_type (str): The type of device to use for inference, either "cuda" or "cpu". Defaults to "cuda". + autocast (bool): Whether to use mixed precision for faster inference. Defaults to True. + max_mem_gb (int, optional): The maximum amount of memory in GB to allocate for the model. If None, + it defaults to the available GPU memory minus 4 GB. 
+ + """ + + def __init__( + self, + batch_size: int = 1024, + text_field: str = "text", + pred_column: str = "fineweb-nemotron-edu-score", + int_column: str = "fineweb-nemotron-edu-score-int", + quality_label_column: str = "fineweb-nemotron-edu-score-label", + max_chars: int = -1, + device_type: str = "cuda", + autocast: bool = True, + max_mem_gb: Optional[int] = None, + ): + super().__init__( + fineweb_identifier=FINEWEB_NEMOTRON_IDENTIFIER, + batch_size=batch_size, + text_field=text_field, + pred_column=pred_column, + int_column=int_column, + quality_label_column=quality_label_column, + max_chars=max_chars, + device_type=device_type, + autocast=autocast, + max_mem_gb=max_mem_gb, + ) diff --git a/nemo_curator/filters/__init__.py b/nemo_curator/filters/__init__.py index 9905c837..cda290fb 100644 --- a/nemo_curator/filters/__init__.py +++ b/nemo_curator/filters/__init__.py @@ -49,7 +49,9 @@ RepeatedParagraphsFilter, RepeatingDuplicateNGramsFilter, RepeatingTopNGramsFilter, + SubstringFilter, SymbolsToWordsFilter, + TokenCountFilter, UrlsFilter, WhiteSpaceFilter, WordCountFilter, @@ -98,4 +100,6 @@ "QualityEstimationFilter", "AnswerabilityFilter", "EasinessFilter", + "TokenCountFilter", + "SubstringFilter", ] diff --git a/nemo_curator/filters/heuristic_filter.py b/nemo_curator/filters/heuristic_filter.py index c17e4e9a..26617bd6 100644 --- a/nemo_curator/filters/heuristic_filter.py +++ b/nemo_curator/filters/heuristic_filter.py @@ -14,9 +14,11 @@ import os.path import tarfile +from typing import Literal import requests from platformdirs import user_cache_dir +from transformers import AutoTokenizer from nemo_curator.filters.bitext_filter import BitextFilter from nemo_curator.filters.doc_filter import DocumentFilter, import_filter @@ -671,6 +673,66 @@ def keep_document(self, score): return score != 1 +class TokenCountFilter(DocumentFilter): + """ + If the document contains more or less than a specified number of tokens, then discard. + """ + + def __init__(self, tokenizer: AutoTokenizer, min_tokens=0, max_tokens=float("inf")): + """ + Args: + tokenizer (AutoTokenizer): The tokenizer to use to count the tokens. + min_tokens (int): The minimum number of tokens the document must contain. + Set to 0 to disable the minimum token count filter. + max_tokens (int): The maximum number of tokens the document can contain. + Set to infinity to disable the maximum token count filter. + """ + super().__init__() + self._tokenizer = tokenizer + self._min_tokens = min_tokens + self._max_tokens = max_tokens + self._name = "token_count" + + def score_document(self, text: str) -> int: + tokens = self._tokenizer.encode(text) + return len(tokens) + + def keep_document(self, score: int) -> bool: + return self._min_tokens <= score <= self._max_tokens + + +class SubstringFilter(DocumentFilter): + """ + Keeps documents that contain a substring in a given position. + Gives a score of 1 if the substring is found in the given position, otherwise 0. + """ + + def __init__(self, substring: str, position: Literal["prefix", "suffix", "any"]): + """ + Args: + substring (str): The substring to check for. + position (Literal["prefix", "suffix", "any"]): The position of the substring. + """ + super().__init__() + self._substring = substring + if position not in ["prefix", "suffix", "any"]: + raise ValueError( + f"Invalid position: {position}. Must be one of: prefix, suffix, any." 
+ ) + self._position = position + + def score_document(self, text: str) -> int: + if self._position == "prefix": + return int(text.startswith(self._substring)) + elif self._position == "suffix": + return int(text.endswith(self._substring)) + elif self._position == "any": + return int(self._substring in text) + + def keep_document(self, score: int) -> bool: + return score == 1 + + class HistogramFilter(DocumentFilter): """Histogram filter used by the NLLB paper (https://arxiv.org/pdf/2207.04672). See p30 for details. diff --git a/nemo_curator/modifiers/__init__.py b/nemo_curator/modifiers/__init__.py index e4b9a62a..3be9bedf 100644 --- a/nemo_curator/modifiers/__init__.py +++ b/nemo_curator/modifiers/__init__.py @@ -15,8 +15,12 @@ from .c4 import BoilerPlateStringModifier from .doc_modifier import DocumentModifier from .fasttext import FastTextLabelModifier +from .line_remover import LineRemover +from .markdown_remover import MarkdownRemover from .newline_normalizer import NewlineNormalizer from .pii_modifier import PiiModifier +from .quotation_remover import QuotationRemover +from .slicer import Slicer from .unicode_reformatter import UnicodeReformatter from .url_remover import UrlRemover @@ -25,7 +29,11 @@ "BoilerPlateStringModifier", "FastTextLabelModifier", "UnicodeReformatter", + "QuotationRemover", + "LineRemover", + "MarkdownRemover", "PiiModifier", "NewlineNormalizer", "UrlRemover", + "Slicer", ] diff --git a/nemo_curator/modifiers/line_remover.py b/nemo_curator/modifiers/line_remover.py new file mode 100644 index 00000000..eab763ad --- /dev/null +++ b/nemo_curator/modifiers/line_remover.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +from nemo_curator.modifiers import DocumentModifier + + +class LineRemover(DocumentModifier): + """ + Removes lines from a document if the content of the line matches a given string. + """ + + def __init__(self, patterns: List[str]): + """ + Args: + patterns (List[str]): The patterns to check + """ + super().__init__() + self._patterns = patterns + + def modify_document(self, text: str) -> str: + lines = text.split("\n") + new_lines = [line for line in lines if line not in self._patterns] + return "\n".join(new_lines) diff --git a/nemo_curator/modifiers/markdown_remover.py b/nemo_curator/modifiers/markdown_remover.py new file mode 100644 index 00000000..be060fd4 --- /dev/null +++ b/nemo_curator/modifiers/markdown_remover.py @@ -0,0 +1,43 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +from nemo_curator.modifiers import DocumentModifier + +MARKDOWN_BOLD_REGEX = r"\*\*(.*?)\*\*" +MARKDOWN_ITALIC_REGEX = r"\*(.*?)\*" +MARKDOWN_UNDERLINE_REGEX = r"_(.*?)_" +MARKDOWN_LINK_REGEX = r"\[.*?\]\((.*?)\)" + + +class MarkdownRemover(DocumentModifier): + """ + Removes Markdown formatting in a document including bold, italic, underline, and URL text. + """ + + def __init__(self): + super().__init__() + + def modify_document(self, text: str) -> str: + lines = text.split("\n") + new_lines = [] + for line in lines: + line = re.sub(MARKDOWN_BOLD_REGEX, r"\1", line) # **text** + line = re.sub(MARKDOWN_ITALIC_REGEX, r"\1", line) # *text* + line = re.sub(MARKDOWN_UNDERLINE_REGEX, r"\1", line) # _text_ + line = re.sub(MARKDOWN_LINK_REGEX, r"\1", line) # [text](url) + new_lines.append(line) + + return "\n".join(new_lines) diff --git a/nemo_curator/modifiers/quotation_remover.py b/nemo_curator/modifiers/quotation_remover.py new file mode 100644 index 00000000..02f5bda1 --- /dev/null +++ b/nemo_curator/modifiers/quotation_remover.py @@ -0,0 +1,38 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_curator.modifiers import DocumentModifier + + +class QuotationRemover(DocumentModifier): + """ + Removes quotations from a document following a few rules: + - If the document is less than 2 characters, it is returned unchanged. + - If the document starts and ends with a quotation mark and there are + no newlines in the document, the quotation marks are removed. + - If the document starts and ends with a quotation mark and there are + newlines in the document, the quotation marks are removed only if + the first line does not end with a quotation mark. + """ + + def __init__(self): + super().__init__() + + def modify_document(self, text: str) -> str: + if len(text.strip()) > 2 and text[0] == '"' and text[-1] == '"': + if "\n" not in text.strip(): + text = text[1:-1] + elif text.split("\n")[0][-1] != '"': + text = text[1:-1] + return text diff --git a/nemo_curator/modifiers/slicer.py b/nemo_curator/modifiers/slicer.py new file mode 100644 index 00000000..d267b831 --- /dev/null +++ b/nemo_curator/modifiers/slicer.py @@ -0,0 +1,86 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
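As a usage sketch for the modifiers introduced above, they are intended to be wrapped in the existing `Modify` module; the input path and the removal patterns below are illustrative assumptions, not part of the changeset.

```python
from nemo_curator import Modify, Sequential
from nemo_curator.datasets import DocumentDataset
from nemo_curator.modifiers import LineRemover, MarkdownRemover, QuotationRemover

# Hypothetical input; any JSONL corpus with a "text" column works.
dataset = DocumentDataset.read_json("books.jsonl")

cleanup = Sequential(
    [
        Modify(QuotationRemover()),  # strip wrapping double quotes
        Modify(MarkdownRemover()),   # unwrap **bold**, *italic*, _underline_, [text](url) markup
        # Remove lines whose entire content matches one of these strings.
        Modify(LineRemover(patterns=["Copyright", "All rights reserved."])),
    ]
)

cleaned = cleanup(dataset)
cleaned.to_json("books_cleaned")
```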
+from typing import Optional, Union
+
+from nemo_curator.modifiers import DocumentModifier
+
+
+class Slicer(DocumentModifier):
+    """
+    Slices a document based on indices or strings.
+    """
+
+    def __init__(
+        self,
+        left: Optional[Union[int, str]] = 0,
+        right: Optional[Union[int, str]] = None,
+        include_left: bool = True,
+        include_right: bool = True,
+        strip: bool = True,
+    ):
+        """
+        Args:
+            left (Union[int, str], optional): If the provided value is an int, slice the string from this index (inclusive).
+                If the provided value is a str, slice the string from the first occurrence of this substring.
+            right (Union[int, str], optional): If the provided value is an int, slice the string to this index (exclusive).
+                If the provided value is a str, slice the string to the last occurrence of this substring. If None,
+                right is set to the length of the string.
+            include_left (bool): Only used if `left` is a string. If True, the value of `left` is included in the
+                slicing result. Defaults to True.
+            include_right (bool): Only used if `right` is a string. If True, the value of `right` is included in the
+                slicing result. Defaults to True.
+            strip (bool): If True, strip the resulting string.
+        """
+        super().__init__()
+        self._left = left
+        self._right = right
+        self._include_left = include_left
+        self._include_right = include_right
+        self._strip = strip
+
+    def modify_document(self, text: str) -> str:
+        # Determine start index based on left type
+        if isinstance(self._left, int):
+            left_index = self._left
+        elif isinstance(self._left, str):
+            left_index_found = text.find(self._left)
+            if left_index_found == -1:
+                return ""
+            left_index = (
+                left_index_found
+                if self._include_left
+                else left_index_found + len(self._left)
+            )
+        else:
+            left_index = 0  # default if neither int nor str
+
+        # Determine end index based on right type
+        if isinstance(self._right, int):
+            right_index = self._right
+        elif isinstance(self._right, str):
+            right_index_found = text.rfind(self._right)
+            if right_index_found == -1:
+                return ""
+            right_index = (
+                right_index_found + len(self._right)
+                if self._include_right
+                else right_index_found
+            )
+        else:
+            right_index = len(text)  # default if neither int nor str
+
+        result = text[left_index:right_index]
+        if self._strip:
+            result = result.strip()
+        return result
diff --git a/nemo_curator/modules/__init__.py b/nemo_curator/modules/__init__.py
index 54e0377b..6273c88e 100644
--- a/nemo_curator/modules/__init__.py
+++ b/nemo_curator/modules/__init__.py
@@ -28,6 +28,8 @@
 from .exact_dedup import ExactDuplicates
 from .meta import Sequential
 from .modify import Modify
+from .splitter import DocumentSplitter
+from .joiner import DocumentJoiner
 from .task import TaskDecontamination
 from .to_backend import ToBackend
@@ -92,4 +94,6 @@
     "SemDedup",
     "BaseModule",
     "ToBackend",
+    "DocumentSplitter",
+    "DocumentJoiner",
 ]
diff --git a/nemo_curator/modules/add_id.py b/nemo_curator/modules/add_id.py
index 08c627f5..f8827176 100644
--- a/nemo_curator/modules/add_id.py
+++ b/nemo_curator/modules/add_id.py
@@ -27,7 +27,7 @@ class AddId(BaseModule):
     def __init__(
         self, id_field, id_prefix: str = "doc_id", start_index: Optional[int] = None
     ) -> None:
-        super().__init__(input_backend="pandas")
+        super().__init__(input_backend="any")
         self.id_field = id_field
         self.id_prefix = id_prefix
         self.start_index = start_index
diff --git a/nemo_curator/modules/config.py b/nemo_curator/modules/config.py
index 50c71017..67bf06af 100644
--- a/nemo_curator/modules/config.py
+++ b/nemo_curator/modules/config.py
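A brief sketch of how the `Slicer` above might be used; the marker strings and the file path are illustrative assumptions.

```python
from nemo_curator import Modify
from nemo_curator.datasets import DocumentDataset
from nemo_curator.modifiers import Slicer

dataset = DocumentDataset.read_json("articles.jsonl")  # hypothetical path

# Keep only the text between the assumed <BODY> ... </BODY> markers,
# excluding the markers themselves; documents without both markers become empty strings.
slice_body = Modify(
    Slicer(left="<BODY>", right="</BODY>", include_left=False, include_right=False)
)
dataset = slice_body(dataset)
```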
@@ -44,6 +44,8 @@ class FuzzyDuplicatesConfig(BaseConfig): but might lead to memory pressures and related errors. id_field: Column in the Dataset denoting document ID. text_field: Column in the Dataset denoting document content. + perform_removal: Boolean value to specify whether calling the module should remove the duplicates from + the original dataset, or return the list of IDs denoting duplicates. profile_dir: str, Default None If specified directory to write dask profile cache_dir: str, Default None @@ -64,6 +66,7 @@ class FuzzyDuplicatesConfig(BaseConfig): profile_dir: Optional[str] = None id_field: str = "id" text_field: str = "text" + perform_removal: bool = False # Minhash + LSH Config seed: int = 42 @@ -131,6 +134,11 @@ def __post_init__(self): if not 1 <= self.buckets_per_shuffle <= self.num_buckets: raise ValueError("Buckets per shuffle must be between [1, num_buckets]") + if not self.perform_removal: + warnings.warn( + "In future releases (starting with 0.8.0) the default will be True." + ) + @dataclass class SemDedupConfig(BaseConfig): diff --git a/nemo_curator/modules/exact_dedup.py b/nemo_curator/modules/exact_dedup.py index b98051ca..fc50b6a0 100644 --- a/nemo_curator/modules/exact_dedup.py +++ b/nemo_curator/modules/exact_dedup.py @@ -18,7 +18,6 @@ import time import warnings from contextlib import nullcontext -from datetime import datetime from hashlib import md5 from typing import Optional, Union @@ -31,6 +30,7 @@ from nemo_curator.log import create_logger from nemo_curator.modules.base import BaseModule from nemo_curator.utils.distributed_utils import performance_report_if_with_ts_suffix +from nemo_curator.utils.duplicates_removal import remove_duplicates from nemo_curator.utils.gpu_utils import is_cudf_type @@ -45,6 +45,7 @@ def __init__( id_field: str = "id", text_field: str = "text", hash_method: str = "md5", + perform_removal: bool = False, profile_dir: Optional[str] = None, cache_dir: Optional[str] = None, ): @@ -66,9 +67,17 @@ def __init__( raise ValueError( f"{hash_method} not in supported hash_methods. Choose a hash_method from {self.SUPPORTED_HASHES}" ) + self.hash_method = hash_method self.id_field = id_field self.text_field = text_field + self.perform_removal = perform_removal + if not self.perform_removal: + warnings.warn( + "In future releases (starting with 0.8.0) the default will be True." 
+ ) + if self.perform_removal and cache_dir is None: + warnings.warn("cache_dir is recommended to remove duplicates.") if cache_dir is None and profile_dir is not None: warnings.warn( "cache_dir for intermediate outputs is required to generate profiles" @@ -137,7 +146,7 @@ def hash_documents( # TODO: Generalize ty using self.hash_method return df.apply(lambda x: md5(x.encode()).hexdigest()) - def call(self, dataset: DocumentDataset) -> Union[DocumentDataset, str]: + def identify_duplicates(self, dataset: DocumentDataset) -> DocumentDataset: """ Find document ID's for exact duplicates in a given DocumentDataset Parameters @@ -168,10 +177,38 @@ def call(self, dataset: DocumentDataset) -> Union[DocumentDataset, str]: self._logger.info( f"Time taken for Exact Dedup Computation = {time.time() - t0}s and output written at {write_path}" ) - if is_cudf_type(result): - import dask_cudf + backend = "cudf" if is_cudf_type(result) else "pandas" + return DocumentDataset.read_parquet( + write_path, + backend=backend, + # We read with files_per_partition=1 so that groups are read in whole (and do not exist across partitions) + files_per_partition=1, + blocksize=None, + ) - result_dataset = dask_cudf.read_parquet(write_path, split_row_groups=False) - else: - result_dataset = dd.read_parquet(write_path) - return DocumentDataset(result_dataset) + def remove( + self, dataset: DocumentDataset, duplicates_to_remove: Optional[DocumentDataset] + ) -> DocumentDataset: + """ + Remove exact duplicates from a given DocumentDataset + Parameters + ---------- + dataset: DocumentDataset + The input datset to remove exact duplicates + Returns + ------- + DocumentDataset containing only non-duplicate documents + """ + result = remove_duplicates( + left=dataset.df, + duplicates=duplicates_to_remove.df, + id_field=self.id_field, + group_field="_hashes", + ) + return DocumentDataset(result) + + def call(self, dataset: DocumentDataset) -> DocumentDataset: + duplicates = self.identify_duplicates(dataset) + if self.perform_removal: + return self.remove(dataset, duplicates) + return duplicates diff --git a/nemo_curator/modules/fuzzy_dedup/fuzzyduplicates.py b/nemo_curator/modules/fuzzy_dedup/fuzzyduplicates.py index 6125f664..00e5482f 100644 --- a/nemo_curator/modules/fuzzy_dedup/fuzzyduplicates.py +++ b/nemo_curator/modules/fuzzy_dedup/fuzzyduplicates.py @@ -17,9 +17,7 @@ import logging import os import time -from typing import Union - -import dask_cudf +from typing import Optional, Union from nemo_curator.datasets import DocumentDataset from nemo_curator.log import create_logger @@ -34,6 +32,7 @@ from nemo_curator.modules.fuzzy_dedup.minhash import MinHash from nemo_curator.modules.meta import Sequential from nemo_curator.utils.distributed_utils import performance_report_if_with_ts_suffix +from nemo_curator.utils.duplicates_removal import remove_duplicates class FuzzyDuplicates(BaseModule): @@ -65,6 +64,7 @@ def __init__( self._logger = logger self.config = config + self.minhash = MinHash( seed=self.config.seed, num_hashes=self.config.num_hashes, @@ -131,7 +131,9 @@ def __init__( profile_dir=self.config.profile_dir, ) - def call(self, dataset: DocumentDataset): + def identify_duplicates( + self, dataset: DocumentDataset + ) -> Optional[DocumentDataset]: """ Parameters ---------- @@ -245,4 +247,41 @@ def call(self, dataset: DocumentDataset): print(f"Stage {stage_num}: Connected Components across buckets complete!") stage_num += 1 - return DocumentDataset(dask_cudf.read_parquet(cc_path, split_row_groups=False)) + return 
DocumentDataset.read_parquet(
+            cc_path,
+            backend="cudf",
+            # We read with files_per_partition=1 so that groups are read in whole (and do not exist across partitions)
+            files_per_partition=1,
+            blocksize=None,
+        )
+
+    def remove(
+        self, dataset: DocumentDataset, duplicates_to_remove: Optional[DocumentDataset]
+    ) -> Optional[DocumentDataset]:
+        """
+        Remove fuzzy duplicates from a given DocumentDataset
+        Parameters
+        ----------
+        dataset: DocumentDataset
+          The input dataset from which to remove fuzzy duplicates
+        Returns
+        -------
+        DocumentDataset containing only non-duplicate documents
+        """
+        if not duplicates_to_remove:
+            return None
+        result = remove_duplicates(
+            left=dataset.df,
+            duplicates=duplicates_to_remove.df,
+            id_field=self.config.id_field,
+            group_field="group",
+        )
+        return DocumentDataset(result)
+
+    def call(
+        self, dataset: DocumentDataset, perform_removal: bool = False
+    ) -> DocumentDataset:
+        duplicates = self.identify_duplicates(dataset)
+        if perform_removal:
+            return self.remove(dataset, duplicates)
+        return duplicates
diff --git a/nemo_curator/modules/joiner.py b/nemo_curator/modules/joiner.py
new file mode 100644
index 00000000..2ecdfc80
--- /dev/null
+++ b/nemo_curator/modules/joiner.py
@@ -0,0 +1,168 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Optional
+
+import pandas as pd
+
+from nemo_curator.datasets import DocumentDataset
+from nemo_curator.modules.base import BaseModule
+
+
+class DocumentJoiner(BaseModule):
+    """
+    Joins documents that have a common id back into a single document.
+    The order of the documents is dictated by an additional segment_id column.
+    A maximum length can be specified to limit the size of the joined documents.
+
+    The joined documents are joined by a separator.
+    """
+
+    def __init__(
+        self,
+        separator: str,
+        text_field: str = "text",
+        segment_id_field: str = "segment_id",
+        document_id_field: str = "id",
+        drop_segment_id_field: bool = True,
+        max_length: Optional[int] = None,
+        length_field: Optional[str] = None,
+    ):
+        """
+        Args:
+            separator (str): The separator to join the documents on.
+            text_field (str): The name of the column containing the text to join.
+            segment_id_field (str): The name of the column containing the segment id.
+            document_id_field (str): The name of the column containing the document id.
+            drop_segment_id_field (bool): Whether to drop the segment_id_field after joining.
+            max_length (int, optional): The maximum length of the joined documents.
+                Both max_length and length_field must be specified or neither can be specified.
+            length_field (str, optional): The name of the column containing the length of the documents.
+                Both max_length and length_field must be specified or neither can be specified.
+ """ + if max_length is not None and length_field is None: + raise ValueError("max_length is specified but length_field is not") + if max_length is None and length_field is not None: + raise ValueError("length_field is specified but max_length is not") + + super().__init__(input_backend="pandas") + self.separator = separator + self.text_field = text_field + self.segment_id_field = segment_id_field + self.document_id_field = document_id_field + self.drop_segment_id_field = drop_segment_id_field + self.max_length = max_length + self.length_field = length_field + + def _join_segments(self, group): + # Ensure segments are processed in order. + group = group.sort_values(self.segment_id_field) + joined_rows = [] + current_seg_id = 0 + accumulator_text = None + accumulator_length = 0 + accumulator_row = None + + for _, row in group.iterrows(): + if accumulator_row is None: + # Start a new accumulation with the first segment. + accumulator_text = row[self.text_field] + accumulator_length = row[self.length_field] + accumulator_row = row + else: + # Calculate what the new length would be if we joined this segment. + proposed_length = accumulator_length + row[self.length_field] + 1 + if proposed_length <= self.max_length: + accumulator_text = ( + accumulator_text + self.separator + row[self.text_field] + ) + accumulator_length = proposed_length + else: + # Commit the current accumulation as one joined segment. + new_row = accumulator_row.copy() + new_row[self.text_field] = accumulator_text + new_row[self.length_field] = accumulator_length + new_row[self.segment_id_field] = current_seg_id + joined_rows.append(new_row) + current_seg_id += 1 + # Start a new accumulation with the current row. + accumulator_text = row[self.text_field] + accumulator_length = row[self.length_field] + accumulator_row = row + + # Commit the last accumulated segment. + if accumulator_row is not None: + new_row = accumulator_row.copy() + new_row[self.text_field] = accumulator_text + new_row[self.length_field] = accumulator_length + new_row[self.segment_id_field] = current_seg_id + joined_rows.append(new_row) + if joined_rows: + return pd.concat( + [group.iloc[0:0], pd.DataFrame(joined_rows)], ignore_index=True + ) + else: + return group.iloc[0:0] + + def _join_partition( + self, df: pd.DataFrame, expected_cols: List[str] + ) -> pd.DataFrame: + if df.empty: + return df + + if self.max_length is None: + # Sort the segments by the segment_id_field to maintain proper order before aggregating. + df_sorted = df.sort_values(self.segment_id_field) + # Build aggregation functions to preserve all original columns: + # - For self.text_field, join all segments using the separator. + # - For all other columns (except self.document_id_field, which is our grouping key), take the first occurrence. + agg_funcs = {} + for col in df_sorted.columns: + if col == self.text_field: + agg_funcs[col] = lambda texts: self.separator.join( + texts.astype(str) + ) + elif col != self.document_id_field: + agg_funcs[col] = "first" + # Group by document_id_field while keeping the key as a column. + joined = df_sorted.groupby(self.document_id_field, as_index=False).agg( + agg_funcs + ) + else: + joined = df.groupby(self.document_id_field, group_keys=False).apply( + self._join_segments + ) + + if self.drop_segment_id_field: + joined = joined.drop(columns=self.segment_id_field) + # Reorder the columns to match the expected metadata order. 
+ joined = joined[expected_cols] + return joined + + def call(self, dataset: DocumentDataset) -> DocumentDataset: + """ + Joins the documents back into a single document while preserving all the original fields. + """ + # Construct meta information for the transformed dataframe. + meta = dataset.df._meta.copy() + if self.text_field not in meta.columns: + meta[self.text_field] = pd.Series(dtype="object") + # If dropping the segment id field, remove it from the metadata to prevent mismatches. + if self.drop_segment_id_field: + meta = meta.drop(columns=self.segment_id_field) + expected_cols = list(meta.columns) + # Apply the join operation partition-wise. + dataset.df = dataset.df.map_partitions( + self._join_partition, expected_cols=expected_cols, meta=meta + ) + return dataset diff --git a/nemo_curator/modules/splitter.py b/nemo_curator/modules/splitter.py new file mode 100644 index 00000000..623b4d4a --- /dev/null +++ b/nemo_curator/modules/splitter.py @@ -0,0 +1,80 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Union + +import pandas as pd + +from nemo_curator.datasets import DocumentDataset +from nemo_curator.modules.base import BaseModule +from nemo_curator.utils.import_utils import gpu_only_import + +cudf = gpu_only_import("cudf") + + +class DocumentSplitter(BaseModule): + """ + Splits documents into segments based on a separator. + Each segment is a new document with an additional column + indicating the segment id. + + To restore the original document, ensure that each document + has a unique id prior to splitting. + """ + + def __init__( + self, + separator: str, + text_field: str = "text", + segment_id_field: str = "segment_id", + ): + """ + Args: + separator (str): The separator to split the documents on. + text_field (str): The name of the column containing the text to split. + segment_id_field (str): The name of the column to add to indicate the segment id. + """ + super().__init__(input_backend="any") + self.separator = separator + self.text_field = text_field + self.segment_id_field = segment_id_field + + def _split_partition( + self, df: Union[pd.DataFrame, "cudf.DataFrame"] + ) -> Union[pd.DataFrame, "cudf.DataFrame"]: + # Split the text field into segments using the separator. + df["split_text"] = df[self.text_field].str.split(self.separator) + # Explode the list so that each segment becomes a separate row. + df = df.explode("split_text") + # For each original document (grouped by the original index), assign a segment id. + df[self.segment_id_field] = df.groupby(level=0).cumcount() + # Replace the original text field with the split segment. + df[self.text_field] = df["split_text"] + # Drop the temporary column. + df = df.drop(columns="split_text") + return df + + def call(self, dataset: DocumentDataset) -> DocumentDataset: + """ + Splits the documents into segments based on the separator and + adds a column indicating the segment id. 
+ """ + + # Construct meta information for the transformed dataframe. + meta = dataset.df._meta.copy() + if self.segment_id_field not in meta.columns: + meta[self.segment_id_field] = pd.Series(dtype="int64") + + # Apply the partition-wise splitting transformation using Dask's map_partitions. + dataset.df = dataset.df.map_partitions(self._split_partition, meta=meta) + return dataset diff --git a/nemo_curator/package_info.py b/nemo_curator/package_info.py index 26f99e5f..3ebbf9de 100644 --- a/nemo_curator/package_info.py +++ b/nemo_curator/package_info.py @@ -14,9 +14,9 @@ MAJOR = 0 -MINOR = 7 +MINOR = 8 PATCH = 0 -PRE_RELEASE = "rc1" +PRE_RELEASE = "rc0" DEV = "dev0" # Use the following formatting: (major, minor, patch, pre-release) diff --git a/nemo_curator/scripts/classifiers/README.md b/nemo_curator/scripts/classifiers/README.md index 59197474..fd30e6d3 100644 --- a/nemo_curator/scripts/classifiers/README.md +++ b/nemo_curator/scripts/classifiers/README.md @@ -8,6 +8,8 @@ The Python scripts in this directory demonstrate how to run classification on yo - AEGIS Safety Models - Instruction Data Guard Model - FineWeb Educational Content Classifier +- FineWeb Mixtral Educational Classifier +- FineWeb Nemotron-4 Educational Classifier - Content Type Classifier - Prompt Task and Complexity Classifier @@ -139,6 +141,44 @@ fineweb_edu_classifier_inference \ Additional arguments may be added for customizing a Dask cluster and client. Run `fineweb_edu_classifier_inference --help` for more information. +#### FineWeb Mixtral Edu Classifier Inference + +```bash +# same as `python fineweb_mixtral_edu_classifier_inference.py` +fineweb_mixtral_edu_classifier_inference \ + --input-data-dir /path/to/data/directory \ + --output-data-dir /path/to/output/directory \ + --input-file-type "jsonl" \ + --input-file-extension "jsonl" \ + --output-file-type "jsonl" \ + --input-text-field "text" \ + --batch-size 64 \ + --autocast \ + --max-chars 2000 \ + --device "gpu" +``` + +Additional arguments may be added for customizing a Dask cluster and client. Run `fineweb_mixtral_edu_classifier_inference --help` for more information. + +#### FineWeb Nemotron-4 Edu Classifier Inference + +```bash +# same as `python fineweb_nemotron_edu_classifier_inference.py` +fineweb_nemotron_edu_classifier_inference \ + --input-data-dir /path/to/data/directory \ + --output-data-dir /path/to/output/directory \ + --input-file-type "jsonl" \ + --input-file-extension "jsonl" \ + --output-file-type "jsonl" \ + --input-text-field "text" \ + --batch-size 64 \ + --autocast \ + --max-chars 2000 \ + --device "gpu" +``` + +Additional arguments may be added for customizing a Dask cluster and client. Run `fineweb_nemotron_edu_classifier_inference --help` for more information. + #### Content Type Classifier DeBERTa Inference ```bash diff --git a/nemo_curator/scripts/classifiers/fineweb_mixtral_edu_classifier_inference.py b/nemo_curator/scripts/classifiers/fineweb_mixtral_edu_classifier_inference.py new file mode 100644 index 00000000..582ec4c5 --- /dev/null +++ b/nemo_curator/scripts/classifiers/fineweb_mixtral_edu_classifier_inference.py @@ -0,0 +1,113 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import warnings + +os.environ["RAPIDS_NO_INITIALIZE"] = "1" + +from nemo_curator.classifiers import FineWebMixtralEduClassifier +from nemo_curator.datasets import DocumentDataset + +# Get relevant args +from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk +from nemo_curator.utils.file_utils import get_remaining_files +from nemo_curator.utils.script_utils import ArgumentHelper + +warnings.filterwarnings("ignore") + + +def main(): + args = ArgumentHelper.parse_distributed_classifier_args( + description="Run FineWeb Mixtral Edu Classifier inference." + ).parse_args() + print(f"Arguments parsed = {args}", flush=True) + + client_args = ArgumentHelper.parse_client_args(args) + client_args["cluster_type"] = "gpu" + client = get_client(**client_args) + print("Starting FineWeb Mixtral Edu Classifier inference", flush=True) + global_st = time.time() + files_per_run = len(client.scheduler_info()["workers"]) * 2 + + if not os.path.exists(args.output_data_dir): + os.makedirs(args.output_data_dir) + + # Some times jsonl files are stored as .json + # So to handle that case we can pass the input_file_extension + if args.input_file_extension is not None: + input_file_extension = args.input_file_extension + else: + input_file_extension = args.input_file_type + + input_files = get_remaining_files( + args.input_data_dir, args.output_data_dir, input_file_extension + ) + print(f"Total input files {len(input_files)}", flush=True) + + if args.input_file_type == "pickle": + add_filename = False + else: + add_filename = True + + fineweb_mixtral_edu_classifier = FineWebMixtralEduClassifier( + text_field=args.input_text_field, + batch_size=args.batch_size, + autocast=args.autocast, + max_chars=args.max_chars, + max_mem_gb=args.max_mem_gb_classifier, + ) + + for file_batch_id, i in enumerate(range(0, len(input_files), files_per_run)): + batch_st = time.time() + current_batch_files = input_files[i : i + files_per_run] + print( + f"File Batch ID {file_batch_id}: total input files {len(current_batch_files)}", + flush=True, + ) + df = read_data( + input_files=current_batch_files, + file_type=args.input_file_type, + add_filename=add_filename, + ) + df = fineweb_mixtral_edu_classifier(DocumentDataset(df)).df + print(f"Total input Dask DataFrame partitions {df.npartitions}", flush=True) + + write_to_disk( + df=df, + output_path=args.output_data_dir, + write_to_filename=add_filename, + output_type=args.output_file_type, + ) + batch_et = time.time() + print( + f"File Batch ID {file_batch_id}: completed in {batch_et-batch_st} seconds", + flush=True, + ) + + global_et = time.time() + print( + f"Total time taken for FineWeb Mixtral Edu Classifier inference: {global_et-global_st} s", + flush=True, + ) + client.close() + + +def console_script(): + main() + + +if __name__ == "__main__": + console_script() diff --git a/nemo_curator/scripts/classifiers/fineweb_nemotron_edu_classifier_inference.py b/nemo_curator/scripts/classifiers/fineweb_nemotron_edu_classifier_inference.py new file mode 100644 index 00000000..112453a2 --- /dev/null +++ 
b/nemo_curator/scripts/classifiers/fineweb_nemotron_edu_classifier_inference.py @@ -0,0 +1,113 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import warnings + +os.environ["RAPIDS_NO_INITIALIZE"] = "1" + +from nemo_curator.classifiers import FineWebNemotronEduClassifier +from nemo_curator.datasets import DocumentDataset + +# Get relevant args +from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk +from nemo_curator.utils.file_utils import get_remaining_files +from nemo_curator.utils.script_utils import ArgumentHelper + +warnings.filterwarnings("ignore") + + +def main(): + args = ArgumentHelper.parse_distributed_classifier_args( + description="Run FineWeb Nemotron-4 Edu Classifier inference." + ).parse_args() + print(f"Arguments parsed = {args}", flush=True) + + client_args = ArgumentHelper.parse_client_args(args) + client_args["cluster_type"] = "gpu" + client = get_client(**client_args) + print("Starting FineWeb Nemotron-4 Edu Classifier inference", flush=True) + global_st = time.time() + files_per_run = len(client.scheduler_info()["workers"]) * 2 + + if not os.path.exists(args.output_data_dir): + os.makedirs(args.output_data_dir) + + # Some times jsonl files are stored as .json + # So to handle that case we can pass the input_file_extension + if args.input_file_extension is not None: + input_file_extension = args.input_file_extension + else: + input_file_extension = args.input_file_type + + input_files = get_remaining_files( + args.input_data_dir, args.output_data_dir, input_file_extension + ) + print(f"Total input files {len(input_files)}", flush=True) + + if args.input_file_type == "pickle": + add_filename = False + else: + add_filename = True + + fineweb_nemotron_edu_classifier = FineWebNemotronEduClassifier( + text_field=args.input_text_field, + batch_size=args.batch_size, + autocast=args.autocast, + max_chars=args.max_chars, + max_mem_gb=args.max_mem_gb_classifier, + ) + + for file_batch_id, i in enumerate(range(0, len(input_files), files_per_run)): + batch_st = time.time() + current_batch_files = input_files[i : i + files_per_run] + print( + f"File Batch ID {file_batch_id}: total input files {len(current_batch_files)}", + flush=True, + ) + df = read_data( + input_files=current_batch_files, + file_type=args.input_file_type, + add_filename=add_filename, + ) + df = fineweb_nemotron_edu_classifier(DocumentDataset(df)).df + print(f"Total input Dask DataFrame partitions {df.npartitions}", flush=True) + + write_to_disk( + df=df, + output_path=args.output_data_dir, + write_to_filename=add_filename, + output_type=args.output_file_type, + ) + batch_et = time.time() + print( + f"File Batch ID {file_batch_id}: completed in {batch_et-batch_st} seconds", + flush=True, + ) + + global_et = time.time() + print( + f"Total time taken for FineWeb Nemotron-4 Edu Classifier inference: {global_et-global_st} s", + flush=True, + ) + client.close() + + +def console_script(): + main() + + 
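The two scripts above are thin CLI wrappers; the classifiers can also be driven directly from Python. A minimal sketch follows, assuming a GPU-backed Dask cluster and a JSONL input path (both illustrative).

```python
from nemo_curator.classifiers import FineWebNemotronEduClassifier
from nemo_curator.datasets import DocumentDataset
from nemo_curator.utils.distributed_utils import get_client

client = get_client(cluster_type="gpu")  # classifiers run on GPU workers

# Hypothetical input path; the cuDF backend keeps the data on GPU.
dataset = DocumentDataset.read_json("web_docs.jsonl", backend="cudf")

classifier = FineWebNemotronEduClassifier(batch_size=1024)
scored = classifier(dataset)

# Scores, integer scores, and quality labels are added as new columns.
scored.to_json("scored_docs")
client.close()
```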
+if __name__ == "__main__": + console_script() diff --git a/nemo_curator/synthetic/__init__.py b/nemo_curator/synthetic/__init__.py index 44a4b6c1..1efb3043 100644 --- a/nemo_curator/synthetic/__init__.py +++ b/nemo_curator/synthetic/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,9 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. from .async_nemotron import AsyncNemotronGenerator +from .async_nemotron_cc import AsyncNemotronCCGenerator from .error import YamlConversionError from .mixtral import Mixtral8x7BFormatter from .nemotron import NemotronFormatter, NemotronGenerator +from .nemotron_cc import ( + NemotronCCDiverseQAPostprocessor, + NemotronCCGenerator, + NemotronCCKnowledgeListPostprocessor, +) from .no_format import NoFormat from .prompts import ( DEFAULT_CLOSED_QA_PROMPT_TEMPLATE, @@ -45,6 +51,10 @@ "NemotronGenerator", "AsyncNemotronGenerator", "NemotronFormatter", + "NemotronCCGenerator", + "AsyncNemotronCCGenerator", + "NemotronCCDiverseQAPostprocessor", + "NemotronCCKnowledgeListPostprocessor", "Mixtral8x7BFormatter", "NoFormat", "DEFAULT_MACRO_TOPICS_PROMPT_TEMPLATE", diff --git a/nemo_curator/synthetic/async_nemotron_cc.py b/nemo_curator/synthetic/async_nemotron_cc.py new file mode 100644 index 00000000..8c16e11a --- /dev/null +++ b/nemo_curator/synthetic/async_nemotron_cc.py @@ -0,0 +1,196 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +from nemo_curator.services import AsyncLLMClient +from nemo_curator.synthetic.prompts import ( + DISTILL_PROMPT_TEMPLATE, + DIVERSE_QA_PROMPT_TEMPLATE, + EXTRACT_KNOWLEDGE_PROMPT_TEMPLATE, + KNOWLEDGE_LIST_PROMPT_TEMPLATE, + NEMOTRON_CC_DISTILL_SYSTEM_PROMPT, + NEMOTRON_CC_SYSTEM_PROMPT, + WIKIPEDIA_REPHRASING_PROMPT_TEMPLATE, +) + + +class AsyncNemotronCCGenerator: + """ + Provides a collection of methods for generating synthetic data + described in the Nemotron-CC paper (https://arxiv.org/abs/2412.02595). + """ + + def __init__(self, llm_client: AsyncLLMClient) -> None: + """ + Initialize the AsyncNemotronCCGenerator instance. + + Args: + llm_client (LLMClient): The language model client used for querying the model. 
+ """ + self.client = llm_client + + async def _prompt( + self, + model: str, + document: str, + prompt_template: str, + system_prompt: str, + prompt_kwargs: dict, + model_kwargs: dict, + ) -> List[str]: + prompt = prompt_template.format(document=document, **prompt_kwargs) + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": prompt}, + ] + + return await self.client.query_model( + messages=messages, model=model, **model_kwargs + ) + + async def rewrite_to_wikipedia_style( + self, + document: str, + model: str, + prompt_template: str = WIKIPEDIA_REPHRASING_PROMPT_TEMPLATE, + system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Rewrites a document into a Wikipedia-style narrative. + + Args: + document (str): The input document text to rewrite. + model (str): The model identifier to use. + prompt_template (str, optional): The prompt template for rewriting. Defaults to WIKIPEDIA_REPHRASING_PROMPT_TEMPLATE. + system_prompt (str, optional): The system prompt to use. Defaults to NEMOTRON_CC_SYSTEM_PROMPT. + prompt_kwargs (dict, optional): Additional keyword arguments for the prompt. Defaults to {}. + model_kwargs (dict, optional): Additional keyword arguments for the model invocation. Defaults to {}. + + Returns: + List[str]: A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + return await self._prompt( + model, document, prompt_template, system_prompt, prompt_kwargs, model_kwargs + ) + + async def generate_diverse_qa( + self, + document: str, + model: str, + prompt_template: str = DIVERSE_QA_PROMPT_TEMPLATE, + system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Generates diverse QA pairs from the provided document. + + Args: + document (str): The input document text used to generate QA pairs. + model (str): The model identifier to use. + prompt_template (str, optional): The prompt template for generating QA pairs. Defaults to DIVERSE_QA_PROMPT_TEMPLATE. + system_prompt (str, optional): The system prompt to use. Defaults to NEMOTRON_CC_SYSTEM_PROMPT. + prompt_kwargs (dict, optional): Additional keyword arguments for the prompt. Defaults to {}. + model_kwargs (dict, optional): Additional keyword arguments for the model invocation. Defaults to {}. + + Returns: + List[str]: A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + return await self._prompt( + model, document, prompt_template, system_prompt, prompt_kwargs, model_kwargs + ) + + async def distill( + self, + document: str, + model: str, + prompt_template: str = DISTILL_PROMPT_TEMPLATE, + system_prompt: str = NEMOTRON_CC_DISTILL_SYSTEM_PROMPT, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Distills the essential content from a document. + + Args: + document (str): The input document text to distill. + model (str): The model identifier to use. + prompt_template (str, optional): The prompt template for distillation. Defaults to DISTILL_PROMPT_TEMPLATE. + system_prompt (str, optional): The system prompt to use. Defaults to NEMOTRON_CC_DISTILL_SYSTEM_PROMPT. + prompt_kwargs (dict, optional): Additional keyword arguments for the prompt. Defaults to {}. + model_kwargs (dict, optional): Additional keyword arguments for the model invocation. Defaults to {}. 
+ + Returns: + List[str]: A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + return await self._prompt( + model, document, prompt_template, system_prompt, prompt_kwargs, model_kwargs + ) + + async def extract_knowledge( + self, + document: str, + model: str, + prompt_template: str = EXTRACT_KNOWLEDGE_PROMPT_TEMPLATE, + system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Extracts knowledge from the provided document. + + Args: + document (str): The input document text from which to extract knowledge. + model (str): The model identifier to use. + prompt_template (str, optional): The prompt template for knowledge extraction. Defaults to EXTRACT_KNOWLEDGE_PROMPT_TEMPLATE. + system_prompt (str, optional): The system prompt to use. Defaults to NEMOTRON_CC_SYSTEM_PROMPT. + prompt_kwargs (dict, optional): Additional keyword arguments for the prompt. Defaults to {}. + model_kwargs (dict, optional): Additional keyword arguments for the model invocation. Defaults to {}. + + Returns: + List[str]: A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + return await self._prompt( + model, document, prompt_template, system_prompt, prompt_kwargs, model_kwargs + ) + + async def generate_knowledge_list( + self, + document: str, + model: str, + prompt_template: str = KNOWLEDGE_LIST_PROMPT_TEMPLATE, + system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Generates a list of knowledge items from the provided document. + + Args: + document (str): The input document text to process. + model (str): The model identifier to use. + prompt_template (str, optional): The prompt template for generating a knowledge list. Defaults to KNOWLEDGE_LIST_PROMPT_TEMPLATE. + system_prompt (str, optional): The system prompt to use. Defaults to NEMOTRON_CC_SYSTEM_PROMPT. + prompt_kwargs (dict, optional): Additional keyword arguments for the prompt. Defaults to {}. + model_kwargs (dict, optional): Additional keyword arguments for the model invocation. Defaults to {}. + + Returns: + List[str]: A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + return await self._prompt( + model, document, prompt_template, system_prompt, prompt_kwargs, model_kwargs + ) diff --git a/nemo_curator/synthetic/nemotron_cc.py b/nemo_curator/synthetic/nemotron_cc.py new file mode 100644 index 00000000..24629945 --- /dev/null +++ b/nemo_curator/synthetic/nemotron_cc.py @@ -0,0 +1,325 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
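Both generator classes follow the same pattern; a minimal sketch of the asynchronous variant is shown below, assuming an OpenAI-compatible endpoint, API key, and model name (all illustrative).

```python
import asyncio

from openai import AsyncOpenAI

from nemo_curator.services import AsyncOpenAIClient
from nemo_curator.synthetic import AsyncNemotronCCGenerator


async def main() -> None:
    # Assumed endpoint and model; any OpenAI-compatible service should work similarly.
    openai_client = AsyncOpenAI(
        base_url="https://integrate.api.nvidia.com/v1", api_key="<your API key>"
    )
    generator = AsyncNemotronCCGenerator(AsyncOpenAIClient(openai_client))

    document = "The Eiffel Tower was completed in 1889 for the World's Fair in Paris."
    responses = await generator.rewrite_to_wikipedia_style(
        document=document, model="nv-mistralai/mistral-nemo-12b-instruct"
    )
    print(responses[0])


asyncio.run(main())
```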
+ +import random +from typing import Any, List, Optional + +from transformers import AutoTokenizer + +from nemo_curator import BaseModule +from nemo_curator.datasets import DocumentDataset +from nemo_curator.services import LLMClient +from nemo_curator.synthetic.prompts import ( + DISTILL_PROMPT_TEMPLATE, + DIVERSE_QA_PROMPT_TEMPLATE, + EXTRACT_KNOWLEDGE_PROMPT_TEMPLATE, + KNOWLEDGE_LIST_PROMPT_TEMPLATE, + NEMOTRON_CC_DISTILL_SYSTEM_PROMPT, + NEMOTRON_CC_SYSTEM_PROMPT, + WIKIPEDIA_REPHRASING_PROMPT_TEMPLATE, +) + + +class NemotronCCGenerator: + """ + Provides a collection of methods for generating synthetic data + described in the Nemotron-CC paper (https://arxiv.org/abs/2412.02595). + """ + + def __init__(self, llm_client: LLMClient) -> None: + """ + Initialize the NemotronCCGenerator instance. + + Args: + llm_client (LLMClient): The language model client used for querying the model. + """ + self.client = llm_client + + def _prompt( + self, + model: str, + document: str, + prompt_template: str, + system_prompt: str, + prompt_kwargs: dict, + model_kwargs: dict, + ) -> List[str]: + prompt = prompt_template.format(document=document, **prompt_kwargs) + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": prompt}, + ] + + return self.client.query_model(messages=messages, model=model, **model_kwargs) + + def rewrite_to_wikipedia_style( + self, + document: str, + model: str, + prompt_template: str = WIKIPEDIA_REPHRASING_PROMPT_TEMPLATE, + system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Rewrites a document into a Wikipedia-style narrative. + + Args: + document (str): The input document text to rewrite. + model (str): The model identifier to use. + prompt_template (str, optional): The prompt template for rewriting. Defaults to WIKIPEDIA_REPHRASING_PROMPT_TEMPLATE. + system_prompt (str, optional): The system prompt to use. Defaults to NEMOTRON_CC_SYSTEM_PROMPT. + prompt_kwargs (dict, optional): Additional keyword arguments for the prompt. Defaults to {}. + model_kwargs (dict, optional): Additional keyword arguments for the model invocation. Defaults to {}. + + Returns: + List[str]: A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + return self._prompt( + model, document, prompt_template, system_prompt, prompt_kwargs, model_kwargs + ) + + def generate_diverse_qa( + self, + document: str, + model: str, + prompt_template: str = DIVERSE_QA_PROMPT_TEMPLATE, + system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Generates diverse QA pairs from the provided document. + + Args: + document (str): The input document text used to generate QA pairs. + model (str): The model identifier to use. + prompt_template (str, optional): The prompt template for generating QA pairs. Defaults to DIVERSE_QA_PROMPT_TEMPLATE. + system_prompt (str, optional): The system prompt to use. Defaults to NEMOTRON_CC_SYSTEM_PROMPT. + prompt_kwargs (dict, optional): Additional keyword arguments for the prompt. Defaults to {}. + model_kwargs (dict, optional): Additional keyword arguments for the model invocation. Defaults to {}. + + Returns: + List[str]: A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. 
+ """ + return self._prompt( + model, document, prompt_template, system_prompt, prompt_kwargs, model_kwargs + ) + + def distill( + self, + document: str, + model: str, + prompt_template: str = DISTILL_PROMPT_TEMPLATE, + system_prompt: str = NEMOTRON_CC_DISTILL_SYSTEM_PROMPT, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Distills the essential content from a document. + + Args: + document (str): The input document text to distill. + model (str): The model identifier to use. + prompt_template (str, optional): The prompt template for distillation. Defaults to DISTILL_PROMPT_TEMPLATE. + system_prompt (str, optional): The system prompt to use. Defaults to NEMOTRON_CC_DISTILL_SYSTEM_PROMPT. + prompt_kwargs (dict, optional): Additional keyword arguments for the prompt. Defaults to {}. + model_kwargs (dict, optional): Additional keyword arguments for the model invocation. Defaults to {}. + + Returns: + List[str]: A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + return self._prompt( + model, document, prompt_template, system_prompt, prompt_kwargs, model_kwargs + ) + + def extract_knowledge( + self, + document: str, + model: str, + prompt_template: str = EXTRACT_KNOWLEDGE_PROMPT_TEMPLATE, + system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Extracts knowledge from the provided document. + + Args: + document (str): The input document text from which to extract knowledge. + model (str): The model identifier to use. + prompt_template (str, optional): The prompt template for knowledge extraction. Defaults to EXTRACT_KNOWLEDGE_PROMPT_TEMPLATE. + system_prompt (str, optional): The system prompt to use. Defaults to NEMOTRON_CC_SYSTEM_PROMPT. + prompt_kwargs (dict, optional): Additional keyword arguments for the prompt. Defaults to {}. + model_kwargs (dict, optional): Additional keyword arguments for the model invocation. Defaults to {}. + + Returns: + List[str]: A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + return self._prompt( + model, document, prompt_template, system_prompt, prompt_kwargs, model_kwargs + ) + + def generate_knowledge_list( + self, + document: str, + model: str, + prompt_template: str = KNOWLEDGE_LIST_PROMPT_TEMPLATE, + system_prompt: str = NEMOTRON_CC_SYSTEM_PROMPT, + prompt_kwargs: dict = {}, + model_kwargs: dict = {}, + ) -> List[str]: + """ + Generates a list of knowledge items from the provided document. + + Args: + document (str): The input document text to process. + model (str): The model identifier to use. + prompt_template (str, optional): The prompt template for generating a knowledge list. Defaults to KNOWLEDGE_LIST_PROMPT_TEMPLATE. + system_prompt (str, optional): The system prompt to use. Defaults to NEMOTRON_CC_SYSTEM_PROMPT. + prompt_kwargs (dict, optional): Additional keyword arguments for the prompt. Defaults to {}. + model_kwargs (dict, optional): Additional keyword arguments for the model invocation. Defaults to {}. + + Returns: + List[str]: A list of responses from the LLM. The list is only greater than length 1 if n > 1 is set in model_kwargs. + """ + return self._prompt( + model, document, prompt_template, system_prompt, prompt_kwargs, model_kwargs + ) + + +class NemotronCCDiverseQAPostprocessor(BaseModule): + """ + Postprocesses the output of the Nemotron-CC Diverse QA generation pipeline. 
+ This postprocessor will sample a random number of QA pairs up to max_num_pairs. + If a tokenizer is provided, the number of QA pairs will be sampled from at least 1 and at most floor(max_num_pairs * num_tokens / 150). + Otherwise, the number of QA pairs will be sampled randomly strictly up to max_num_pairs. + + The generated QA pairs are shuffled and then appended to the original text. + """ + + def __init__( + self, + tokenizer: Optional[AutoTokenizer] = None, + text_field: str = "text", + response_field: str = "response", + max_num_pairs: int = 1, + prefix: str = "Here are the questions and answers based on the provided text:", + ) -> None: + """ + Args: + tokenizer (Optional[AutoTokenizer]): The tokenizer to use for tokenization. + If specified, the number of QA pairs will be sampled based on the token count of the text. + If not specified, the number of QA pairs will be sampled randomly up to max_num_pairs. + text_field (str): The field in the dataset that contains the text used to generate QA pairs. + response_field (str): The field in the dataset that contains the response from the LLM. + max_num_pairs (int): The maximum number of QA pairs to sample. + prefix (str): The prefix of the response from the LLM. + """ + super().__init__(input_backend="pandas") + self.tokenizer = tokenizer + self.text_field = text_field + self.response_field = response_field + self.max_num_pairs = max_num_pairs + self.prefix = prefix + + def _postprocess_llm_response(self, text: str, llm_response: str) -> str: + lines = [line.strip() for line in llm_response.split("\n") if line.strip()] + if not lines: + return "" + + # Remove the "- " prefix + lines = [line[2:].strip() if line.startswith("- ") else line for line in lines] + + if lines[0] == self.prefix: + lines = lines[1:] + + # Merge question and answer lines + qa_pairs = [] + for line in lines: + if line.startswith("Question:"): + qa_pairs.append(line) + else: + if qa_pairs: + qa_pairs[-1] += "\n" + line + else: + return "" + + if len(qa_pairs) == 0: + return "" + + # Shuffle the QA pairs and sample up to max_num_pairs + random.shuffle(qa_pairs) + if self.tokenizer is not None: + num_tokens = len(self.tokenizer.tokenize(text)) + qa_pairs = qa_pairs[ + : random.randint(1, max(1, int(self.max_num_pairs * num_tokens / 150))) + ] + else: + qa_pairs = qa_pairs[: random.randint(1, self.max_num_pairs)] + qa_pairs_str = "\n\n".join(qa_pairs) + + # Concatenate the document and the QA pairs + return f"{text}\n\n{qa_pairs_str}" + + def call(self, dataset: DocumentDataset) -> DocumentDataset: + df = dataset.df + df[self.response_field] = df.apply( + lambda row: self._postprocess_llm_response( + row[self.text_field], row[self.response_field] + ), + axis=1, + meta=(None, "object"), + ) + df = df[df[self.response_field] != ""] + + return DocumentDataset(df) + + +# Although this could be implemented as a DocumentModifier, +# I have kept it separate to match the other postprocessors. +class NemotronCCKnowledgeListPostprocessor(BaseModule): + """ + Processes and cleans the output generated by the Nemotron-CC Knowledge List pipeline. + + This class is responsible for postprocessing raw text responses produced by the + Nemotron-CC Knowledge List generation pipeline. It removes formatting artifacts + such as bullet point prefixes ("- ") and extra indentation from each line, ensuring + that the final output is a clean, uniformly formatted list of knowledge items. 
+ The processing includes skipping any initial non-bullet line and merging related lines + to reconstruct multi-line questions or answers. + """ + + def __init__(self, text_field: str = "text") -> None: + super().__init__(input_backend="pandas") + self.text_field = text_field + + def _postprocess_llm_response(self, text: str) -> str: + lines = [] + for idx, line in enumerate(text.split("\n")): + if idx == 0 and not line.startswith("-"): + continue + + if line.startswith(" ") or line.startswith("- "): + lines.append(line[2:].strip()) + else: + lines.append(line) + return "\n".join(lines) + + def call(self, dataset: DocumentDataset) -> DocumentDataset: + df = dataset.df + df[self.text_field] = df[self.text_field].apply( + self._postprocess_llm_response, meta=(self.text_field, "object") + ) + return DocumentDataset(df) diff --git a/nemo_curator/synthetic/prompts.py b/nemo_curator/synthetic/prompts.py index fbe7e026..1cdf53ed 100644 --- a/nemo_curator/synthetic/prompts.py +++ b/nemo_curator/synthetic/prompts.py @@ -56,3 +56,83 @@ DIALOGUE_COMPLEX_USER_TURN_PROMPT_TEMPLATE = "Here is a conversation between a user and an assistant.\n<|The Start of Assistant's Conversation with User|>\n{conversation_history}\n<|The End of Assistant's Conversation with User|>\n\nGiven the conversation above, generate a followup request or question in the tone of User. Make sure the question is complex and diverse enough and suitable as a followup question. Directly give me the question without extraneous words." DIALOGUE_CONCISE_USER_TURN_PROMPT_TEMPLATE = "Here is a conversation between a user and an assistant.\n<|The Start of Assistant's Conversation with User|>\n{conversation_history}\n<|The End of Assistant's Conversation with User|>\n\nGiven the conversation above, generate a followup request or question in the toneof User. Be critical. Make sure the question is concise and has a real-life tone. Directly give me the question without extraneous words." + + +# Nemotron-CC prompts + +NEMOTRON_CC_SYSTEM_PROMPT = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the questions." + +NEMOTRON_CC_DISTILL_SYSTEM_PROMPT = "You are an artificial intelligence assistant. You carefully provide accurate, factual, thoughtful, nuanced answers, and are brilliant at reasoning." + +WIKIPEDIA_REPHRASING_PROMPT_TEMPLATE = """For the following paragraph give me a diverse paraphrase of the same in high quality English language as in sentences on Wikipedia. Begin your answer on a separate line with "Here is a paraphrased version:". + +Text: {document}""" + +DIVERSE_QA_PROMPT_TEMPLATE = """Task: +Read the text, ask questions and answer them. + +Follow these instructions: +1. Ask diverse questions that require different cognitive skills or cover different aspects of the text. +2. Ask questions in various forms such as: + - Yes/No questions that require determining whether a statement is true or false. + - Open-ended questions that begin with words like what, how, when, where, why and who. + - Multi-choice questions that offers two or more options to choose from. Include the options in the question. + - Comparison questions that compare two quantities or objects and determine the relationship between them. + - Reading comprehension questions that test the ability to understand and analyze the text. + - Problem-solving questions that test the ability to solve mathematical, physical, or logical problems. +3. 
Focus on asking questions about factual information, important knowledge, or concrete details in the text. +4. Write questions and answers using clear and concise language. +5. Use plain text. Do not use Markdown. +6. Each question and answer pair should be on a separate line. Tag the question with "Question:" and the answer with "Answer:". + +Text: +{document} + +Task: +After reading the above text, ask up to 8 questions and provide the correct answers following the instructions. Give your response in this format: + +Here are the questions and answers based on the provided text: +- Question: [first question] Answer: [first answer] +- Question: [second question] Answer: [second answer] +....""" + +DISTILL_PROMPT_TEMPLATE = """Your task is to read and paraphrase the provided text following these instructions: +- Aim to create a condensed but accurate and informative version of the original text, not a simplistic summary. +- Capture and preserve the crucial information, key concepts, important values, factual details in the original text, while making it more readable and accessible. +- Retain technical terms, specialized vocabulary, and complex concepts. +- Retain examples, explanations of reasoning processes, and supporting evidence to maintain the text's depth and context. +- Only include information that is present in the original text. Do not adding new or unsubstantiated claims. +- Write the text in plain text without formatting. + +Here is the text: +{document} + +Task: +After thoroughly reading the above text, paraphrase it in high-quality and clear English following the instructions. Begin your response with "Paraphrased Text:".""" + +EXTRACT_KNOWLEDGE_PROMPT_TEMPLATE = """Your task is to rewrite knowledge from the provided text following these instructions. +- Rewrite the text as a passage or passages using easy-to-understand and high-quality English like sentences in textbooks and Wikipedia. +- Focus on content in disciplines such as humanities, social sciences, natural sciences, technology, engineering, math, law and legal, business, management, art, education, agricultural sciences, politics, and history. +- Disregard content that does not contain useful facts or knowledge. +- Retain examples, explanations of reasoning processes, and supporting evidence to maintain the text's depth and context. +- Do not add or alter details. Only restate what is already in the text. +- Write in plain text. +- Do not add titles, subtitles, note, or comment. + +Text: +{document} + +Task: +Rewrite facts and knowledge from the above text as a passage or passages following the instructions.""" + +KNOWLEDGE_LIST_PROMPT_TEMPLATE = """Review the text and extract the key information. Follow these instructions: +- Carefully read the above text and provide a concise and organized list of factual information, concrete details, key concepts, and important numbers and statistics extracted from the text. +- Ensure each point is clear, specific, and supported by the original text. +- Ensure the extract text is information-dense and easier to learn from. +- Do not add titles or headings. 
+ +Text: +{document} + +Task: +Extract the factual information, concrete details, and key concepts from the above text following the instructions.""" diff --git a/nemo_curator/utils/duplicates_removal.py b/nemo_curator/utils/duplicates_removal.py new file mode 100644 index 00000000..ea654515 --- /dev/null +++ b/nemo_curator/utils/duplicates_removal.py @@ -0,0 +1,73 @@ +from typing import List, Union + +import dask.dataframe as dd + + +def deduplicate_groups( + duplicates: dd.DataFrame, group_field: str, perform_shuffle: bool +) -> dd.DataFrame: + if perform_shuffle: + # Redistribute data across partitions so that all duplicates are in same partition + duplicates_shuffled = duplicates.shuffle(on=[group_field], ignore_index=True) + else: + duplicates_shuffled = duplicates + + duplicates_to_remove = ( + duplicates_shuffled + # For each partition, keep only the duplicated rows (excluding first occurrence) + .map_partitions(lambda x: x[x[group_field].duplicated(keep="first")]).drop( + columns=group_field + ) + ) + return duplicates_to_remove + + +def left_anti_join( + left: dd.DataFrame, + right: dd.DataFrame, + left_on: Union[str, List[str]], + right_on: Union[str, List[str]], +): + assert left_on != right_on, "left_on and right_on cannot be the same" + merge = left.merge( + right=right, + how="left", + broadcast=True, # Broadcast smaller DataFrame to all partitions + left_on=left_on, + right_on=right_on, + ) + + # This effectively removes all rows that were not in duplicates_to_remove + removed_result = merge[merge[right_on].isna()].drop(columns=[right_on]) + return removed_result + + +def remove_duplicates( + left: dd.DataFrame, + duplicates: dd.DataFrame, + id_field: str, + group_field: str, + perform_shuffle: bool = False, +) -> dd.DataFrame: + if left.npartitions < duplicates.npartitions: + msg = ( + "The number of partitions in `left` is less than the number of partitions in the duplicates dataset. " + "This may lead to a shuffle join. Please re-read left and right with different partition sizes, or repartition left / right." + ) + raise ValueError(msg) + + # Create a new column name for temporary ID storage during merge + new_id_field = f"{id_field}_new" + + duplicates_to_remove = ( + deduplicate_groups(duplicates, group_field, perform_shuffle) + # Rename the ID field to avoid conflicts in the upcoming merge + .rename(columns={id_field: new_id_field})[[new_id_field]] + ) + + return left_anti_join( + left=left, + right=duplicates_to_remove, + left_on=id_field, + right_on=new_id_field, + ) diff --git a/pyproject.toml b/pyproject.toml index d298ae67..56e0fd9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,8 +66,7 @@ dependencies = [ "resiliparse", "sentencepiece", "spacy>=3.6.0, <3.8.0", - # TODO: Remove this pin once newer version is released - "transformers==4.46.3", + "transformers>=4.48.0", "unidic-lite==1.0.8", "usaddress==0.5.10", "warcio==1.7.4", @@ -156,6 +155,8 @@ instruction_data_guard_classifier_inference = "nemo_curator.scripts.classifiers. 
multilingual_domain_classifier_inference = "nemo_curator.scripts.classifiers.multilingual_domain_classifier_inference:console_script" content_type_classifier_inference = "nemo_curator.scripts.classifiers.content_type_classifier_inference:console_script" prompt_task_complexity_classifier_inference = "nemo_curator.scripts.classifiers.prompt_task_complexity_classifier_inference:console_script" +fineweb_mixtral_edu_classifier_inference = "nemo_curator.scripts.classifiers.fineweb_mixtral_edu_classifier_inference:console_script" +fineweb_nemotron_edu_classifier_inference = "nemo_curator.scripts.classifiers.fineweb_nemotron_edu_classifier_inference:console_script" verify_classification_results = "nemo_curator.scripts.verify_classification_results:console_script" blend_datasets = "nemo_curator.scripts.blend_datasets:console_script" semdedup_extract_embeddings = "nemo_curator.scripts.semdedup.compute_embeddings:console_script" diff --git a/tests/test_backends.py b/tests/test_backends.py index 6acd87b5..112bde6f 100644 --- a/tests/test_backends.py +++ b/tests/test_backends.py @@ -14,7 +14,6 @@ import pandas as pd import pytest from dask.dataframe.utils import assert_eq -from distributed import Client from nemo_curator import ( BaseModule, @@ -26,11 +25,10 @@ ) from nemo_curator.datasets import DocumentDataset from nemo_curator.filters import MeanWordLengthFilter -from nemo_curator.utils.import_utils import gpu_only_import, gpu_only_import_from +from nemo_curator.utils.import_utils import gpu_only_import cudf = gpu_only_import("cudf") dask_cudf = gpu_only_import("dask_cudf") -LocalCUDACluster = gpu_only_import_from("dask_cuda", "LocalCUDACluster") class CPUModule(BaseModule): @@ -98,18 +96,12 @@ def gpu_data(raw_data): @pytest.mark.gpu class TestBackendSupport: - @pytest.fixture(autouse=True, scope="class") - def gpu_client(self, request): - with LocalCUDACluster(n_workers=1) as cluster, Client(cluster) as client: - request.cls.client = client - request.cls.cluster = cluster - yield - def test_pandas_backend( self, cpu_data, + gpu_client, ): - print("client", self.client) + print("client", gpu_client) dataset, gt_lengths = cpu_data pipeline = CPUModule() result = pipeline(dataset) @@ -119,8 +111,9 @@ def test_pandas_backend( def test_cudf_backend( self, gpu_data, + gpu_client, ): - print("client", self.client) + print("client", gpu_client) dataset, gt_lengths = gpu_data pipeline = GPUModule() result = pipeline(dataset) @@ -131,8 +124,9 @@ def test_any_backend( self, cpu_data, gpu_data, + gpu_client, ): - print("client", self.client) + print("client", gpu_client) cpu_dataset, gt_cpu_lengths = cpu_data gt_cpu_lengths = gt_cpu_lengths.rename("any_lengths") gpu_dataset, gt_gpu_lengths = gpu_data @@ -150,8 +144,9 @@ def test_pandas_to_cudf( self, cpu_data, gpu_data, + gpu_client, ): - print("client", self.client) + print("client", gpu_client) dataset, gt_cpu_lengths = cpu_data _, gt_gpu_lengths = gpu_data pipeline = Sequential( @@ -170,8 +165,9 @@ def test_cudf_to_pandas( self, cpu_data, gpu_data, + gpu_client, ): - print("client", self.client) + print("client", gpu_client) _, gt_cpu_lengths = cpu_data dataset, gt_gpu_lengths = gpu_data pipeline = Sequential( @@ -190,8 +186,9 @@ def test_5x_switch( self, cpu_data, gpu_data, + gpu_client, ): - print("client", self.client) + print("client", gpu_client) dataset, gt_cpu_lengths = cpu_data _, gt_gpu_lengths = gpu_data pipeline = Sequential( @@ -220,25 +217,25 @@ def test_5x_switch( assert_eq(result_df["cpu_lengths"], gt_cpu_lengths) 
assert_eq(result_df["gpu_lengths"], gt_gpu_lengths) - def test_wrong_backend_cpu_data(self, cpu_data): + def test_wrong_backend_cpu_data(self, cpu_data, gpu_client): with pytest.raises(ValueError): - print("client", self.client) + print("client", gpu_client) dataset, _ = cpu_data pipeline = GPUModule() result = pipeline(dataset) _ = result.df.compute() - def test_wrong_backend_gpu_data(self, gpu_data): + def test_wrong_backend_gpu_data(self, gpu_data, gpu_client): with pytest.raises(ValueError): - print("client", self.client) + print("client", gpu_client) dataset, _ = gpu_data pipeline = CPUModule() result = pipeline(dataset) _ = result.df.compute() - def test_unsupported_to_backend(self, cpu_data): + def test_unsupported_to_backend(self, cpu_data, gpu_client): with pytest.raises(ValueError): - print("client", self.client) + print("client", gpu_client) dataset, _ = cpu_data pipeline = ToBackend("fake_backend") result = pipeline(dataset) @@ -281,18 +278,12 @@ def real_module_gpu_data(real_module_raw_data): @pytest.mark.gpu class TestRealModules: - @pytest.fixture(autouse=True, scope="class") - def gpu_client(self, request): - with LocalCUDACluster(n_workers=1) as cluster, Client(cluster) as client: - request.cls.client = client - request.cls.cluster = cluster - yield - def test_score_filter( self, real_module_cpu_data, + gpu_client, ): - print("client", self.client) + print("client", gpu_client) dataset, gt_results = real_module_cpu_data pipeline = ScoreFilter( MeanWordLengthFilter(), score_field="mean_lengths", score_type=float @@ -304,9 +295,10 @@ def test_score_filter( def test_score_filter_wrong_backend( self, real_module_gpu_data, + gpu_client, ): with pytest.raises(ValueError): - print("client", self.client) + print("client", gpu_client) dataset, _ = real_module_gpu_data pipeline = ScoreFilter( MeanWordLengthFilter(), score_field="mean_lengths", score_type=float @@ -318,8 +310,9 @@ def test_fuzzy_dedup( self, real_module_gpu_data, tmpdir, + gpu_client, ): - print(self.client) + print(gpu_client) dataset, gt_results = real_module_gpu_data # Dedup might fail when indices per partition do not start from 0 dataset.df = dataset.df.reset_index(drop=True) @@ -355,9 +348,10 @@ def test_fuzzy_dedup_wrong_backend( self, real_module_cpu_data, tmpdir, + gpu_client, ): with pytest.raises(ValueError): - print(self.client) + print(gpu_client) dataset, _ = real_module_cpu_data # Dedup might fail when indices per partition do not start from 0 dataset.df = dataset.df.reset_index(drop=True) @@ -384,8 +378,9 @@ def test_score_filter_and_fuzzy( real_module_cpu_data, real_module_gpu_data, tmpdir, + gpu_client, ): - print("client", self.client) + print("client", gpu_client) dataset, _ = real_module_cpu_data _, gt_results = real_module_gpu_data dataset.df = dataset.df.reset_index(drop=True) diff --git a/tests/test_classifiers.py b/tests/test_classifiers.py index 1d37e7f5..d6d2852c 100644 --- a/tests/test_classifiers.py +++ b/tests/test_classifiers.py @@ -13,22 +13,12 @@ # limitations under the License. 
import pytest -from distributed import Client from nemo_curator.datasets import DocumentDataset -from nemo_curator.utils.import_utils import gpu_only_import, gpu_only_import_from +from nemo_curator.utils.import_utils import gpu_only_import cudf = gpu_only_import("cudf") dask_cudf = gpu_only_import("dask_cudf") -LocalCUDACluster = gpu_only_import_from("dask_cuda", "LocalCUDACluster") - - -@pytest.fixture -def gpu_client(request): - with LocalCUDACluster(n_workers=1) as cluster, Client(cluster) as client: - request.client = client - request.cluster = cluster - yield @pytest.fixture @@ -149,6 +139,38 @@ def test_fineweb_edu_classifier(gpu_client, domain_dataset): assert result_pred.equals(expected_pred) +@pytest.mark.skip( + reason="Skipping until https://huggingface.co/nvidia/nemocurator-fineweb-mixtral-edu-classifier is published" +) +@pytest.mark.gpu +def test_fineweb_mixtral_classifier(gpu_client, domain_dataset): + from nemo_curator.classifiers import FineWebMixtralEduClassifier + + classifier = FineWebMixtralEduClassifier() + result_dataset = classifier(dataset=domain_dataset) + result_pred = result_dataset.df.compute()["fineweb-mixtral-edu-score-int"] + + expected_pred = cudf.Series([1, 1, 1, 2, 0]) + + assert result_pred.equals(expected_pred) + + +@pytest.mark.skip( + reason="Skipping until https://huggingface.co/nvidia/nemocurator-fineweb-nemotron-4-edu-classifier is published" +) +@pytest.mark.gpu +def test_fineweb_nemotron_classifier(gpu_client, domain_dataset): + from nemo_curator.classifiers import FineWebNemotronEduClassifier + + classifier = FineWebNemotronEduClassifier() + result_dataset = classifier(dataset=domain_dataset) + result_pred = result_dataset.df.compute()["fineweb-nemotron-edu-score-int"] + + expected_pred = cudf.Series([1, 1, 1, 2, 0]) + + assert result_pred.equals(expected_pred) + + @pytest.mark.skip( reason="Instruction Data Guard needs to be downloaded and cached to our gpuCI runner to enable this" ) diff --git a/tests/test_cleaning.py b/tests/test_cleaning.py index 906da391..539fe49b 100644 --- a/tests/test_cleaning.py +++ b/tests/test_cleaning.py @@ -14,17 +14,26 @@ import dask.dataframe as dd import pandas as pd +from dask.dataframe.utils import assert_eq from nemo_curator import Modify from nemo_curator.datasets import DocumentDataset -from nemo_curator.modifiers import NewlineNormalizer, UnicodeReformatter, UrlRemover +from nemo_curator.modifiers import ( + LineRemover, + MarkdownRemover, + NewlineNormalizer, + QuotationRemover, + Slicer, + UnicodeReformatter, + UrlRemover, +) def list_to_dataset(documents, col_name="text", npartitions=2): data = {col_name: documents} pdf = pd.DataFrame(data) - return DocumentDataset(dd.from_pandas(pdf, npartitions=npartitions)) + return DocumentDataset.from_pandas(pdf, npartitions=npartitions) class TestUnicodeReformatter: @@ -149,3 +158,324 @@ def test_urls(self): assert ( expected_results == actual_results ), f"Expected: {expected_results}, but got: {actual_results}" + + +class TestLineRemover: + def test_remove_exact_match(self): + text = "Keep this\nRemove me\nAlso keep this\nRemove me" + patterns = ["Remove me"] + remover = LineRemover(patterns) + result = remover.modify_document(text) + expected = "Keep this\nAlso keep this" + assert result == expected + + def test_no_removal_when_partial_match(self): + text = ( + "Keep this line\nThis line contains Remove me as a part of it\nAnother line" + ) + patterns = ["Remove me"] + remover = LineRemover(patterns) + # Only lines that exactly match "Remove me" are removed. 
+ assert remover.modify_document(text) == text + + def test_empty_input(self): + text = "" + patterns = ["Remove me"] + remover = LineRemover(patterns) + result = remover.modify_document(text) + assert result == "" + + def test_multiple_patterns(self): + text = "Line one\nDelete\nLine two\nRemove\nLine three\nDelete" + patterns = ["Delete", "Remove"] + remover = LineRemover(patterns) + result = remover.modify_document(text) + expected = "Line one\nLine two\nLine three" + assert result == expected + + def test_whitespace_sensitivity(self): + # Exact match requires identical string content. + text = "Remove me \nRemove me\n Remove me" + patterns = ["Remove me"] + remover = LineRemover(patterns) + result = remover.modify_document(text) + # Only the line that exactly equals "Remove me" is removed. + expected = "Remove me \n Remove me" + assert result == expected + + def test_dataset_modification(self): + docs = [ + "Keep this\nRemove me\nKeep that", + "Remove me\nDon't remove\nRemove me", + "No removal here", + "Remove me", + ] + expected_results = [ + "Keep this\nKeep that", + "Don't remove", + "No removal here", + "", + ] + dataset = list_to_dataset(docs) + modifier = Modify(LineRemover(["Remove me"])) + fixed_dataset = modifier(dataset) + expected_dataset = list_to_dataset(expected_results) + assert_eq(fixed_dataset.df, expected_dataset.df) + + +class TestQuotationRemover: + def test_remove_quotes_no_newline(self): + text = '"Hello, World!"' + remover = QuotationRemover() + result = remover.modify_document(text) + expected = "Hello, World!" + assert result == expected + + def test_no_removal_when_quotes_not_enclosing(self): + text = 'Hello, "World!"' + remover = QuotationRemover() + result = remover.modify_document(text) + # The text does not start and end with a quotation mark. + assert result == text + + def test_remove_quotes_with_newline_removal(self): + text = '"Hello,\nWorld!"' + remover = QuotationRemover() + result = remover.modify_document(text) + # Since there is a newline and the first line does not end with a quote, + # the quotes are removed. + expected = "Hello,\nWorld!" + assert result == expected + + def test_no_removal_with_newline_preserved(self): + text = '"Hello,"\nWorld!"' + remover = QuotationRemover() + result = remover.modify_document(text) + # The first line ends with a quote so the removal does not occur. + assert result == text + + def test_short_text_no_removal(self): + text = '""' + remover = QuotationRemover() + result = remover.modify_document(text) + # With text length not greater than 2 (after stripping), nothing changes. + assert result == text + + def test_extra_whitespace_prevents_removal(self): + # If leading/trailing whitespace prevents the text from starting with a quote, + # nothing is changed. + text = ' "Test Message" ' + remover = QuotationRemover() + result = remover.modify_document(text) + assert result == text + + def test_dataset_modification(self): + import pandas as pd + from dask.dataframe.utils import assert_eq + + docs = ['"Document one"', 'Start "Document two" End', '"Document\nthree"', '""'] + expected_results = [ + "Document one", + 'Start "Document two" End', + "Document\nthree", + '""', + ] + dataset = list_to_dataset(docs) + modifier = Modify(QuotationRemover()) + fixed_dataset = modifier(dataset) + expected_dataset = list_to_dataset(expected_results) + assert_eq(fixed_dataset.df, expected_dataset.df) + + +class TestSlicer: + def test_integer_indices(self): + text = "Hello, world!" 
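+        # left/right behave like a Python slice here: 7 is inclusive, 12 is exclusive,
+        # so text[7:12] == "world" is what the modifier is expected to return.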
+ slicer = Slicer(left=7, right=12) + result = slicer.modify_document(text) + expected = "world" + assert result == expected + + def test_left_string_including(self): + text = "abcXYZdef" + slicer = Slicer(left="XYZ", include_left=True) + result = slicer.modify_document(text) + expected = "XYZdef" + assert result == expected + + def test_left_string_excluding(self): + text = "abcXYZdef" + slicer = Slicer(left="XYZ", include_left=False) + result = slicer.modify_document(text) + expected = "def" + assert result == expected + + def test_right_string_including(self): + text = "abcXYZdef" + slicer = Slicer(right="XYZ", include_right=True) + result = slicer.modify_document(text) + expected = "abcXYZ" + assert result == expected + + def test_right_string_excluding(self): + text = "abcXYZdef" + slicer = Slicer(right="XYZ", include_right=False) + result = slicer.modify_document(text) + expected = "abc" + assert result == expected + + def test_both_left_and_right_with_strings(self): + text = "start middle end" + slicer = Slicer( + left="start", right="end", include_left=False, include_right=False + ) + result = slicer.modify_document(text) + # "start" is removed and "end" is excluded; extra spaces are stripped. + expected = "middle" + assert result == expected + + def test_non_existing_left(self): + text = "abcdef" + slicer = Slicer(left="nonexistent") + result = slicer.modify_document(text) + assert result == "" + + def test_non_existing_right(self): + text = "abcdef" + slicer = Slicer(right="nonexistent") + result = slicer.modify_document(text) + assert result == "" + + def test_no_left_no_right(self): + text = " some text with spaces " + slicer = Slicer() + result = slicer.modify_document(text) + # With no boundaries specified, the entire text is returned (stripped). + expected = "some text with spaces" + assert result == expected + + def test_integer_out_of_range(self): + text = "short" + slicer = Slicer(left=10) + result = slicer.modify_document(text) + # Slicing starting beyond the text length yields an empty string. + assert result == "" + + def test_multiple_occurrences(self): + text = "abc__def__ghi" + # Testing when markers appear multiple times. + slicer = Slicer(left="__", right="__", include_left=True, include_right=True) + result = slicer.modify_document(text) + # left: first occurrence at index 3; right: last occurrence at index 8, include_right adds len("__") + expected = "__def__" + assert result == expected + + def test_dataset_modification(self): + import pandas as pd + from dask.dataframe.utils import assert_eq + + docs = ["abcdef", "0123456789", "Hello", "Slicer"] + expected_results = [ + "cde", # "abcdef" sliced from index 2 to 5 + "234", # "0123456789" sliced from index 2 to 5 + "llo", # "Hello" sliced from index 2 to 5 + "ice", # "Slicer" sliced from index 2 to 5 + ] + dataset = list_to_dataset(docs) + modifier = Modify(Slicer(left=2, right=5)) + fixed_dataset = modifier(dataset) + expected_dataset = list_to_dataset(expected_results) + assert_eq(fixed_dataset.df, expected_dataset.df) + + +class TestMarkdownRemover: + def test_bold_removal(self): + text = "This is **bold** text." + remover = MarkdownRemover() + result = remover.modify_document(text) + expected = "This is bold text." + assert result == expected + + def test_italic_removal(self): + text = "This is *italic* text." + remover = MarkdownRemover() + result = remover.modify_document(text) + expected = "This is italic text." 
+ assert result == expected + + def test_underline_removal(self): + text = "This is _underlined_ text." + remover = MarkdownRemover() + result = remover.modify_document(text) + expected = "This is underlined text." + assert result == expected + + def test_link_removal(self): + text = "Link: [Google](https://google.com)" + remover = MarkdownRemover() + result = remover.modify_document(text) + expected = "Link: https://google.com" + assert result == expected + + def test_multiple_markdown(self): + text = "This is **bold**, *italic*, and _underline_, check [Example](https://example.com)" + remover = MarkdownRemover() + result = remover.modify_document(text) + expected = "This is bold, italic, and underline, check https://example.com" + assert result == expected + + def test_no_markdown(self): + text = "This line has no markdown." + remover = MarkdownRemover() + result = remover.modify_document(text) + assert result == text + + def test_incomplete_markdown(self): + text = "This is *italic text" + remover = MarkdownRemover() + result = remover.modify_document(text) + # Without a closing '*', the text remains unchanged. + assert result == text + + def test_nested_markdown(self): + text = "This is **bold and *italic* inside** text." + remover = MarkdownRemover() + result = remover.modify_document(text) + # Bold formatting is removed first, then italics in the resulting string. + expected = "This is bold and italic inside text." + assert result == expected + + def test_multiple_lines(self): + text = "**Bold line**\n*Italic line*\n_Normal line_" + remover = MarkdownRemover() + result = remover.modify_document(text) + expected = "Bold line\nItalic line\nNormal line" + assert result == expected + + def test_adjacent_markdown(self): + text = "**Bold****MoreBold**" + remover = MarkdownRemover() + result = remover.modify_document(text) + expected = "BoldMoreBold" + assert result == expected + + def test_dataset_modification(self): + import pandas as pd + from dask.dataframe.utils import assert_eq + + docs = [ + "This is **bold**", + "This is *italic*", + "Check [Link](https://example.com)", + "No markdown here", + ] + expected_results = [ + "This is bold", + "This is italic", + "Check https://example.com", + "No markdown here", + ] + dataset = list_to_dataset(docs) + modifier = Modify(MarkdownRemover()) + fixed_dataset = modifier(dataset) + expected_dataset = list_to_dataset(expected_results) + assert_eq(fixed_dataset.df, expected_dataset.df) diff --git a/tests/test_duplicates_removal.py b/tests/test_duplicates_removal.py new file mode 100644 index 00000000..3f406681 --- /dev/null +++ b/tests/test_duplicates_removal.py @@ -0,0 +1,208 @@ +from typing import Literal + +import pandas as pd +import pytest +from dask import dataframe as dd + +from nemo_curator.utils.duplicates_removal import remove_duplicates + + +@pytest.fixture() +def ids(): + # Dataset has id a0...a9, b0...b9, c0...c9, d0...d9 + l = [f"{group}{i}" for group in ["a", "b", "c", "d"] for i in range(10)] + return l + + +@pytest.fixture +def sample_data(ids): + df = pd.DataFrame( + { + "id": ids, + "text": [f"text for {_id}" for _id in ids], + } + ) + return dd.from_pandas(df, npartitions=4) + + +@pytest.fixture +def duplicate_data(ids): + # In each group we want to keep only the first occurrence (e.g. 
a1, b1, c1, d1) + df = pd.DataFrame([{"id": _id, "group": _id[0]} for _id in ids]) + return dd.from_pandas(df, npartitions=2) + + +@pytest.mark.parametrize( + "backend", + [ + "pandas", + pytest.param("cudf", marks=pytest.mark.gpu), + ], +) +@pytest.mark.parametrize("perform_shuffle", [False, True]) +def test_remove_duplicates_basic( + backend: Literal["cudf", "pandas"], + perform_shuffle: bool, + sample_data: dd.DataFrame, + duplicate_data: dd.DataFrame, +): + if perform_shuffle: + # We shuffle the data to make sure that duplicates are not in the same partition + duplicate_data = duplicate_data.sample(frac=1).reset_index(drop=True) + + sample_data = sample_data.to_backend(backend) + duplicate_data = duplicate_data.to_backend(backend) + + # Test basic duplicate removal functionality + result = remove_duplicates( + left=sample_data, + duplicates=duplicate_data, + id_field="id", + group_field="group", + perform_shuffle=perform_shuffle, + ).to_backend("pandas") + + result = result.compute() + + assert list(result.columns) == ["id", "text"] + assert len(result) == 4 + # It's not guaranteed that we'll have a0, b0, c0, d0 in the result + # So we should check the first character + assert set(result["id"].apply(lambda x: x[0]).tolist()) == set(["a", "b", "c", "d"]) + + +@pytest.mark.parametrize( + "backend", + [ + "pandas", + pytest.param("cudf", marks=pytest.mark.gpu), + ], +) +@pytest.mark.parametrize("perform_shuffle", [False, True]) +def test_remove_duplicates_all_duplicates( + backend: Literal["cudf", "pandas"], + perform_shuffle: bool, + ids: list[str], + sample_data: dd.DataFrame, +): + + duplicates = dd.from_pandas( + pd.DataFrame({"id": ids, "group": [1] * len(ids)}), npartitions=2 + ) + sample_data = sample_data.to_backend(backend) + duplicates = duplicates.to_backend(backend) + + result = remove_duplicates( + left=sample_data, + duplicates=duplicates, + id_field="id", + group_field="group", + perform_shuffle=perform_shuffle, + ).to_backend("pandas") + + assert list(result.columns) == ["id", "text"] + result = result.compute() + if perform_shuffle: + assert len(result) == 1 + else: + # If we don't shuffle, and both partitions have the same group + # in both partitions we'd be left with 1 row after "deduplication" + # and after the left-anti join we'd be left with 2 rows + assert len(result) == 2 + + +@pytest.mark.parametrize( + "backend", + [ + "pandas", + pytest.param("cudf", marks=pytest.mark.gpu), + ], +) +@pytest.mark.parametrize("perform_shuffle", [False, True]) +def test_not_remove_duplicates_unique( + backend: Literal["cudf", "pandas"], + perform_shuffle: bool, + ids: list[str], + sample_data: dd.DataFrame, +): + # We create a dataset where first 30 ids are in one group + # Next 9 ids are in distinct groups + # And last id is not mentioned in duplicates + + duplicates = dd.from_pandas( + pd.DataFrame( + { + "id": ids[:30] + ids[30:39], + "group": ["group0"] * 30 + [f"group{i}" for i in range(1, 10)], + } + ), + npartitions=2, + ) + sample_data = sample_data.to_backend(backend) + duplicates = duplicates.to_backend(backend) + if perform_shuffle: + # We shuffle the data to make sure that duplicates are not in the same partition + duplicates = duplicates.sample(frac=1, random_state=42).reset_index(drop=True) + + result = remove_duplicates( + left=sample_data, + duplicates=duplicates, + id_field="id", + group_field="group", + perform_shuffle=perform_shuffle, + ).to_backend("pandas") + + result = result.compute() + assert list(result.columns) == ["id", "text"] + if perform_shuffle: + # 
Since we've performed a shuffle, we know groups are collacated and there are 3 groups + # 1. 1 row from the first group of 30 + # 2. 9 rows from the 9 distinct groups + # 3. And 1 row from the last group which is not included in set of duplicates + assert len(result) == 1 + 9 + 1 + # The last 10 ids should be in the result, there would be one more from the first 30 + assert set(ids[30:]).issubset(set(result["id"].tolist())) + else: + # If we don't shuffle, we'de be left with 2 partitions both having rows from group 1 + assert len(result) == 2 + 9 + 1 + + +@pytest.mark.parametrize( + "backend", + [ + "pandas", + pytest.param("cudf", marks=pytest.mark.gpu), + ], +) +def test_remove_duplicates_raise_error( + backend: Literal["cudf", "pandas"], +): + # Create sample dataframes with specific partition counts + df1 = dd.from_pandas( + pd.DataFrame({"id": ["a1", "a2", "a3"], "text": ["text1", "text2", "text3"]}), + npartitions=2, + ) # dataset with 2 partitions + + duplicates = dd.from_pandas( + pd.DataFrame( + {"id": ["a1", "a2", "a3"], "group": ["group1", "group1", "group1"]} + ), + npartitions=3, + ) # duplicates dataset with 3 partitions + df1 = df1.to_backend(backend) + duplicates = duplicates.to_backend(backend) + + # Test that it raises ValueError when right npartitions are greater than left npartitions + with pytest.raises(ValueError) as exc_info: + remove_duplicates( + left=df1, + duplicates=duplicates, + id_field="id", + group_field="group", + ) + + expected_msg = ( + "The number of partitions in `left` is less than the number of partitions in the duplicates dataset. " + "This may lead to a shuffle join. Please re-read left and right with different partition sizes, or repartition left / right." + ) + assert str(exc_info.value) == expected_msg diff --git a/tests/test_exact_dedup.py b/tests/test_exact_dedup.py index d0408073..af2b0188 100644 --- a/tests/test_exact_dedup.py +++ b/tests/test_exact_dedup.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
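+# The rewritten test below exercises a two-step deduplication flow rather than a single call:
+#   duplicates = exact_dups.identify_duplicates(dataset)   # ids plus md5 hashes of duplicated docs
+#   deduplicated = exact_dups.remove(dataset, duplicates)  # keeps one document per duplicate group
+# Only what the assertions check is guaranteed here; any behaviour beyond that is an assumption.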
+from hashlib import md5 + import pandas as pd import pytest from dask import dataframe as dd -from dask.dataframe.utils import assert_eq from nemo_curator.datasets import DocumentDataset from nemo_curator.modules import ExactDuplicates @@ -47,7 +48,29 @@ def test_dup(self, exact_dedup_data, cache_result, tmpdir): hash_method="md5", cache_dir=tmpdir if cache_result else None, ) - result = exact_dups(exact_dedup_data) - expected_df = exact_dedup_data.df.compute() - expected_df = expected_df[expected_df.text.duplicated(keep=False)] - assert_eq(result.df.id, expected_df.id, check_index=False) + duplicates = exact_dups.identify_duplicates(exact_dedup_data) + deduplicated_ds = exact_dups.remove(exact_dedup_data, duplicates) + deduplicated_ids_series = deduplicated_ds.df.to_backend("pandas").compute()[ + "id" + ] + output_deduplicated_ids = set(deduplicated_ids_series.tolist()) + assert ( + len(output_deduplicated_ids) == 3 + and 300 in output_deduplicated_ids + and len({-1, 1}.intersection(output_deduplicated_ids)) == 1 + and len({2, 4}.intersection(output_deduplicated_ids)) == 1 + ) + + duplicates_df = ( + duplicates.df.to_backend("pandas") + .compute() + .sort_values(by="id", ignore_index=True) + ) + expected_df = pd.DataFrame( + { + "id": [1, -1] + [2, 4], + "_hashes": [md5(b"abc").hexdigest()] * 2 + + [md5(b"aba").hexdigest()] * 2, + } + ).sort_values(by="id", ignore_index=True) + pd.testing.assert_frame_equal(duplicates_df, expected_df, check_like=True) diff --git a/tests/test_filters.py b/tests/test_filters.py index 2f8bd00d..aa15a1f0 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -19,6 +19,7 @@ import pandas as pd import pytest from dask import dataframe as dd +from dask.dataframe.utils import assert_eq from nemo_curator.datasets import DocumentDataset from nemo_curator.datasets.parallel_dataset import ParallelDataset @@ -49,7 +50,9 @@ RepeatedParagraphsFilter, RepeatingDuplicateNGramsFilter, RepeatingTopNGramsFilter, + SubstringFilter, SymbolsToWordsFilter, + TokenCountFilter, TokenizerFertilityFilter, UrlsFilter, WhiteSpaceFilter, @@ -110,6 +113,13 @@ def keep_document(self, scores): return min_threshold & max_threshold +# A simple dummy tokenizer for our tests. +class DummyTokenizer: + def encode(self, text): + # Simply splits the text on whitespace. + return text.split() + + def all_equal(left_dataset, right_dataset): return all(left_dataset.df.compute() == right_dataset.df.compute()) @@ -767,6 +777,176 @@ def test_histogram(self): ), f"Expected {expected_data2} but got {filtered_data2}" +class TestTokenCountFilter: + def test_score_document(self): + tokenizer = DummyTokenizer() + token_filter = TokenCountFilter(tokenizer, min_tokens=2, max_tokens=3) + text = "another test case" # Should yield 3 tokens. + score = token_filter.score_document(text) + assert score == 3 + + def test_keep_document(self): + tokenizer = DummyTokenizer() + token_filter = TokenCountFilter(tokenizer, min_tokens=2, max_tokens=3) + # Check that a score of 1 (too few) and 4 (too many) are rejected, + # while scores of 2 and 3 are accepted. + assert token_filter.keep_document(2) + assert token_filter.keep_document(3) + assert not token_filter.keep_document(1) + assert not token_filter.keep_document(4) + + def test_filter_dataset(self): + # Create a dataset of documents with different word counts. 
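+        # (As DummyTokenizer shows, the filter only appears to require an ``encode`` method
+        # returning a sequence of tokens; whitespace-split words stand in for real tokens here.)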
+ docs = [ + "hello", # 1 token + "hello world", # 2 tokens + "this is a test", # 4 tokens + "another test case", # 3 tokens + ] + dataset = list_to_dataset(docs, col_name="text") + + tokenizer = DummyTokenizer() + token_filter = TokenCountFilter(tokenizer, min_tokens=2, max_tokens=3) + filter_step = ScoreFilter(token_filter, text_field="text") + filtered_dataset = filter_step(dataset) + # Reset indices for filtered dataset to ensure identical labeling for comparison. + filtered_dataset.df = filtered_dataset.df.reset_index(drop=True) + + # We expect to keep only the documents with exactly 2 or 3 tokens. + expected_docs = [ + "hello world", # 2 tokens + "another test case", # 3 tokens + ] + expected_dataset = list_to_dataset(expected_docs, col_name="text") + # Reset indices for expected dataset to ensure identical labeling. + expected_dataset.df = expected_dataset.df.reset_index(drop=True) + assert all_equal(expected_dataset, filtered_dataset) + + def test_filter_dataset_default(self): + # Create a dataset of documents with different word counts. + docs = [ + "hello", # 1 token + "hello world", # 2 tokens + "this is a test", # 4 tokens + "another test case", # 3 tokens + ] + dataset = list_to_dataset(docs, col_name="text") + + tokenizer = DummyTokenizer() + # Using default settings: min_tokens=0 and max_tokens=inf, so all documents pass. + token_filter = TokenCountFilter(tokenizer) + filter_step = ScoreFilter(token_filter, text_field="text") + filtered_dataset = filter_step(dataset) + + # We expect to keep all documents. + expected_dataset = list_to_dataset(docs, col_name="text") + assert all_equal(expected_dataset, filtered_dataset) + + +class TestSubstringFilter: + def test_invalid_position(self): + # Creating a SubstringFilter with an invalid position should raise a ValueError. + with pytest.raises(ValueError): + SubstringFilter("foo", "middle") + + def test_prefix_mode(self): + filter_prefix = SubstringFilter("Hello", "prefix") + # Positive example: text starts with "Hello". + text = "Hello world" + score = filter_prefix.score_document(text) + assert score == 1 + assert filter_prefix.keep_document(score) + # Negative example: text does not start with "Hello". + text2 = "world Hello" + score2 = filter_prefix.score_document(text2) + assert score2 == 0 + assert not filter_prefix.keep_document(score2) + + def test_suffix_mode(self): + filter_suffix = SubstringFilter("end", "suffix") + # Positive example: text ends with "end". + text = "This is the end" + score = filter_suffix.score_document(text) + assert score == 1 + assert filter_suffix.keep_document(score) + # Negative example: text does not end with "end". + text2 = "The end is near" + score2 = filter_suffix.score_document(text2) + assert score2 == 0 + assert not filter_suffix.keep_document(score2) + + def test_any_mode(self): + filter_any = SubstringFilter("test", "any") + # Positive example: text contains "test". + text = "this is a test string" + score = filter_any.score_document(text) + assert score == 1 + assert filter_any.keep_document(score) + # Negative example: text does not contain "test". 
+ text2 = "this is a string" + score2 = filter_any.score_document(text2) + assert score2 == 0 + assert not filter_any.keep_document(score2) + + def test_filter_dataset_prefix(self): + docs = ["Hello world", "world Hello", "Hello everyone", "Not matching"] + dataset = list_to_dataset(docs, col_name="text") + filter_prefix = SubstringFilter("Hello", "prefix") + filter_step = ScoreFilter(filter_prefix, text_field="text") + filtered_dataset = filter_step(dataset) + + # Expect only those records where the text starts with "Hello". + expected_docs = ["Hello world", "Hello everyone"] + expected_dataset = list_to_dataset(expected_docs, col_name="text") + + # Reset indices to ensure both DataFrames are identically labeled + filtered_dataset = DocumentDataset(filtered_dataset.df.reset_index(drop=True)) + expected_dataset = DocumentDataset(expected_dataset.df.reset_index(drop=True)) + assert all_equal(expected_dataset, filtered_dataset) + + def test_filter_dataset_suffix(self): + docs = [ + "This is the end", # ends with "end" + "end of story", # does not end with "end" + "ending is good", # does not end with "end" + "Not matching end", # ends with "end" + "The end", # ends with "end" + ] + dataset = list_to_dataset(docs, col_name="text") + filter_suffix = SubstringFilter("end", "suffix") + filter_step = ScoreFilter(filter_suffix, text_field="text") + filtered_dataset = filter_step(dataset) + + # Expect only those records that end with "end". + expected_docs = [ + "Not matching end", + "The end", + "This is the end", + ] + expected_dataset = list_to_dataset(expected_docs, col_name="text") + + # Compare only the 'text' column values to avoid index label issues. + filtered_dataset = DocumentDataset(filtered_dataset.df.reset_index(drop=True)) + expected_dataset = DocumentDataset(expected_dataset.df.reset_index(drop=True)) + assert_eq(expected_dataset.df["text"], filtered_dataset.df["text"]) + + def test_filter_dataset_any(self): + docs = ["test case", "This is a testcase", "no match here", "another test"] + dataset = list_to_dataset(docs, col_name="text") + filter_any = SubstringFilter("test", "any") + filter_step = ScoreFilter(filter_any, text_field="text") + filtered_dataset = filter_step(dataset) + + # Expect documents that contain "test" anywhere. 
+ expected_docs = ["test case", "This is a testcase", "another test"] + expected_dataset = list_to_dataset(expected_docs, col_name="text") + + # Reset indices to ensure both DataFrames are identically labeled + filtered_dataset = DocumentDataset(filtered_dataset.df.reset_index(drop=True)) + expected_dataset = DocumentDataset(expected_dataset.df.reset_index(drop=True)) + assert all_equal(expected_dataset, filtered_dataset) + + class TestCodeFilters: def test_python_comment_to_code(self): doc_1 = "# Good code\nprint('hello world')" diff --git a/tests/test_fuzzy_dedup.py b/tests/test_fuzzy_dedup.py index 13f03766..81ed1cb1 100644 --- a/tests/test_fuzzy_dedup.py +++ b/tests/test_fuzzy_dedup.py @@ -22,16 +22,14 @@ import yaml from dask import config from dask.dataframe.utils import assert_eq -from distributed import Client from nemo_curator import LSH, FuzzyDuplicates, FuzzyDuplicatesConfig, MinHash from nemo_curator.datasets import DocumentDataset from nemo_curator.utils.fuzzy_dedup_utils.merge_utils import extract_partitioning_index -from nemo_curator.utils.import_utils import gpu_only_import, gpu_only_import_from +from nemo_curator.utils.import_utils import gpu_only_import cudf = gpu_only_import("cudf") dask_cudf = gpu_only_import("dask_cudf") -LocalCUDACluster = gpu_only_import_from("dask_cuda", "LocalCUDACluster") @pytest.fixture @@ -303,13 +301,6 @@ def test_partial_overlap(self, tmpdir, false_positive_check): @pytest.mark.gpu class TestFuzzyDuplicates: - @pytest.fixture(autouse=True, scope="class") - def gpu_client(self, request): - with LocalCUDACluster(n_workers=1) as cluster, Client(cluster) as client: - request.cls.client = client - request.cls.cluster = cluster - yield - @pytest.mark.parametrize("use_64_bit_hash", [False, True]) @pytest.mark.parametrize( "num_buckets,jaccard_threshold,duplicate_docs", @@ -328,8 +319,9 @@ def test_fuzzy_dedup( jaccard_threshold, duplicate_docs, tmpdir, + gpu_client, ): - print(self.client) + print(gpu_client) # Dedup might fail when indices per partition do not start from 0 fuzzy_dedup_data.df = fuzzy_dedup_data.df.reset_index(drop=True) config = FuzzyDuplicatesConfig( @@ -347,7 +339,7 @@ def test_fuzzy_dedup( jaccard_threshold=jaccard_threshold, ) fuzzy_duplicates = FuzzyDuplicates(config=config) - result = fuzzy_duplicates(fuzzy_dedup_data) + result = fuzzy_duplicates.identify_duplicates(fuzzy_dedup_data) result_df = result.df.compute() # Drop non duplicated docs result_df = result_df[result_df.group.duplicated(keep=False)] @@ -378,27 +370,39 @@ def test_different_fields(self, fuzzy_dedup_data, tmpdir): char_ngrams=5, ) fuzzy_duplicates = FuzzyDuplicates(config=config) - result = fuzzy_duplicates(fuzzy_dedup_data) - result_df = result.df.compute() + duplicates = fuzzy_duplicates.identify_duplicates(fuzzy_dedup_data) + deduplicated_ds = fuzzy_duplicates.remove(fuzzy_dedup_data, duplicates) + deduplicated_df = deduplicated_ds.df.compute() + output_deduplicated_ids = set(deduplicated_df["col0"].to_arrow().to_pylist()) + assert len(deduplicated_df) == 3 + # From each of our groups we'll have atmost one document that is not duplicated + assert ( + 300 in output_deduplicated_ids + and len({-1, 4}.intersection(output_deduplicated_ids)) == 1 + and len({1, 2}.intersection(output_deduplicated_ids)) == 1 + ) + # Drop non duplicated docs - result_df = result_df[result_df.group.duplicated(keep=False)] - result_df = result_df.groupby("group")["col0"].agg(list) + duplicates_df = duplicates.df.compute() + duplicates_df = 
duplicates_df[duplicates_df.group.duplicated(keep=False)] + duplicates_df = duplicates_df.groupby("group")["col0"].agg(list) # Sort to maintain uniform ordering - result_df = result_df.list.sort_values() - result_df = result_df.sort_values() + duplicates_df = duplicates_df.list.sort_values() + duplicates_df = duplicates_df.sort_values() duplicate_docs = [[4, -1], [1, 2]] expected_df = cudf.Series(duplicate_docs, name="col0") expected_df = expected_df.list.sort_values() expected_df = expected_df.sort_values() - assert_eq(expected_df, result_df, check_index=False) + assert_eq(expected_df, duplicates_df, check_index=False) @pytest.mark.xfail def test_non_uniform_indices( self, tmpdir, + gpu_client, ): - print(self.client) + print(gpu_client) # Dedup might fail when indices per partition do not start from 0 df = cudf.DataFrame( { @@ -430,19 +434,29 @@ def test_non_uniform_indices( jaccard_threshold=0.39, ) fuzzy_duplicates = FuzzyDuplicates(config=config) - result = fuzzy_duplicates(data) - result_df = result.df.compute() + duplicates = fuzzy_duplicates.identify_duplicates(data) + deduplicated_ds = fuzzy_duplicates.remove(fuzzy_dedup_data, duplicates) + deduplicated_df = deduplicated_ds.df.compute() + output_deduplicated_ids = set(deduplicated_df["col0"].to_arrow().to_pylist()) + assert len(deduplicated_df) == 2 + # From each of our groups we'll have atmost one document that is not duplicated + assert ( + len({4, -1}.intersection(output_deduplicated_ids)) == 1 + and len({1, 2, 300}.intersection(output_deduplicated_ids)) == 1 + ) + + duplicates_df = duplicates.df.compute() # Drop non duplicated docs - result_df = result_df[result_df.group.duplicated(keep=False)] - result_df = result_df.groupby("group").id.agg(list) + duplicates_df = duplicates_df[duplicates_df.group.duplicated(keep=False)] + duplicates_df = duplicates_df.groupby("group").id.agg(list) # Sort to maintain uniform ordering - result_df = result_df.list.sort_values() - result_df = result_df.sort_values() + duplicates_df = duplicates_df.list.sort_values() + duplicates_df = duplicates_df.sort_values() expected_df = cudf.Series(duplicate_docs, name="id") expected_df = expected_df.list.sort_values() expected_df = expected_df.sort_values() - assert_eq(expected_df, result_df, check_index=False) + assert_eq(expected_df, duplicates_df, check_index=False) @pytest.mark.parametrize("num_anchors", [1, 3, 10]) def test_num_anchors(self, large_fuzzy_dedup_data, num_anchors, tmpdir): @@ -477,7 +491,13 @@ def test_num_anchors(self, large_fuzzy_dedup_data, num_anchors, tmpdir): ], ) def test_no_fp_check( - self, fuzzy_dedup_data, use_64_bit_hash, num_buckets, duplicate_docs, tmpdir + self, + fuzzy_dedup_data, + use_64_bit_hash, + num_buckets, + duplicate_docs, + tmpdir, + gpu_client, ): config = FuzzyDuplicatesConfig( cache_dir=tmpdir, @@ -494,7 +514,7 @@ def test_no_fp_check( jaccard_threshold=0.39, ) fuzzy_duplicates = FuzzyDuplicates(config=config) - result = fuzzy_duplicates(fuzzy_dedup_data) + result = fuzzy_duplicates.identify_duplicates(fuzzy_dedup_data) result_df = result.df.compute() # Drop non duplicated docs result_df = result_df[result_df.group.duplicated(keep=False)] @@ -512,6 +532,7 @@ def test_shuffle_fail_fuzzy_dedup_data( self, shuffle_fail_fuzzy_dedup_data, tmpdir, + gpu_client, ): # Dedup might fail when indices per partition do not start from 0 shuffle_fail_fuzzy_dedup_data.df = shuffle_fail_fuzzy_dedup_data.df.reset_index( @@ -532,7 +553,7 @@ def test_shuffle_fail_fuzzy_dedup_data( jaccard_threshold=0.39, ) fuzzy_duplicates = 
FuzzyDuplicates(config=config) - result = fuzzy_duplicates(shuffle_fail_fuzzy_dedup_data) + result = fuzzy_duplicates.identify_duplicates(shuffle_fail_fuzzy_dedup_data) result_df = result.df.compute() # Drop non duplicated docs result_df = result_df[result_df.group.duplicated(keep=False)] @@ -548,7 +569,7 @@ def test_shuffle_fail_fuzzy_dedup_data( @pytest.mark.parametrize("false_positive_check", [True, False]) def test_fuzzy_dedup_no_duplicates( - self, no_duplicates_fuzzy_dedup_data, tmpdir, false_positive_check + self, no_duplicates_fuzzy_dedup_data, tmpdir, false_positive_check, gpu_client ): # Dedup might fail when indices per partition do not start from 0 no_duplicates_fuzzy_dedup_data.df = ( @@ -569,7 +590,7 @@ def test_fuzzy_dedup_no_duplicates( jaccard_threshold=0.39, ) fuzzy_duplicates = FuzzyDuplicates(config=config) - result = fuzzy_duplicates(no_duplicates_fuzzy_dedup_data) + result = fuzzy_duplicates.identify_duplicates(no_duplicates_fuzzy_dedup_data) assert result is None diff --git a/tests/test_joiner.py b/tests/test_joiner.py new file mode 100644 index 00000000..844beb1c --- /dev/null +++ b/tests/test_joiner.py @@ -0,0 +1,176 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pandas as pd +from dask.dataframe.utils import assert_eq + +from nemo_curator import DocumentJoiner +from nemo_curator.datasets import DocumentDataset + + +class TestDocumentJoiner: + def test_join_default(self): + # Input represents documents already split. + # For example, a document with id=1 split as "a", "b", "c" becomes joined to "a|b|c". + # Four documents are used. + data = { + "id": [1, 1, 1, 2, 3, 3, 4, 4], + "text": ["a", "b", "c", "nosplit", "start", "middle", "end", ""], + "segment_id": [0, 1, 2, 0, 0, 1, 0, 1], + } + pdf = pd.DataFrame(data) + dataset = DocumentDataset.from_pandas(pdf, npartitions=1) + joiner = DocumentJoiner( + separator="|", + text_field="text", + segment_id_field="segment_id", + document_id_field="id", + drop_segment_id_field=True, + ) + result_dataset = joiner(dataset) + + expected_df = pd.DataFrame( + {"id": [1, 2, 3, 4], "text": ["a|b|c", "nosplit", "start|middle", "end|"]} + ) + assert_eq( + result_dataset.df.compute().reset_index(drop=True), + expected_df, + check_index=False, + ) + + def test_join_custom_fields(self): + # Use custom field names: + # document id field: "doc" + # text field: "content" + # segment id field: "s_id" + # Also keep the segment id field (drop_segment_id_field=False) + data = { + "doc": [101, 101, 102, 103, 103, 104, 104], + "content": ["first", "second", "only", "hello", "world", "baz", ""], + "s_id": [0, 1, 0, 0, 1, 0, 1], + } + pdf = pd.DataFrame(data) + dataset = DocumentDataset.from_pandas(pdf, npartitions=1) + joiner = DocumentJoiner( + separator="~", + text_field="content", + segment_id_field="s_id", + document_id_field="doc", + drop_segment_id_field=False, + ) + result_dataset = joiner(dataset) + + # Expected: each document is joined by "~". 
The segment id becomes the first segment's id. + expected_df = pd.DataFrame( + { + "doc": [101, 102, 103, 104], + "content": ["first~second", "only", "hello~world", "baz~"], + "s_id": [0, 0, 0, 0], + } + ) + assert_eq( + result_dataset.df.compute().reset_index(drop=True), + expected_df, + check_index=False, + ) + + def test_join_max_length(self): + # Here we test joining when a maximum length is specified. + # Each segment carries a precomputed "length" value. + # The joiner should accumulate segments until adding the next one (plus separator) + # would exceed max_length=5. + # + # For document 1: + # segments: "ab"(2), "cd"(2), "ef"(2), "gh"(2) + # - "ab" then "cd": 2+2+1 = 5 → join as "ab-cd" (length 5) + # - then "ef" then "gh": 2+2+1 = 5 → join as "ef-gh" (length 5) + # + # For document 2: + # segments: "a"(1), "b"(1) → join as "a-b" (length 3) + # + # For document 3: + # segment: "hello"(5) → remains "hello" + # + # For document 4: + # segments: "x"(1), "yz"(2), "0"(1) + # - "x" then "yz": 1+2+1 = 4 → "x-yz" (length 4) + # - "0" remains alone. + data = { + "id": [1, 1, 1, 1, 2, 2, 3, 4, 4, 4], + "text": ["ab", "cd", "ef", "gh", "a", "b", "hello", "x", "yz", "0"], + "segment_id": [0, 1, 2, 3, 0, 1, 0, 0, 1, 2], + "length": [2, 2, 2, 2, 1, 1, 5, 1, 2, 1], + } + pdf = pd.DataFrame(data) + dataset = DocumentDataset.from_pandas(pdf, npartitions=1) + joiner = DocumentJoiner( + separator="-", + text_field="text", + segment_id_field="segment_id", + document_id_field="id", + drop_segment_id_field=True, + max_length=5, + length_field="length", + ) + result_dataset = joiner(dataset) + + expected_df = pd.DataFrame( + [ + {"id": 1, "text": "ab-cd", "length": 5}, + {"id": 1, "text": "ef-gh", "length": 5}, + {"id": 2, "text": "a-b", "length": 3}, + {"id": 3, "text": "hello", "length": 5}, + {"id": 4, "text": "x-yz", "length": 4}, + {"id": 4, "text": "0", "length": 1}, + ] + ) + # Sort by id and text to ensure consistent order + expected_sorted = expected_df.sort_values(by=["id", "text"]).reset_index( + drop=True + ) + result_sorted = ( + result_dataset.df.compute() + .sort_values(by=["id", "text"]) + .reset_index(drop=True) + ) + assert_eq(result_sorted, expected_sorted, check_index=False) + + def test_join_with_string_ids(self): + # Test join functionality when document id field is a string. + data = { + "doc": ["doc1", "doc1", "doc2", "doc3", "doc3", "doc4", "doc4"], + "text": ["a", "b", "nosplit", "start", "middle", "end", ""], + "segment_id": [0, 1, 0, 0, 1, 0, 1], + } + pdf = pd.DataFrame(data) + dataset = DocumentDataset.from_pandas(pdf, npartitions=1) + joiner = DocumentJoiner( + separator="|", + text_field="text", + segment_id_field="segment_id", + document_id_field="doc", + drop_segment_id_field=True, + ) + result_dataset = joiner(dataset) + + expected_df = pd.DataFrame( + { + "doc": ["doc1", "doc2", "doc3", "doc4"], + "text": ["a|b", "nosplit", "start|middle", "end|"], + } + ) + assert_eq( + result_dataset.df.compute().reset_index(drop=True), + expected_df, + check_index=False, + ) diff --git a/tests/test_nemotron_cc.py b/tests/test_nemotron_cc.py new file mode 100644 index 00000000..a267779e --- /dev/null +++ b/tests/test_nemotron_cc.py @@ -0,0 +1,287 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import random + +import dask.dataframe as dd +import pandas as pd +import pytest + +from nemo_curator.datasets import DocumentDataset +from nemo_curator.synthetic.nemotron_cc import ( + NemotronCCDiverseQAPostprocessor, + NemotronCCKnowledgeListPostprocessor, +) + + +# A dummy tokenizer that simply splits text by whitespace. +class DummyTokenizer: + def tokenize(self, text): + return text.split() + + +# Helper function to create a DocumentDataset from provided data. +def create_dataset(data): + pdf = pd.DataFrame(data) + return DocumentDataset.from_pandas(pdf) + + +class TestDiverseQAPostprocessor: + def test_valid_response_without_tokenizer(self, monkeypatch): + # Patch randomness so that the ordering and sampling is deterministic. + monkeypatch.setattr(random, "shuffle", lambda x: None) + # In the branch without a tokenizer, random.randint(1, max_num_pairs) + # will be forced to return the upper bound. + monkeypatch.setattr(random, "randint", lambda lo, hi: hi) + + text = "Document text" + llm_response = ( + "Here are the questions and answers based on the provided text:\n" + "- Question: What is this?\n" + "Answer: It is a test.\n" + "- Question: How does it work?\n" + "Answer: By magic." + ) + # Create a dataset with one row containing both the document and the LLM response. + ds = create_dataset({"text": [text], "response": [llm_response]}) + + # Use no tokenizer so that the branch using max_num_pairs (here, 2) is used. + processor = NemotronCCDiverseQAPostprocessor(tokenizer=None, max_num_pairs=2) + result_ds = processor(ds) + result_df = result_ds.df.compute() + + # Expected processing: + # 1. Split into lines and remove the leading "- " prefix. + # 2. Remove the prefix line ("Here are...") if it matches. + # 3. Merge lines: the first QA pair becomes: + # "Question: What is this?\nAnswer: It is a test." + # and the second: + # "Question: How does it work?\nAnswer: By magic." + # 4. With our patched randint, both QA pairs are kept. + expected_qa = ( + "Question: What is this?\nAnswer: It is a test.\n\n" + "Question: How does it work?\nAnswer: By magic." + ) + expected_response = f"{text}\n\n{expected_qa}" + + assert not result_df.empty, "Expected non-empty dataset" + actual_response = result_df.iloc[0]["response"] + assert ( + actual_response == expected_response + ), f"Expected: {expected_response}, got: {actual_response}" + + def test_valid_response_with_tokenizer(self, monkeypatch): + # Using a dummy tokenizer. + dummy_tokenizer = DummyTokenizer() + monkeypatch.setattr(random, "shuffle", lambda x: None) + # For the branch with a tokenizer, the number of tokens is determined by: + # num_tokens = len(dummy_tokenizer.tokenize(text)). For "Document text" this yields 2. + # Then max_num = max(1, int(max_num_pairs * num_tokens / 150)) becomes max(1, int(4/150)) -> 1. + monkeypatch.setattr(random, "randint", lambda lo, hi: hi) + + text = "Document text" + llm_response = ( + "Here are the questions and answers based on the provided text:\n" + "- Question: What is this?\n" + "Answer: It is a test.\n" + "- Question: How does it work?\n" + "Answer: By magic." 
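+            # Two QA pairs are provided deliberately; the tokenizer-based cap below is
+            # expected to keep only the first pair.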
+ ) + ds = create_dataset({"text": [text], "response": [llm_response]}) + processor = NemotronCCDiverseQAPostprocessor( + tokenizer=dummy_tokenizer, max_num_pairs=2 + ) + result_ds = processor(ds) + result_df = result_ds.df.compute() + + # In the tokenizer branch only one QA pair is selected (the first one). + expected_qa = "Question: What is this?\nAnswer: It is a test." + expected_response = f"{text}\n\n{expected_qa}" + + assert not result_df.empty, "Expected non-empty dataset" + actual_response = result_df.iloc[0]["response"] + assert ( + actual_response == expected_response + ), f"Expected: {expected_response}, got: {actual_response}" + + def test_invalid_response_format(self, monkeypatch): + # Test a response with an invalid QA format (missing a "Question:" line). + monkeypatch.setattr(random, "shuffle", lambda x: None) + monkeypatch.setattr(random, "randint", lambda lo, hi: hi) + + text = "Doc" + # The response only has an answer line. + llm_response = ( + "Here are the questions and answers based on the provided text:\n" + "- Answer: Missing question." + ) + ds = create_dataset({"text": [text], "response": [llm_response]}) + processor = NemotronCCDiverseQAPostprocessor(tokenizer=None, max_num_pairs=2) + result_ds = processor(ds) + result_df = result_ds.df.compute() + + # Since the response format is invalid (no "Question:" to start a QA pair), + # the postprocessing should return an empty string; the __call__ method then + # drops that row. + assert ( + result_df.empty + ), "Expected dataset to be empty due to invalid response format" + + def test_empty_response(self): + # Test when the LLM response is empty. + text = "Doc" + llm_response = "" + ds = create_dataset({"text": [text], "response": [llm_response]}) + processor = NemotronCCDiverseQAPostprocessor(tokenizer=None, max_num_pairs=2) + result_ds = processor(ds) + result_df = result_ds.df.compute() + + # The empty LLM response should lead to an empty processed text and get filtered out. + assert result_df.empty, "Expected dataset to be empty for an empty LLM response" + + def test_more_qa_than_max(self, monkeypatch): + # Test when there are more QA pairs than max_num_pairs. + monkeypatch.setattr(random, "shuffle", lambda x: None) + monkeypatch.setattr(random, "randint", lambda lo, hi: hi) + + text = "Document text" + llm_response = ( + "Here are the questions and answers based on the provided text:\n" + "- Question: Q1?\n" + "Answer: A1.\n" + "- Question: Q2?\n" + "Answer: A2.\n" + "- Question: Q3?\n" + "Answer: A3.\n" + "- Question: Q4?\n" + "Answer: A4." + ) + ds = create_dataset({"text": [text], "response": [llm_response]}) + processor = NemotronCCDiverseQAPostprocessor(tokenizer=None, max_num_pairs=2) + result_ds = processor(ds) + result_df = result_ds.df.compute() + + # With max_num_pairs set to 2 and patched randint returning the upper bound, + # only the first two QA pairs should be selected. + expected_qa = "Question: Q1?\nAnswer: A1.\n\n" "Question: Q2?\nAnswer: A2." + expected_response = f"{text}\n\n{expected_qa}" + + assert not result_df.empty, "Expected non-empty dataset" + actual_response = result_df.iloc[0]["response"] + assert ( + actual_response == expected_response + ), f"Expected: {expected_response}, got: {actual_response}" + + +class TestKnowledgeListPostprocessor: + def test_basic_formatting(self): + # Test that a response with an initial non-bullet line (to skip) and bullet lines + # is correctly cleaned. 
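+        # Bullet items start with "- " and continuation lines with two spaces; the
+        # postprocessor should strip both two-character prefixes.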
+ input_response = ( + "Not a bullet line to skip\n" + "- Fact one: This is the first fact.\n" + " Continued fact one.\n" + "- Fact two: This is the second fact." + ) + ds = create_dataset({"text": [input_response]}) + processor = NemotronCCKnowledgeListPostprocessor(text_field="text") + result_ds = processor(ds) + result_df = result_ds.df.compute() + + # Expected: + # - First line is skipped (since it does not start with "-"). + # - Bullet lines have the leading "- " or " " removed. + expected_output = ( + "Fact one: This is the first fact.\n" + "Continued fact one.\n" + "Fact two: This is the second fact." + ) + actual_output = result_df.iloc[0]["text"] + assert ( + actual_output == expected_output + ), f"Expected: {expected_output}, got: {actual_output}" + + def test_all_bullet_lines(self): + # Test when every line starts with a bullet prefix. + input_response = "- Item one\n" "- Item two\n" "- Item three" + ds = create_dataset({"text": [input_response]}) + processor = NemotronCCKnowledgeListPostprocessor(text_field="text") + result_ds = processor(ds) + result_df = result_ds.df.compute() + + # Each line should be cleaned by removing the leading bullet. + expected_output = "Item one\nItem two\nItem three" + actual_output = result_df.iloc[0]["text"] + assert ( + actual_output == expected_output + ), f"Expected: {expected_output}, got: {actual_output}" + + def test_no_bullet_lines(self): + # If the response contains no bullet lines, then the first line is + # skipped and no text remains. + input_response = "This is just plain text without any bullet." + ds = create_dataset({"text": [input_response]}) + processor = NemotronCCKnowledgeListPostprocessor(text_field="text") + result_ds = processor(ds) + result_df = result_ds.df.compute() + + expected_output = "" + actual_output = result_df.iloc[0]["text"] + assert ( + actual_output == expected_output + ), f"Expected an empty string, got: {actual_output}" + + def test_mixed_indentation(self): + # Test mixed bullet prefixes and additional non-bullet lines. + input_response = ( + "- Bullet one\n" + "Some extra text\n" + " Indented line\n" + "- Bullet two\n" + " Continuation of bullet two\n" + "Another standalone line" + ) + ds = create_dataset({"text": [input_response]}) + processor = NemotronCCKnowledgeListPostprocessor(text_field="text") + result_ds = processor(ds) + result_df = result_ds.df.compute() + + # Note: Only the very first line is conditionally skipped if it doesn't start with '-'. + # Here, since the first line starts with "-", nothing is skipped. + # Each line that starts with "- " or " " should have those two characters removed. + expected_output = ( + "Bullet one\n" + "Some extra text\n" + "Indented line\n" + "Bullet two\n" + "Continuation of bullet two\n" + "Another standalone line" + ) + actual_output = result_df.iloc[0]["text"] + assert ( + actual_output == expected_output + ), f"Expected: {expected_output}, got: {actual_output}" + + def test_empty_input(self): + # Test that an empty input returns an empty string. 
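+        # Unlike the DiverseQA postprocessor above, an empty input is not dropped here;
+        # the row is kept and its text simply remains an empty string.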
+ input_response = "" + ds = create_dataset({"text": [input_response]}) + processor = NemotronCCKnowledgeListPostprocessor(text_field="text") + result_ds = processor(ds) + result_df = result_ds.df.compute() + + expected_output = "" + actual_output = result_df.iloc[0]["text"] + assert ( + actual_output == expected_output + ), f"Expected empty string, got: {actual_output}" diff --git a/tests/test_semdedup.py b/tests/test_semdedup.py index 5646311f..e35f0f5d 100644 --- a/tests/test_semdedup.py +++ b/tests/test_semdedup.py @@ -27,7 +27,6 @@ cudf = gpu_only_import("cudf") dask_cudf = gpu_only_import("dask_cudf") -LocalCUDACluster = gpu_only_import_from("dask_cuda", "LocalCUDACluster") EmbeddingCreator = gpu_only_import_from( "nemo_curator.modules.semantic_dedup.embeddings", "EmbeddingCreator" ) @@ -55,19 +54,13 @@ def dedup_data(): @pytest.mark.gpu class TestSemDuplicates: - @pytest.fixture(autouse=True, scope="class") - def gpu_client(self, request): - with LocalCUDACluster(n_workers=1) as cluster, Client(cluster) as client: - request.cls.client = client - request.cls.cluster = cluster - yield - def test_sem_dedup( self, dedup_data, tmpdir, + gpu_client, ): - print("client", self.client) + print("client", gpu_client) cache_dir = os.path.join(tmpdir, "test_sem_dedup_cache") config = SemDedupConfig( cache_dir=cache_dir, diff --git a/tests/test_splitter.py b/tests/test_splitter.py new file mode 100644 index 00000000..6d987b86 --- /dev/null +++ b/tests/test_splitter.py @@ -0,0 +1,104 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pandas as pd +import pytest +from dask.dataframe.utils import assert_eq + +from nemo_curator import DocumentSplitter, ToBackend +from nemo_curator.datasets import DocumentDataset + + +class TestDocumentSplitter: + @pytest.mark.parametrize( + "backend", ["pandas", pytest.param("cudf", marks=pytest.mark.gpu)] + ) + def test_basic_split_default(self, backend): + # Use default text_field "text" and segment_id_field "segment_id" + # Four examples: + # "a|b|c" → splits to ["a", "b", "c"] + # "nosplit" → ["nosplit"] + # "start|middle" → ["start", "middle"] + # "end|" → ["end", ""] + docs = ["a|b|c", "nosplit", "start|middle", "end|"] + pdf = pd.DataFrame({"text": docs}) + dataset = DocumentDataset.from_pandas(pdf, npartitions=1) + to_backend = ToBackend(backend) + dataset = to_backend(dataset) + + splitter = DocumentSplitter(separator="|") + result_dataset = splitter(dataset) + + result_df = result_dataset.df.compute() + if backend == "cudf": + result_df = result_df.to_pandas() + + expected_df = pd.DataFrame( + { + "text": ["a", "b", "c", "nosplit", "start", "middle", "end", ""], + "segment_id": [0, 1, 2, 0, 0, 1, 0, 1], + } + ) + # Compare without considering the index order. 
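+        # After reset_index the frames are compared positionally; check_index=False
+        # only skips the comparison of index labels.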
+ assert_eq( + result_df.reset_index(drop=True), + expected_df.reset_index(drop=True), + check_index=False, + ) + + @pytest.mark.parametrize( + "backend", ["pandas", pytest.param("cudf", marks=pytest.mark.gpu)] + ) + def test_split_custom_fields(self, backend): + # Use a custom text field name ("content") and segment id field ("seg_id") + # with a different separator. + # Examples: + # "x;y" → ["x", "y"] + # "single" → ["single"] + # "first;second;third" → ["first", "second", "third"] + # ";leading" → ["", "leading"] + docs = ["x;y", "single", "first;second;third", ";leading"] + pdf = pd.DataFrame({"content": docs}) + dataset = DocumentDataset.from_pandas(pdf, npartitions=1) + to_backend = ToBackend(backend) + dataset = to_backend(dataset) + + splitter = DocumentSplitter( + separator=";", text_field="content", segment_id_field="seg_id" + ) + result_dataset = splitter(dataset) + + result_df = result_dataset.df.compute() + if backend == "cudf": + result_df = result_df.to_pandas() + + expected_df = pd.DataFrame( + { + "content": [ + "x", + "y", + "single", + "first", + "second", + "third", + "", + "leading", + ], + "seg_id": [0, 1, 0, 0, 1, 2, 0, 1], + } + ) + assert_eq( + result_df.reset_index(drop=True), + expected_df.reset_index(drop=True), + check_index=False, + ) diff --git a/tests/test_splitter_joiner.py b/tests/test_splitter_joiner.py new file mode 100644 index 00000000..9b7b5cec --- /dev/null +++ b/tests/test_splitter_joiner.py @@ -0,0 +1,56 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pandas as pd +from dask.dataframe.utils import assert_eq + +from nemo_curator import DocumentJoiner, DocumentSplitter +from nemo_curator.datasets import DocumentDataset + + +class TestSplitJoinReconstruction: + def test_reconstruction_default(self): + # Create an original dataset with a unique "id" column and text examples. + # Four examples include edge cases: + # "a|b|c" → multiple splits + # "nosplit" → no separator present + # "a||b|" → consecutive separators yield empty strings + # "" → empty document + docs = ["a|b|c", "nosplit", "a||b|", ""] + pdf = pd.DataFrame({"id": [1, 2, 3, 4], "text": docs}) + original_dataset = DocumentDataset.from_pandas(pdf, npartitions=1) + + # First, split using "|" as separator. + splitter = DocumentSplitter(separator="|") + split_dataset = splitter(original_dataset) + + # Then, rejoin using the same separator. + joiner = DocumentJoiner( + separator="|", + text_field="text", + segment_id_field="segment_id", + document_id_field="id", + drop_segment_id_field=True, + ) + reconstructed_dataset = joiner(split_dataset) + + # The reconstructed "text" column should match the original. 
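+        # Sort both frames by "id" so the comparison does not depend on row order.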
+ original_sorted = ( + original_dataset.df.compute().sort_values(by="id").reset_index(drop=True) + ) + reconstructed_sorted = ( + reconstructed_dataset.df.compute() + .sort_values(by="id") + .reset_index(drop=True) + ) + assert_eq(reconstructed_sorted, original_sorted, check_index=False) diff --git a/tutorials/dapt-curation/code/utils.py b/tutorials/dapt-curation/code/utils.py index 86b17b50..ab1b00a0 100755 --- a/tutorials/dapt-curation/code/utils.py +++ b/tutorials/dapt-curation/code/utils.py @@ -300,14 +300,12 @@ def fuzzy_dedupe(dataset: DocumentDataset, cache: str) -> DocumentDataset: id_field="id", text_field="text", seed=42, - char_ngrams=20, + char_ngrams=24, num_buckets=20, hashes_per_bucket=13, use_64_bit_hash=False, buckets_per_shuffle=5, false_positive_check=False, - num_anchors=2, - jaccard_threshold=0.8, ) fuzzy_dup = FuzzyDuplicates(config=fuzzy_dedup_config) duplicates = fuzzy_dup(dataset) diff --git a/tutorials/distributed_data_classification/README.md b/tutorials/distributed_data_classification/README.md index f953d8f5..e5e1f9b1 100644 --- a/tutorials/distributed_data_classification/README.md +++ b/tutorials/distributed_data_classification/README.md @@ -18,6 +18,8 @@ Before running any of these notebooks, please see this [Getting Started](https:/ | `ContentTypeClassifier` | [nvidia/content-type-classifier-deberta](https://huggingface.co/nvidia/content-type-classifier-deberta) | | `DomainClassifier` | [nvidia/domain-classifier](https://huggingface.co/nvidia/domain-classifier) | | `FineWebEduClassifier` | [HuggingFaceFW/fineweb-edu-classifier](https://huggingface.co/HuggingFaceFW/fineweb-edu-classifier) | +| `FineWebMixtralEduClassifier` | [nvidia/nemocurator-fineweb-mixtral-edu-classifier](https://huggingface.co/nvidia/nemocurator-fineweb-mixtral-edu-classifier) | +| `FineWebNemotronEduClassifier` | [nvidia/nemocurator-fineweb-nemotron-4-edu-classifier](https://huggingface.co/nvidia/nemocurator-fineweb-nemotron-4-edu-classifier) | | `InstructionDataGuardClassifier` | [nvidia/instruction-data-guard](https://huggingface.co/nvidia/instruction-data-guard) | | `MultilingualDomainClassifier` | [nvidia/multilingual-domain-classifier](https://huggingface.co/nvidia/multilingual-domain-classifier) | | `PromptTaskComplexityClassifier` | [nvidia/prompt-task-and-complexity-classifier](https://huggingface.co/nvidia/prompt-task-and-complexity-classifier) | diff --git a/tutorials/distributed_data_classification/fineweb-mixtral-edu-classification.ipynb b/tutorials/distributed_data_classification/fineweb-mixtral-edu-classification.ipynb new file mode 100644 index 00000000..ed15455a --- /dev/null +++ b/tutorials/distributed_data_classification/fineweb-mixtral-edu-classification.ipynb @@ -0,0 +1,323 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distributed Data Classification with NeMo Curator's `FineWebMixtralEduClassifier`\n", + "\n", + "This notebook demonstrates the use of NeMo Curator's `FineWebMixtralEduClassifier`. The [FineWeb Mixtral Edu classifier](https://huggingface.co/nvidia/nemocurator-fineweb-mixtral-edu-classifier) is used to determine the educational value (score 0-5 from low to high) of a text. It helps with data annotation, which is useful in data blending for foundation model training. 
Please refer to the Hugging Face page for more information about the NemoCurator FineWeb Mixtral Edu Classifier, including its output labels, here: https://huggingface.co/nvidia/nemocurator-fineweb-mixtral-edu-classifier.\n", + "\n", + "The FineWeb Mixtral Edu classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n", + "\n", + "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: PYTHONWARNINGS=ignore\n" + ] + } + ], + "source": [ + "# Silence Warnings (HuggingFace internal warnings)\n", + "\n", + "%env PYTHONWARNINGS=ignore\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import get_client\n", + "from nemo_curator.classifiers import FineWebMixtralEduClassifier\n", + "from nemo_curator.datasets import DocumentDataset\n", + "import cudf\n", + "import dask_cudf" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cuDF Spilling is enabled\n" + ] + } + ], + "source": [ + "client = get_client(cluster_type=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set Output File Path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_file_path = \"output_data_dir/\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Text Data and Initialize Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Create sample DataFrame\n", + "text = [\n", + " \"Quantum computing is set to revolutionize the field of cryptography.\",\n", + " \"Investing in index funds is a popular strategy for long-term financial growth.\",\n", + " \"Recent advancements in gene therapy offer new hope for treating genetic disorders.\",\n", + " \"Online learning platforms have transformed the way students access educational resources.\",\n", + " \"Traveling to Europe during the off-season can be a more budget-friendly option.\",\n", + " \"Training regimens for athletes have become more sophisticated with the use of data analytics.\",\n", + " \"Streaming services are changing the way people consume television and film content.\",\n", + " \"Vegan recipes have gained popularity as more people adopt plant-based diets.\",\n", + " \"Climate change research is critical for developing sustainable environmental policies.\",\n", + " \"Telemedicine has become increasingly popular due to its convenience and accessibility.\",\n", + "]\n", + "df = cudf.DataFrame({\"text\": text})\n", + "input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n", + "write_to_filename = False\n", + "\n", + "# Alternatively, read existing directory of JSONL files\n", + "# input_file_path=\"/input_data_dir/\"\n", + "# input_dataset = DocumentDataset.read_json(\n", + "# input_file_path, backend=\"cudf\", add_filename=True\n", + "# )\n", + "# write_to_filename = 
True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "classifier = FineWebMixtralEduClassifier(batch_size=1024)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run the Classifier\n", + "\n", + "Dask operations are lazy, so the the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting FineWeb Mixtral Edu Classifier inference\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU: tcp://127.0.0.1:34037, Part: 0: 100%|██████████| 10/10 [00:02<00:00, 4.80it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing to disk complete for 1 partition(s)\n", + "CPU times: user 1.16 s, sys: 1.17 s, total: 2.33 s\n", + "Wall time: 15.3 s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU: tcp://127.0.0.1:34037, Part: 0: 100%|██████████| 10/10 [00:02<00:00, 3.54it/s]\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "result_dataset = classifier(dataset=input_dataset)\n", + "result_dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inspect the Output" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files with blocksize='1gb' / files_per_partition=None\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fineweb-mixtral-edu-scorefineweb-mixtral-edu-score-intfineweb-mixtral-edu-score-labeltext
01.3525391low_qualityQuantum computing is set to revolutionize the ...
10.8291021low_qualityInvesting in index funds is a popular strategy...
21.4228521low_qualityRecent advancements in gene therapy offer new ...
31.5791022low_qualityOnline learning platforms have transformed the...
40.3461910low_qualityTraveling to Europe during the off-season can ...
\n", + "
" + ], + "text/plain": [ + " fineweb-mixtral-edu-score fineweb-mixtral-edu-score-int \\\n", + "0 1.352539 1 \n", + "1 0.829102 1 \n", + "2 1.422852 1 \n", + "3 1.579102 2 \n", + "4 0.346191 0 \n", + "\n", + " fineweb-mixtral-edu-score-label \\\n", + "0 low_quality \n", + "1 low_quality \n", + "2 low_quality \n", + "3 low_quality \n", + "4 low_quality \n", + "\n", + " text \n", + "0 Quantum computing is set to revolutionize the ... \n", + "1 Investing in index funds is a popular strategy... \n", + "2 Recent advancements in gene therapy offer new ... \n", + "3 Online learning platforms have transformed the... \n", + "4 Traveling to Europe during the off-season can ... " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n", + "output_dataset.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nemo_curator", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/distributed_data_classification/fineweb-nemotron-edu-classification.ipynb b/tutorials/distributed_data_classification/fineweb-nemotron-edu-classification.ipynb new file mode 100644 index 00000000..4c160da5 --- /dev/null +++ b/tutorials/distributed_data_classification/fineweb-nemotron-edu-classification.ipynb @@ -0,0 +1,323 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distributed Data Classification with NeMo Curator's `FineWebNemotronEduClassifier`\n", + "\n", + "This notebook demonstrates the use of NeMo Curator's `FineWebNemotronEduClassifier`. The [FineWeb Nemotron-4 Edu classifier](https://huggingface.co/nvidia/nemocurator-fineweb-nemotron-4-edu-classifier) is used to determine the educational value (score 0-5 from low to high) of a text. It helps with data annotation, which is useful in data blending for foundation model training. Please refer to the Hugging Face page for more information about the NemoCurator FineWeb Nemotron-4 Edu Classifier, including its output labels, here: https://huggingface.co/nvidia/nemocurator-fineweb-nemotron-4-edu-classifier.\n", + "\n", + "The FineWeb Nemotron-4 Edu classifier is accelerated using [CrossFit](https://github.com/rapidsai/crossfit), a library that leverages intellegent batching and RAPIDS to accelerate the offline inference on large datasets.\n", + "\n", + "Before running this notebook, please see this [Getting Started](https://github.com/NVIDIA/NeMo-Curator?tab=readme-ov-file#get-started) page for instructions on how to install NeMo Curator." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: PYTHONWARNINGS=ignore\n" + ] + } + ], + "source": [ + "# Silence Warnings (HuggingFace internal warnings)\n", + "\n", + "%env PYTHONWARNINGS=ignore\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from nemo_curator import get_client\n", + "from nemo_curator.classifiers import FineWebNemotronEduClassifier\n", + "from nemo_curator.datasets import DocumentDataset\n", + "import cudf\n", + "import dask_cudf" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cuDF Spilling is enabled\n" + ] + } + ], + "source": [ + "client = get_client(cluster_type=\"gpu\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set Output File Path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_file_path = \"output_data_dir/\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Text Data and Initialize Classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Create sample DataFrame\n", + "text = [\n", + " \"Quantum computing is set to revolutionize the field of cryptography.\",\n", + " \"Investing in index funds is a popular strategy for long-term financial growth.\",\n", + " \"Recent advancements in gene therapy offer new hope for treating genetic disorders.\",\n", + " \"Online learning platforms have transformed the way students access educational resources.\",\n", + " \"Traveling to Europe during the off-season can be a more budget-friendly option.\",\n", + " \"Training regimens for athletes have become more sophisticated with the use of data analytics.\",\n", + " \"Streaming services are changing the way people consume television and film content.\",\n", + " \"Vegan recipes have gained popularity as more people adopt plant-based diets.\",\n", + " \"Climate change research is critical for developing sustainable environmental policies.\",\n", + " \"Telemedicine has become increasingly popular due to its convenience and accessibility.\",\n", + "]\n", + "df = cudf.DataFrame({\"text\": text})\n", + "input_dataset = DocumentDataset(dask_cudf.from_cudf(df, npartitions=1))\n", + "write_to_filename = False\n", + "\n", + "# Alternatively, read existing directory of JSONL files\n", + "# input_file_path=\"/input_data_dir/\"\n", + "# input_dataset = DocumentDataset.read_json(\n", + "# input_file_path, backend=\"cudf\", add_filename=True\n", + "# )\n", + "# write_to_filename = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "classifier = FineWebNemotronEduClassifier(batch_size=1024)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Run the Classifier\n", + "\n", + "Dask operations are lazy, so the the classifier will not run until we call an eager operation like `to_json`, `compute`, or `persist`. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting FineWeb Nemotron-4 Edu Classifier inference\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU: tcp://127.0.0.1:33569, Part: 0: 100%|██████████| 10/10 [00:01<00:00, 5.04it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing to disk complete for 1 partition(s)\n", + "CPU times: user 1.35 s, sys: 172 ms, total: 1.52 s\n", + "Wall time: 14.8 s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU: tcp://127.0.0.1:33569, Part: 0: 100%|██████████| 10/10 [00:02<00:00, 3.73it/s]\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "result_dataset = classifier(dataset=input_dataset)\n", + "result_dataset.to_json(output_path=output_file_path, write_to_filename=write_to_filename)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inspect the Output" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading 1 files with blocksize='1gb' / files_per_partition=None\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fineweb-nemotron-edu-scorefineweb-nemotron-edu-score-intfineweb-nemotron-edu-score-labeltext
01.3925781low_qualityQuantum computing is set to revolutionize the ...
10.8896481low_qualityInvesting in index funds is a popular strategy...
21.3437501low_qualityRecent advancements in gene therapy offer new ...
31.7314452low_qualityOnline learning platforms have transformed the...
40.2485350low_qualityTraveling to Europe during the off-season can ...
\n", + "
" + ], + "text/plain": [ + " fineweb-nemotron-edu-score fineweb-nemotron-edu-score-int \\\n", + "0 1.392578 1 \n", + "1 0.889648 1 \n", + "2 1.343750 1 \n", + "3 1.731445 2 \n", + "4 0.248535 0 \n", + "\n", + " fineweb-nemotron-edu-score-label \\\n", + "0 low_quality \n", + "1 low_quality \n", + "2 low_quality \n", + "3 low_quality \n", + "4 low_quality \n", + "\n", + " text \n", + "0 Quantum computing is set to revolutionize the ... \n", + "1 Investing in index funds is a popular strategy... \n", + "2 Recent advancements in gene therapy offer new ... \n", + "3 Online learning platforms have transformed the... \n", + "4 Traveling to Europe during the off-season can ... " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_dataset = DocumentDataset.read_json(output_file_path, backend=\"cudf\", add_filename=write_to_filename)\n", + "output_dataset.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "nemo_curator", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb b/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb index e4bd425a..baacea88 100644 --- a/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb +++ b/tutorials/pretraining-data-curation/red-pajama-v2-curation-tutorial.ipynb @@ -1372,13 +1372,11 @@ "\n", "Fuzzy deduplication aims to find near-duplicated documents in our dataset. Near-duplicated documents are common in web crawl data due to plagiarism and mirror sites. Removing them can help improve the quality of trained models. In many cases, we can skip exact dedup and just perform fuzzy dedup as it will also find the exact duplicates. Thus, we will start with the cleaned dataset for fuzzy dedup.\n", "\n", - "Curator implements GPU-accelerated Fuzzy Deduplication based on minhash + LSH algorithm for finding similar documents across the dataset. Specifically, Fuzzy Deduplication include six steps:\n", + "NeMo Curator implements GPU-accelerated Fuzzy Deduplication based on a minhash + LSH algorithm for finding similar documents across the dataset. 
Specifically, Fuzzy Deduplication includes 4 steps:\n", "\n", "- Compute minhashes\n", "- Locality-Sensitive Hashing (LSH)\n", - "- Map buckets\n", - "- Jaccard shuffle\n", - "- Jaccard compute\n", + "- Buckets to Edges\n", "- Connected components\n" ] }, @@ -1435,7 +1433,7 @@ "input_data_dir = os.path.join(base_dir,\"rpv2-2023-06-en-cleaned\")\n", "seed = 42\n", "minhash_length = 260\n", - "char_ngram = 5\n", + "char_ngram = 24\n", "log_dir = expand_outdir_and_mkdir(os.path.join(base_dir, \"logs\"))\n", "id_field = 'id'\n", "text_field = 'raw_content'\n", @@ -1627,7 +1625,6 @@ "outputs": [], "source": [ "from nemo_curator import LSH\n", - "from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import convert_str_id_to_int\n", "\n", "lsh_input_dir = os.path.join(base_dir,\"rpv2-2023-06-minhash\")\n", "id_field = 'id'\n", @@ -1660,20 +1657,13 @@ "\n", "#Load MinHash output\n", "df = dask_cudf.read_parquet(lsh_input_dir, blocksize=\"2GB\", aggregate_files=True)\n", - "df = df.map_partitions(\n", - " convert_str_id_to_int,\n", - " id_field=id_field,\n", - " meta=cudf.DataFrame(\n", - " {minhash_field: [[1, 2, 3]], \"doc_id\": [1], \"dataset_id\": np.uint32(1)}\n", - " ),\n", - ")\n", "\n", "lsh = LSH(\n", " cache_dir=output_bucket_dir,\n", " num_hashes=minhash_length,\n", " num_buckets=num_bands,\n", " buckets_per_shuffle=buckets_per_shuffle,\n", - " id_fields=[\"dataset_id\", \"doc_id\"],\n", + " id_fields=id_field,\n", " minhash_field=minhash_field,\n", " logger=log_dir,\n", ")\n", @@ -1684,87 +1674,12 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "0d14ba2f-cfb8-448e-9bb8-34af6005ba15", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
dataset_iddoc_id_bucket_id
025621391340430734639666
1256213913993005579456
22562139132550185011694
3256213913209210262418675
4256213913167721045053186
\n", - "
" - ], - "text/plain": [ - " dataset_id doc_id _bucket_id\n", - "0 256213913 404307346 39666\n", - "1 256213913 993005579 456\n", - "2 256213913 25501850 11694\n", - "3 256213913 2092102624 18675\n", - "4 256213913 1677210450 53186" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "lsh_result.df.head()" ] @@ -1774,10 +1689,9 @@ "id": "8dd9dbc9", "metadata": {}, "source": [ - "### 5.2.3 Map Buckets\n", + "### 5.2.3 Buckets to Edges\n", "\n", - "After performing LSH, we processed each bucket and calculated an approximation of the all-pairs Jaccard\n", - "similarity in order to remove false positive duplicates introduced by LSH. For this purpose, we will randomly sample n \"anchor\" documents within each buckets and calculate the Jaccard similarity with everything remaining in the bucket." + "`BucketsToEdges` is designed to take the bucket information from the output of LSH and create an edgelist dataset where documents with the same `_bucket_id` are connected with an edge between them. This edgelist can then be passed on the connected components to identify groups of similar documents across buckets. Since the false positive check is skipped all documents within a bucket are considered to be duplicates of each other and assigned a jaccard similarity of 1.0 to avoid edge removal during the next step." ] }, { @@ -1789,108 +1703,176 @@ }, "outputs": [], "source": [ - "from nemo_curator.modules.fuzzy_dedup._mapbuckets import _MapBuckets\n", - "from nemo_curator.utils.fuzzy_dedup_utils.io_utils import (\n", - " get_bucket_ddf_from_parquet_path,\n", - " get_text_ddf_from_json_path_with_blocksize,\n", - ")\n", + "from nemo_curator import BucketsToEdges\n", "\n", - "input_data_paths = [os.path.join(base_dir,\"rpv2-2023-06-en-cleaned\")]\n", - "num_files = None\n", - "text_ddf_blocksize = 256 #The block size for chunking jsonl files for text ddf in mb\n", "id_field = 'id'\n", - "text_field = 'raw_content'\n", - "input_bucket_path = os.path.join(base_dir,\"fuzzy-dedup-output-2023-06/_buckets.parquet\")\n", + "\n", + "cache_dir = os.path.join(base_dir, \"fuzzy-dedup-output-2023-06\")\n", + "input_bucket_path = os.path.join(cache_dir,\"_buckets.parquet\")\n", "input_bucket_field = '_bucket_id'\n", - "shuffle_type ='tasks'\n", - "log_dir = os.path.join(base_dir, \"logs\")\n", - "output_anchor_docs_with_bk_path = expand_outdir_and_mkdir(os.path.join(base_dir,\"fuzzy-dedup-output-2023-06/anchor_docs_with_bk.parquet\"))" + "log_dir = os.path.join(base_dir, \"logs\")" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "f94be7c9-27ea-4f1f-8a8f-05a866eafac3", + "execution_count": null, + "id": "a0700327-5171-4673-8ded-c9f43492f582", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of files being read for jaccard calculation = 37848\n", - "ddf_text.npartitions = 21501\n" - ] - } - ], + "outputs": [], "source": [ - "# Read .jsonl input data\n", - "ddf_text = get_text_ddf_from_json_path_with_blocksize(\n", - " input_data_paths=input_data_paths,\n", - " num_files=num_files,\n", - " blocksize=text_ddf_blocksize,\n", - " id_field=id_field,\n", - " text_field=text_field,\n", + "t0 = time.time()\n", + "\n", + "# Read \"_buckets.parquet\"\n", + "ddf_bk = DocumentDataset.read_parquet(\n", + " input_bucket_path, \n", + " backend=\"cudf\"\n", + ")\n", + "\n", + "#Run _MapBuckets()\n", + "buckets_to_edges = BucketsToEdges(\n", + " cache_dir=cache_dir,\n", + " 
id_fields=id_field,\n", + " bucket_field=input_bucket_field, \n", + " logger=log_dir,\n", ")\n", "\n", - "print(f\"ddf_text.npartitions = {ddf_text.npartitions}\", flush=True)" + "edgelist_df = buckets_to_edges(ddf_bk)\n", + "\n", + "\n", + "print(f\"Buckets to Edgelist took {time.time()-t0} s\")" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "a0700327-5171-4673-8ded-c9f43492f582", + "execution_count": null, + "id": "3a986f76-191e-436d-9df3-bb68dc78d365", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "edgelist_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "fb9e1287-bd19-4728-a4c8-b92b39ca1fcc", + "metadata": {}, + "source": [ + "### 5.2.4 Connected Component\n", + "\n", + "After all buckets were processed and duplicates (at the threshold) were approximately discovered,\n", + "we constructed a sparse document graph and found the connected components therein (using scipy). Each\n", + "connected component represents a set of documents that we consider similar enough to be duplicates, and\n", + "from which we select a single representative." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9aeb619-3fab-4a18-b582-bccae3eefd17", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from nemo_curator import ConnectedComponents\n", + "\n", + "cache_dir = expand_outdir_and_mkdir(\n", + " os.path.join(base_dir, \"fuzzy-dedup-output-2023-06/cc-cache\")\n", + ")\n", + "edgelist_path = os.path.join(base_dir, \"fuzzy-dedup-output-2023-06/jaccard_similarity_results.parquet\")\n", + "id_field = 'id'\n", + "\n", + "output_path = expand_outdir_and_mkdir(\n", + " os.path.join(base_dir, \"fuzzy-dedup-output-2023-06/connected_components.parquet\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bee85f3-5477-4b9c-b606-7bbbefbe6cfc", "metadata": { "tags": [] }, + "outputs": [], + "source": [ + "t0 = time.time()\n", + "components_stage = ConnectedComponents(\n", + " cache_dir=cache_dir,\n", + " jaccard_pairs_path=edgelist_path,\n", + " id_column=id_field,\n", + ")\n", + "components_stage.cc_workflow(output_path=output_path)\n", + "print(f\"Connected Component took {time.time()-t0} seconds\")" + ] + }, + { + "cell_type": "markdown", + "id": "15214dcf-ff49-439e-b3d7-d8666d081027", + "metadata": {}, + "source": [ + "Let's check the results of connected components step. We can see that 239,037,733 are identified as duplicates to be removed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "94e8126d-af15-4182-98cd-10df06e9778e", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Number of ddf_bk partitions = 102\n", - "Mapping Bucket took 711.1930673122406 s\n" + "num of docs to remove = 239037733\n" ] } ], "source": [ - "t0 = time.time()\n", - "num_workers = get_num_workers(gpu_client)\n", + "output_path = os.path.join(base_dir, \"fuzzy-dedup-output-2023-06/connected_components.parquet\")\n", + "cc_result = dask_cudf.read_parquet(output_path, split_row_groups=False).repartition(npartitions=1)\n", "\n", - "# Read \"_buckets.parquet\"\n", - "ddf_bk = get_bucket_ddf_from_parquet_path(\n", - " input_bucket_path=input_bucket_path, \n", - " num_workers=num_workers\n", - ")\n", + "# Set 'group' as the index and shuffle to ensure all same 'group' values are in the same partition\n", + "cc_result = cc_result.set_index('group', shuffle='tasks')\n", "\n", - "#Run _MapBuckets()\n", - "map_buckets = _MapBuckets(\n", - " id_fields=[\"dataset_id\", \"doc_id\"], \n", - " bucket_field=input_bucket_field, \n", - " logger=log_dir,\n", - " text_field=text_field,\n", - ")\n", + "# Define a function to assign cumulative counts and filter duplicates\n", + "def assign_cumcount(df):\n", + " df['cumcount'] = df.groupby(level=0).cumcount()\n", + " df = df[df['cumcount'] >= 1]\n", + " df = df.drop(columns=['cumcount'])\n", + " return df\n", "\n", - "ddf_anchor_docs_with_bk = map_buckets.map_buckets_with_anchors(\n", - " documents_df=ddf_text, \n", - " buckets_df=ddf_bk, \n", - " shuffle_type=shuffle_type\n", - ")\n", + "# Find duplicates by applying the function to each partition\n", + "docs_to_remove = cc_result.map_partitions(assign_cumcount, meta=cc_result)\n", "\n", - "#Write to disk\n", - "ddf_anchor_docs_with_bk.to_parquet(\n", - " output_anchor_docs_with_bk_path, \n", - " write_index=False\n", - ")\n", + "# Reset the index\n", + "docs_to_remove = docs_to_remove.reset_index()\n", + "\n", + "docs_to_remove = docs_to_remove[[\"id\"]]\n", + "docs_to_remove = docs_to_remove.rename(columns={\"id\":\"to_remove_doc_id\"})\n", + "docs_to_remove = docs_to_remove.reset_index(drop=True).persist()\n", + "_ = wait(docs_to_remove)\n", + "del _ \n", "\n", - "print(f\"Mapping Bucket took {time.time()-t0} s\")" + "print(\"num of docs to remove =\", len(docs_to_remove))" + ] + }, + { + "cell_type": "markdown", + "id": "568ee0b5-f2dd-4d34-917f-56f4211a36fe", + "metadata": {}, + "source": [ + "We can examine the size of the duplicate clusters. The largest cluster has 775,379 near duplicates." 
] }, { "cell_type": "code", - "execution_count": 16, - "id": "3a986f76-191e-436d-9df3-bb68dc78d365", + "execution_count": 7, + "id": "cae7f166-836a-4c21-bff2-7453254956b7", "metadata": { "tags": [] }, @@ -1916,770 +1898,194 @@ " \n", " \n", " \n", - " dataset_id\n", - " doc_id\n", - " anchor_1_dataset_id\n", - " anchor_1_doc_id\n", - " anchor_0_dataset_id\n", - " anchor_0_doc_id\n", - " _output_partition_id\n", + " count\n", + " \n", + " \n", + " group\n", + " \n", " \n", " \n", " \n", " \n", - " 0\n", - " 256213913\n", - " 1440621805\n", - " 256213913\n", - " 520733492\n", - " 256213913\n", - " 2401230703\n", - " 1461\n", + " 350652173\n", + " 775379\n", " \n", " \n", - " 1\n", - " 256213913\n", - " 821232404\n", - " 256213913\n", - " 371332453\n", - " 256213913\n", - " 821232404\n", - " 3852\n", + " 93521324\n", + " 493227\n", " \n", " \n", - " 2\n", - " 256213913\n", - " 1787805617\n", - " 256213913\n", - " 1969113640\n", - " 256213913\n", - " 397634875\n", - " 7811\n", + " 24\n", + " 112861\n", " \n", " \n", - " 3\n", - " 256213913\n", - " 658706900\n", - " 256213913\n", - " 658706900\n", - " 256213913\n", - " 675310236\n", - " 3403\n", + " 319292355\n", + " 96224\n", " \n", " \n", - " 4\n", - " 256213913\n", - " 272735412\n", - " 256213913\n", - " 272735412\n", - " 256213913\n", - " 2250835581\n", - " 5160\n", + " 70141069\n", + " 67474\n", " \n", " \n", "\n", "" ], "text/plain": [ - " dataset_id doc_id anchor_1_dataset_id anchor_1_doc_id \\\n", - "0 256213913 1440621805 256213913 520733492 \n", - "1 256213913 821232404 256213913 371332453 \n", - "2 256213913 1787805617 256213913 1969113640 \n", - "3 256213913 658706900 256213913 658706900 \n", - "4 256213913 272735412 256213913 272735412 \n", - "\n", - " anchor_0_dataset_id anchor_0_doc_id _output_partition_id \n", - "0 256213913 2401230703 1461 \n", - "1 256213913 821232404 3852 \n", - "2 256213913 397634875 7811 \n", - "3 256213913 675310236 3403 \n", - "4 256213913 2250835581 5160 " + " count\n", + "group \n", + "350652173 775379\n", + "93521324 493227\n", + "24 112861\n", + "319292355 96224\n", + "70141069 67474" ] }, - "execution_count": 16, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ddf_anchor_docs_with_bk.head()" + "cc_grouped = cc_result.groupby('group').agg({'id': 'count'}).rename(columns={'id': 'count'}).sort_values('count', ascending=False).compute()\n", + "cc_grouped.head()" ] }, { "cell_type": "markdown", - "id": "58e90363-ebfe-48e2-8f7c-ff7ef27c97d5", + "id": "0def7323-3d2c-4861-9b7e-a1e296ccf329", "metadata": {}, "source": [ - "### 5.2.4 Jaccard Shuffle\n", - "\n", - "We shuffle the documents within the dataset based on their bucket assignments, essentially distributing similar documents across different partitions or workers, enabling efficient parallel processing and deduplication in subsequent steps." + "[Optional] Verify if fuzzy duplicates are similar. For example, we can look into the largest group \"350652173\"." 
] }, { "cell_type": "code", "execution_count": null, - "id": "11d7184d-4ca5-4b49-85b4-1264056f5c33", + "id": "e22cb491-c2ab-4ec4-8313-ae2bcd66a352", "metadata": { "tags": [] }, "outputs": [], "source": [ - "from nemo_curator.modules.fuzzy_dedup._shuffle import _Shuffle\n", - "\n", - "log_dir = os.path.join(base_dir, \"logs\")\n", - "input_anchor_docs_with_bk_path = os.path.join(base_dir,\"fuzzy-dedup-output-2023-06/anchor_docs_with_bk.parquet\")\n", - "output_shuffled_docs_path = expand_outdir_and_mkdir(\n", - " os.path.join(base_dir, \"fuzzy-dedup-output-2023-06/shuffled_docs.parquet\")\n", - ")\n", - "bucket_mapping_ddf_blocksize = 256\n", - "parts_per_worker = 16\n", - "bucket_parts_per_worker = 256\n", - "id_field = 'id'\n", - "text_field = 'raw_content'" + "dup_group = cc_result.loc[350652173].compute()\n", + "dup_group.head()" + ] + }, + { + "cell_type": "markdown", + "id": "170c1cf4-8cb9-4f10-aab3-acfdaa9e5b16", + "metadata": {}, + "source": [ + "We will examine the first five documents in this cluster:" ] }, { "cell_type": "code", - "execution_count": 24, - "id": "07894e81-6cdc-4292-951a-977b220fbd81", + "execution_count": 4, + "id": "00cf923e-fd4e-41b9-a00f-801c186ac70e", "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - }, "tags": [] }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 0%| | 0/1 [00:00.wait() done, defined at /usr/local/lib/python3.10/dist-packages/distributed/client.py:2197> exception=AllExit()>\n", - "Traceback (most recent call last):\n", - " File \"/usr/local/lib/python3.10/dist-packages/distributed/client.py\", line 2206, in wait\n", - " raise AllExit()\n", - "distributed.client.AllExit\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Text-df partition 5376/21501 completed in 44.53200340270996\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21130783 rows to disk\n", - "Text-df partition 5632/21501 completed in 54.21355414390564\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21283077 rows to disk\n", - "Text-df partition 5888/21501 completed in 41.81457304954529\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22384930 rows to disk\n", - "Text-df partition 6144/21501 completed in 45.46053504943848\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 20776364 rows to disk\n", - "Text-df partition 6400/21501 completed in 40.972795248031616\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 20072714 rows to disk\n", - "Text-df partition 6656/21501 completed in 43.9665105342865\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21287119 rows to disk\n", - "Text-df partition 6912/21501 completed in 46.75365734100342\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21389175 rows to disk\n", - "Text-df partition 7168/21501 completed in 44.202338457107544\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 20580021 rows to disk\n", - "Text-df partition 7424/21501 completed in 45.469704151153564\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21695552 rows to disk\n", - "Text-df partition 7680/21501 completed in 42.65142750740051\n", - "Using 256 text partitions.\n", - "Starting text bytes aware 
shuffle\n", - "Will write 20388309 rows to disk\n", - "Text-df partition 7936/21501 completed in 47.71988654136658\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21259333 rows to disk\n", - "Text-df partition 8192/21501 completed in 49.69535183906555\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22171199 rows to disk\n", - "Text-df partition 8448/21501 completed in 43.66416621208191\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22743079 rows to disk\n", - "Text-df partition 8704/21501 completed in 44.621586322784424\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21899187 rows to disk\n", - "Text-df partition 8960/21501 completed in 44.56813859939575\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22286972 rows to disk\n", - "Text-df partition 9216/21501 completed in 54.81862425804138\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22179900 rows to disk\n", - "Text-df partition 9472/21501 completed in 43.12162232398987\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21492583 rows to disk\n", - "Text-df partition 9728/21501 completed in 75.82933282852173\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22016268 rows to disk\n", - "Text-df partition 9984/21501 completed in 42.6993567943573\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21855524 rows to disk\n", - "Text-df partition 10240/21501 completed in 52.208579778671265\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21876461 rows to disk\n", - "Text-df partition 10496/21501 completed in 44.576396465301514\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21738436 rows to disk\n", - "Text-df partition 10752/21501 completed in 42.986634969711304\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21302223 rows to disk\n", - "Text-df partition 11008/21501 completed in 39.9963161945343\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 20698714 rows to disk\n", - "Text-df partition 11264/21501 completed in 40.962194204330444\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22158633 rows to disk\n", - "Text-df partition 11520/21501 completed in 49.96597933769226\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22048695 rows to disk\n", - "Text-df partition 11776/21501 completed in 45.995996713638306\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21702006 rows to disk\n", - "Text-df partition 12032/21501 completed in 46.70681190490723\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21347159 rows to disk\n", - "Text-df partition 12288/21501 completed in 81.4986321926117\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21955887 rows to disk\n", - "Text-df partition 12544/21501 completed in 42.991360902786255\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 20738788 rows to disk\n", - "Text-df partition 
12800/21501 completed in 45.26166224479675\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21851660 rows to disk\n", - "Text-df partition 13056/21501 completed in 43.972177028656006\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21509705 rows to disk\n", - "Text-df partition 13312/21501 completed in 42.279316902160645\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21253609 rows to disk\n", - "Text-df partition 13568/21501 completed in 42.06918454170227\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21327966 rows to disk\n", - "Text-df partition 13824/21501 completed in 42.79487657546997\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21643067 rows to disk\n", - "Text-df partition 14080/21501 completed in 46.259148597717285\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21428884 rows to disk\n", - "Text-df partition 14336/21501 completed in 41.45153284072876\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22143560 rows to disk\n", - "Text-df partition 14592/21501 completed in 64.21475219726562\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22325699 rows to disk\n", - "Text-df partition 14848/21501 completed in 44.90940022468567\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21819547 rows to disk\n", - "Text-df partition 15104/21501 completed in 42.385361671447754\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22358380 rows to disk\n", - "Text-df partition 15360/21501 completed in 44.72035098075867\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21915533 rows to disk\n", - "Text-df partition 15616/21501 completed in 62.369131565093994\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21881549 rows to disk\n", - "Text-df partition 15872/21501 completed in 41.794671297073364\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22808310 rows to disk\n", - "Text-df partition 16128/21501 completed in 42.90521025657654\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22373007 rows to disk\n", - "Text-df partition 16384/21501 completed in 44.44476580619812\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22500153 rows to disk\n", - "Text-df partition 16640/21501 completed in 63.39798164367676\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21869272 rows to disk\n", - "Text-df partition 16896/21501 completed in 68.16630506515503\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22681261 rows to disk\n", - "Text-df partition 17152/21501 completed in 43.95643997192383\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22528095 rows to disk\n", - "Text-df partition 17408/21501 completed in 43.677578926086426\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22315077 rows to disk\n", - "Text-df partition 17664/21501 completed in 47.376683712005615\n", - "Using 256 text 
partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22590009 rows to disk\n", - "Text-df partition 17920/21501 completed in 45.69715094566345\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22371144 rows to disk\n", - "Text-df partition 18176/21501 completed in 45.908634662628174\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22915118 rows to disk\n", - "Text-df partition 18432/21501 completed in 45.64804434776306\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22445027 rows to disk\n", - "Text-df partition 18688/21501 completed in 48.78413009643555\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21732923 rows to disk\n", - "Text-df partition 18944/21501 completed in 42.96047925949097\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22289403 rows to disk\n", - "Text-df partition 19200/21501 completed in 63.846845626831055\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21964165 rows to disk\n", - "Text-df partition 19456/21501 completed in 49.25983190536499\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22401629 rows to disk\n", - "Text-df partition 19712/21501 completed in 48.457353591918945\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22404881 rows to disk\n", - "Text-df partition 19968/21501 completed in 45.16793203353882\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22410519 rows to disk\n", - "Text-df partition 20224/21501 completed in 46.03614926338196\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22222649 rows to disk\n", - "Text-df partition 20480/21501 completed in 42.315980195999146\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22390142 rows to disk\n", - "Text-df partition 20736/21501 completed in 60.00182127952576\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 22521054 rows to disk\n", - "Text-df partition 20992/21501 completed in 46.08659911155701\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 20889117 rows to disk\n", - "Text-df partition 21248/21501 completed in 45.099570751190186\n", - "Using 256 text partitions.\n", - "Starting text bytes aware shuffle\n", - "Will write 21190706 rows to disk\n", - "Text-df partition 21501/21501 completed in 81.1153199672699\n", - "Bucket partition 102/102 completed in 4069.427500486374\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 1/1 [1:07:49<00:00, 4069.43s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Jaccard Shuffle took 4069.6330242156982 s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" + "Reading 37848 files\n" ] } ], "source": [ - "t0 = time.time()\n", - "\n", - "shuffle = _Shuffle(\n", - " id_fields=[\"dataset_id\", \"doc_id\"],\n", - " text_field=text_field,\n", - " int_to_str_id=id_field,\n", - " logger=log_dir,\n", - ")\n", - "\n", - "shuffle.shuffle_docs_on_buckets(\n", - " documents_df=ddf_text,\n", - " bucket_w_anchors_path=input_anchor_docs_with_bk_path,\n", - " 
output_shuffled_docs_path=output_shuffled_docs_path,\n", - " bucket_mapping_df_blocksize=bucket_mapping_ddf_blocksize,\n", - " parts_per_worker=parts_per_worker,\n", - " bucket_parts_per_worker=bucket_parts_per_worker,\n", - " partition_on=\"_output_partition_id\",\n", - ")\n", - "\n", - "print(f\"Jaccard Shuffle took {time.time()-t0} s\")" + "# read input dataset\n", + "input_data_dir = os.path.join(base_dir, \"rpv2-2023-06-en-cleaned\")\n", + "input_dataset = DocumentDataset.read_json(input_data_dir, add_filename=True)" ] }, { "cell_type": "markdown", - "id": "26739f23-47f1-4e11-ac49-82920f534495", + "id": "9772bf71-9e18-4e59-b9f8-ebd9053c79b0", "metadata": {}, "source": [ - "We can visualize the jaccard shuffle results for a single partition:" + "Let's visualize the content of these documents and see if they are similar (ids may change so revise the `dup_ids` as needed)." ] }, { "cell_type": "code", - "execution_count": null, - "id": "3576685a-c3d8-4950-bac3-5412e9f876d2", + "execution_count": 10, + "id": "e3cc167f-30f8-470d-99e3-0a2d916d46bf", "metadata": { "tags": [] }, - "outputs": [], - "source": [ - "jaccard_shuffle_res = dd.read_parquet(os.path.join(output_shuffled_docs_path,\"_output_partition_id=0\"))\n", - "jaccard_shuffle_res.head()" - ] - }, - { - "cell_type": "markdown", - "id": "80309eef-733a-4926-875f-cb94bbb4d8fa", - "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Searching for near duplicate examples with specific IDs took 610.5046670436859 seconds\n" + ] + } + ], "source": [ - "### 5.2.5 Jaccard Compute\n", - "\n", - "Now we have the jaccard pairs sampled, we can compute the Jaccard similarity score for all pairs." + "t0 = time.time()\n", + "dup_ids = [\n", + " 'rpv2-2023-06-1285625132',\n", + " 'rpv2-2023-06-2033200488',\n", + " 'rpv2-2023-06-0428016172',\n", + " 'rpv2-2023-06-1268721963',\n", + " 'rpv2-2023-06-1285428574'\n", + "] \n", + "dup_examples = input_dataset.df[input_dataset.df['id'].isin(dup_ids)].compute()\n", + "print(f\"Searching for near duplicate examples with specific IDs took {time.time()-t0} seconds\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "573dccf7-2e23-4aae-a3ec-2b9e1a42d97d", + "id": "655a4b3e-8a48-441c-8e12-9b2d287b79ec", "metadata": { "tags": [] }, "outputs": [], "source": [ - "from nemo_curator import JaccardSimilarity\n", - "\n", - "id_field = 'id'\n", - "text_field = 'raw_content'\n", - "ngram_size = 5\n", - "shuffled_docs_path = os.path.join(base_dir, \"fuzzy-dedup-output-2023-06/shuffled_docs.parquet\")\n", - "jaccard_results_path = expand_outdir_and_mkdir(\n", - " os.path.join(base_dir, \"fuzzy-dedup-output-2023-06/jaccard_similarity_results.parquet\")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "d31a24fe-8229-48bc-89f7-32f2b93f4f5c", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Jaccard Computing+Writing took 5886.298990488052 seconds\n" - ] - } - ], - "source": [ - "t0 = time.time()\n", - "jaccard = JaccardSimilarity(\n", - " id_field=id_field ,\n", - " text_field=text_field,\n", - " anchor_id_fields=[f\"anchor_{i}_{id_field}\" for i in range(2)],\n", - " ngram_width=ngram_size,\n", - ")\n", - "\n", - "# Run actual computation\n", - "result_df = jaccard.jaccard_compute(shuffled_docs_path)\n", - "\n", - "result_df.to_parquet(\n", - " jaccard_results_path,\n", - " write_index=False,\n", - " write_metadata_file=False,\n", - ")\n", - "\n", - "print(f\"Jaccard 
Computing+Writing took {time.time() - t0} seconds\")" + "dup_examples" ] }, { "cell_type": "code", - "execution_count": 7, - "id": "bbebed9d-d920-4b8e-8c88-48f1906c46e3", + "execution_count": null, + "id": "49b1b00a-501e-4d49-a93b-60a5c8ae87d2", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_xid_yjaccard
0256213913-1894624904256213913-23465249570.956566
1256213913-1785625062256213913-20997256750.973642
2256213913-1350425062256213913-29301251421.000000
3256213913-1324822256213913-13842036090.988306
4256213913-1775024761256213913-15401197740.906369
\n", - "
" - ], - "text/plain": [ - " id_x id_y jaccard\n", - "0 256213913-1894624904 256213913-2346524957 0.956566\n", - "1 256213913-1785625062 256213913-2099725675 0.973642\n", - "2 256213913-1350425062 256213913-2930125142 1.000000\n", - "3 256213913-1324822 256213913-1384203609 0.988306\n", - "4 256213913-1775024761 256213913-1540119774 0.906369" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "jaccard_compute_res = dd.read_parquet(jaccard_results_path)\n", - "jaccard_compute_res.head()" + "print('Example duplicate 1\\n' + dup_examples.raw_content.iloc[0])\n", + "print('\\n\\nExample duplicate 2\\n' + dup_examples.raw_content.iloc[1])\n", + "print('\\n\\nExample duplicate 3\\n' + dup_examples.raw_content.iloc[2])\n", + "print('\\n\\nExample duplicate 4\\n' + dup_examples.raw_content.iloc[3])\n", + "print('\\n\\nExample duplicate 4\\n' + dup_examples.raw_content.iloc[4])" ] }, { "cell_type": "markdown", - "id": "fb9e1287-bd19-4728-a4c8-b92b39ca1fcc", + "id": "c428e09c-5442-4908-8888-4e62994e4c5c", "metadata": {}, "source": [ - "### 5.2.6 Connected Component\n", - "\n", - "After all buckets were processed and duplicates (at the threshold) were approximately discovered,\n", - "we constructed a sparse document graph and found the connected components therein (using scipy). Each\n", - "connected component represents a set of documents that we consider similar enough to be duplicates, and\n", - "from which we select a single representative." + "### 5.2.5 Duplicates Removal" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "f9aeb619-3fab-4a18-b582-bccae3eefd17", - "metadata": { - "tags": [] - }, - "outputs": [], + "cell_type": "markdown", + "id": "f2e01b84-07cc-45a3-9dde-97884d1922a3", + "metadata": {}, "source": [ - "from nemo_curator import ConnectedComponents\n", - "\n", - "cache_dir = expand_outdir_and_mkdir(\n", - " os.path.join(base_dir, \"fuzzy-dedup-output-2023-06/cc-cache\")\n", - ")\n", - "jaccard_pairs_path = os.path.join(base_dir, \"fuzzy-dedup-output-2023-06/jaccard_similarity_results.parquet\")\n", - "id_field = 'id'\n", - "jaccard_threshold = 0.8\n", - "output_path = expand_outdir_and_mkdir(\n", - " os.path.join(base_dir, \"fuzzy-dedup-output-2023-06/connected_components.parquet\")\n", - ")" + "Next, we will proceed to remove the duplicates identified from the dataset." 
] }, { "cell_type": "code", - "execution_count": null, - "id": "6bee85f3-5477-4b9c-b606-7bbbefbe6cfc", + "execution_count": 3, + "id": "7cde97c5-cfaa-4096-8d9f-1ffa19c4adb2", "metadata": { "tags": [] }, @@ -2688,954 +2094,193 @@ "name": "stdout", "output_type": "stream", "text": [ - "batch_id = 0/33, time = 10.98209285736084\n", - "batch_id = 1/33, time = 7.240729331970215\n", - "batch_id = 2/33, time = 11.506417274475098\n", - "batch_id = 3/33, time = 10.567672729492188\n", - "batch_id = 4/33, time = 4.118508815765381\n", - "batch_id = 5/33, time = 11.475081443786621\n", - "batch_id = 6/33, time = 4.485937118530273\n", - "batch_id = 7/33, time = 7.7934770584106445\n", - "batch_id = 8/33, time = 12.659213781356812\n", - "batch_id = 9/33, time = 10.357794523239136\n", - "batch_id = 10/33, time = 15.211389780044556\n", - "batch_id = 11/33, time = 11.50840425491333\n", - "batch_id = 12/33, time = 6.360927104949951\n", - "batch_id = 13/33, time = 6.977228403091431\n", - "batch_id = 14/33, time = 14.863914489746094\n", - "batch_id = 15/33, time = 8.78640341758728\n", - "batch_id = 16/33, time = 17.97274613380432\n", - "batch_id = 17/33, time = 15.662312030792236\n", - "batch_id = 18/33, time = 12.669589042663574\n", - "batch_id = 19/33, time = 11.13182783126831\n", - "batch_id = 20/33, time = 4.032534837722778\n", - "batch_id = 21/33, time = 10.532259702682495\n", - "batch_id = 22/33, time = 11.531543016433716\n", - "batch_id = 23/33, time = 4.218446731567383\n", - "batch_id = 24/33, time = 4.118865251541138\n", - "batch_id = 25/33, time = 16.80798053741455\n", - "batch_id = 26/33, time = 14.314243078231812\n", - "batch_id = 27/33, time = 24.660512447357178\n", - "batch_id = 28/33, time = 18.43391704559326\n", - "batch_id = 29/33, time = 10.506544589996338\n", - "batch_id = 30/33, time = 17.96251106262207\n", - "batch_id = 31/33, time = 10.470972061157227\n", - "batch_id = 32/33, time = 1.9526703357696533\n", - "# of groups 134984907\n", - "# of docs removed 239037733\n", - "assert num_nodes:374022640==labels_df:374022640 passed\n", - "Connected Component took 445.6 seconds\n", - "\n" + "Reading 37848 files\n" ] } ], "source": [ - "t0 = time.time()\n", - "components_stage = ConnectedComponents(\n", - " cache_dir=cache_dir,\n", - " jaccard_pairs_path=jaccard_pairs_path,\n", - " id_field=id_field,\n", - " jaccard_threshold=jaccard_threshold,\n", - ")\n", - "components_stage.cc_workflow(output_path=output_path)\n", - "print(f\"Connected Component took {time.time()-t0} seconds\")" + "\n", + "input_dataset = DocumentDataset.read_json(os.path.join(base_dir, \"rpv2-2023-06-en-cleaned\"), backend=\"cudf\")\n", + "input_df = input_dataset.df[['raw_content','id']]" ] }, { "cell_type": "markdown", - "id": "15214dcf-ff49-439e-b3d7-d8666d081027", + "id": "99e50a79-18b0-4d1b-bab0-cfd1ec9b62fd", "metadata": {}, "source": [ - "Let's check the results of connected components step. We can see that 239,037,733 are identified as duplicates to be removed." + "Then, we will perform a merge between the `input_df` and the `docs_to_remove` on the IDs and drop the fuzzy duplicates." 
] }, { "cell_type": "code", - "execution_count": 5, - "id": "94e8126d-af15-4182-98cd-10df06e9778e", - "metadata": {}, + "execution_count": 6, + "id": "f819d1d3-c4c0-4288-b190-86276d221050", + "metadata": { + "tags": [] + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "num of docs to remove = 239037733\n" + "Removing duplicates and writing deduped dataset took 1241.3191509246826 seconds\n" ] } ], "source": [ - "output_path = os.path.join(base_dir, \"fuzzy-dedup-output-2023-06/connected_components.parquet\")\n", - "cc_result = dask_cudf.read_parquet(output_path, split_row_groups=False).repartition(npartitions=1)\n", - "\n", - "# Set 'group' as the index and shuffle to ensure all same 'group' values are in the same partition\n", - "cc_result = cc_result.set_index('group', shuffle='tasks')\n", - "\n", - "# Define a function to assign cumulative counts and filter duplicates\n", - "def assign_cumcount(df):\n", - " df['cumcount'] = df.groupby(level=0).cumcount()\n", - " df = df[df['cumcount'] >= 1]\n", - " df = df.drop(columns=['cumcount'])\n", - " return df\n", - "\n", - "# Find duplicates by applying the function to each partition\n", - "docs_to_remove = cc_result.map_partitions(assign_cumcount, meta=cc_result)\n", - "\n", - "# Reset the index\n", - "docs_to_remove = docs_to_remove.reset_index()\n", + "dedup_output_dir = expand_outdir_and_mkdir(os.path.join(base_dir, \"rpv2-2023-06-deduped\"))\n", + "deduped_df = input_df.merge(docs_to_remove,\n", + " left_on=['id'],\n", + " right_on=[\"to_remove_doc_id\"],\n", + " how='left')\n", "\n", - "docs_to_remove = docs_to_remove[[\"dataset_id\", \"doc_id\"]]\n", - "docs_to_remove = docs_to_remove.rename(columns={\"dataset_id\":\"to_remove_dataset_id\", \"doc_id\":\"to_remove_doc_id\"})\n", - "docs_to_remove = docs_to_remove.reset_index(drop=True).persist()\n", - "_ = wait(docs_to_remove)\n", - "del _ \n", + "deduped_df = deduped_df[deduped_df['to_remove_doc_id'].isna()].drop(columns=['to_remove_doc_id']).reset_index(drop=True)\n", "\n", - "print(\"num of docs to remove =\", len(docs_to_remove))" + "t0 = time.time()\n", + "deduped_df.to_parquet(dedup_output_dir)\n", + "print(f\"Removing duplicates and writing deduped dataset took {time.time()-t0} seconds\")" ] }, { "cell_type": "markdown", - "id": "568ee0b5-f2dd-4d34-917f-56f4211a36fe", + "id": "f987b1ac-c8e3-45f8-8e9b-32befc6667aa", "metadata": {}, "source": [ - "We can examine the size of the duplicate clusters. The largest cluster has 775,379 near duplicates." + "To verify the results, we can confirm that we have 849,273,787 documents left compared to 1,088,311,520 in the input dataset, essentially removing 239,037,733 duplicates." ] }, { "cell_type": "code", - "execution_count": 7, - "id": "cae7f166-836a-4c21-bff2-7453254956b7", + "execution_count": 6, + "id": "cf56c9ca-27cb-4f03-921b-685d523cf43e", "metadata": { "tags": [] }, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
count
group
350652173775379
93521324493227
24112861
31929235596224
7014106967474
\n", - "
" - ], - "text/plain": [ - " count\n", - "group \n", - "350652173 775379\n", - "93521324 493227\n", - "24 112861\n", - "319292355 96224\n", - "70141069 67474" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cc_grouped = cc_result.groupby('group').agg({'doc_id': 'count'}).rename(columns={'doc_id': 'count'}).sort_values('count', ascending=False).compute()\n", - "cc_grouped.head()" - ] - }, - { - "cell_type": "markdown", - "id": "0def7323-3d2c-4861-9b7e-a1e296ccf329", - "metadata": {}, - "source": [ - "[Optional] Verify if fuzzy duplicates are similar. For example, we can look into the largest group \"350652173\"." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "e22cb491-c2ab-4ec4-8313-ae2bcd66a352", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
dataset_iddoc_id
group
3506521732562139131285625132
3506521732562139132033200488
350652173256213913428016172
3506521732562139131268721963
3506521732562139131285428574
\n", - "
" - ], - "text/plain": [ - " dataset_id doc_id\n", - "group \n", - "350652173 256213913 1285625132\n", - "350652173 256213913 2033200488\n", - "350652173 256213913 428016172\n", - "350652173 256213913 1268721963\n", - "350652173 256213913 1285428574" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dup_group = cc_result.loc[350652173].compute()\n", - "dup_group.head()" - ] - }, - { - "cell_type": "markdown", - "id": "170c1cf4-8cb9-4f10-aab3-acfdaa9e5b16", - "metadata": {}, - "source": [ - "We will examine the first five documents in this cluster:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "00cf923e-fd4e-41b9-a00f-801c186ac70e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reading 37848 files\n" - ] - } - ], - "source": [ - "# read input dataset\n", - "input_data_dir = os.path.join(base_dir, \"rpv2-2023-06-en-cleaned\")\n", - "input_dataset = DocumentDataset.read_json(input_data_dir, add_filename=True)" - ] - }, - { - "cell_type": "markdown", - "id": "9772bf71-9e18-4e59-b9f8-ebd9053c79b0", - "metadata": {}, - "source": [ - "Let's visualize the content of these documents and see if they are similar (ids may change so revise the `dup_ids` as needed)." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "e3cc167f-30f8-470d-99e3-0a2d916d46bf", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Searching for near duplicate examples with specific IDs took 610.5046670436859 seconds\n" - ] - } - ], - "source": [ - "t0 = time.time()\n", - "dup_ids = [\n", - " 'rpv2-2023-06-1285625132',\n", - " 'rpv2-2023-06-2033200488',\n", - " 'rpv2-2023-06-0428016172',\n", - " 'rpv2-2023-06-1268721963',\n", - " 'rpv2-2023-06-1285428574'\n", - "] \n", - "dup_examples = input_dataset.df[input_dataset.df['id'].isin(dup_ids)].compute()\n", - "print(f\"Searching for near duplicate examples with specific IDs took {time.time()-t0} seconds\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "655a4b3e-8a48-441c-8e12-9b2d287b79ec", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "dup_examples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "49b1b00a-501e-4d49-a93b-60a5c8ae87d2", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "print('Example duplicate 1\\n' + dup_examples.raw_content.iloc[0])\n", - "print('\\n\\nExample duplicate 2\\n' + dup_examples.raw_content.iloc[1])\n", - "print('\\n\\nExample duplicate 3\\n' + dup_examples.raw_content.iloc[2])\n", - "print('\\n\\nExample duplicate 4\\n' + dup_examples.raw_content.iloc[3])\n", - "print('\\n\\nExample duplicate 4\\n' + dup_examples.raw_content.iloc[4])" - ] - }, - { - "cell_type": "markdown", - "id": "c428e09c-5442-4908-8888-4e62994e4c5c", - "metadata": {}, - "source": [ - "### 5.2.7 Duplicates Removal" - ] - }, - { - "cell_type": "markdown", - "id": "f2e01b84-07cc-45a3-9dde-97884d1922a3", - "metadata": {}, - "source": [ - "Next, we will proceed to remove the duplicates identified from the dataset. We will first change the string ID to `doc_id` and `dataset_id` in the input dataset." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "7cde97c5-cfaa-4096-8d9f-1ffa19c4adb2", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reading 37848 files\n" - ] - } - ], - "source": [ - "from helper import convert_str_id_to_int\n", - "\n", - "input_dataset = DocumentDataset.read_json(os.path.join(base_dir, \"rpv2-2023-06-en-cleaned\"), backend=\"cudf\")\n", - "input_df = input_dataset.df[['raw_content','id']]\n", - "meta = input_df._meta\n", - "meta['doc_id']=np.int64([0])\n", - "meta['dataset_id']=np.uint32([0])\n", - "input_df = input_df.map_partitions(\n", - " convert_str_id_to_int,\n", - " id_field=\"id\",\n", - " meta=meta,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "99e50a79-18b0-4d1b-bab0-cfd1ec9b62fd", - "metadata": {}, - "source": [ - "Then, we will perform a merge between the `input_df` and the `docs_to_remove` on the IDs and drop the fuzzy duplicates." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "f819d1d3-c4c0-4288-b190-86276d221050", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Removing duplicates and writing deduped dataset took 1241.3191509246826 seconds\n" - ] - } - ], - "source": [ - "dedup_output_dir = expand_outdir_and_mkdir(os.path.join(base_dir, \"rpv2-2023-06-deduped\"))\n", - "deduped_df = input_df.merge(docs_to_remove,\n", - " left_on=['doc_id','dataset_id'],\n", - " right_on=[\"to_remove_doc_id\", \"to_remove_dataset_id\"],\n", - " how='left')\n", - "\n", - "deduped_df = deduped_df[deduped_df['to_remove_doc_id'].isna()].drop(columns=['to_remove_doc_id', \"to_remove_dataset_id\"]).reset_index(drop=True)\n", - "\n", - "t0 = time.time()\n", - "deduped_df.to_parquet(dedup_output_dir)\n", - "print(f\"Removing duplicates and writing deduped dataset took {time.time()-t0} seconds\")" - ] - }, - { - "cell_type": "markdown", - "id": "f987b1ac-c8e3-45f8-8e9b-32befc6667aa", - "metadata": {}, - "source": [ - "To verify the results, we can confirm that we have 849,273,787 documents left compared to 1,088,311,520 in the input dataset, essentially removing 239,037,733 duplicates." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "cf56c9ca-27cb-4f03-921b-685d523cf43e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "849273787" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(deduped_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "19f96f15-1ac6-40ab-9e04-1586531bb55f", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "1088311520" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(input_df)" - ] - }, - { - "cell_type": "markdown", - "id": "cdc6350e-2363-4b13-ac67-0cbc23ad981d", - "metadata": {}, - "source": [ - "## 5.3 Inter-snapshot Deduplication" - ] - }, - { - "cell_type": "markdown", - "id": "888c2b15-961f-4a73-a0a3-15474ae4134c", - "metadata": {}, - "source": [ - "So far we have deduplicated a single snapshot from rpv2. Pre-training dataet include multiple snapshots so we will often need to perform inter-snapshot deduplication. 
For this tutorial, we will demostrate deduplication across two snapshots as an example.\n", - "\n", - "We first performed all the above steps for another snapshot `2023-14` and then combined the two deduped datasets into one and stored them in `rpv2-2023-06-and-14-deduped`.\n", - "\n", - "Next, we will perform the fuzzy deduplication on the combined dataset." - ] - }, - { - "cell_type": "markdown", - "id": "2a1445cc-b69c-4007-8f09-75a8eb8f699c", - "metadata": {}, - "source": [ - "### 5.3.1 Compute Minhash" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1461b61-887c-4099-bd9f-32e79dc5fdbb", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from nemo_curator import MinHash\n", - "from nemo_curator import LSH\n", - "from nemo_curator.modules.fuzzy_dedup._mapbuckets import _MapBuckets\n", - "from nemo_curator.modules.fuzzy_dedup._shuffle import _Shuffle\n", - "from nemo_curator import ConnectedComponents\n", - "from nemo_curator import JaccardSimilarity\n", - "\n", - "from nemo_curator.utils.file_utils import reshard_jsonl\n", - "from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import convert_str_id_to_int\n", - "from nemo_curator.utils.fuzzy_dedup_utils.io_utils import (\n", - " get_bucket_ddf_from_parquet_path,\n", - " get_text_ddf_from_json_path_with_blocksize,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "efaed1ed-e6d1-4117-9b0b-fe0d20960b60", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "seed = 42\n", - "minhash_length = 260\n", - "char_ngram = 5\n", - "log_dir = expand_outdir_and_mkdir(os.path.join(base_dir, \"logs\"))\n", - "id_field = 'id'\n", - "text_field = 'raw_content'\n", - "minshah_output_dir = expand_outdir_and_mkdir(os.path.join(base_dir,\"rpv2-2023-06-and-14-minhash\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "b4bf2d09-6601-4bd2-a6f2-f738cffd8885", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "input_data_dir = os.path.join(base_dir,\"rpv2-2023-06-and-14-deduped\")\n", - "\n", - "files = []\n", - "for file in os.listdir(input_data_dir):\n", - " if file.endswith('.part'):\n", - " new_file = file.replace('.part', '.jsonl')\n", - " old_file_path = os.path.join(input_data_dir, file)\n", - " new_file_path = os.path.join(input_data_dir, new_file)\n", - " os.rename(old_file_path, new_file_path)\n", - " files.append(new_file_path)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "38197174-738e-42a4-a38a-1dbd7d84836d", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Reading 72797 files\n" - ] - } - ], - "source": [ - "files = [f for f in files if f.endswith(\".jsonl\")]\n", - "df = read_data(\n", - " files,\n", - " file_type=\"jsonl\",\n", - " backend=\"cudf\",\n", - " files_per_partition=2,\n", - " add_filename=False,\n", - ")[[id_field, text_field]]" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "439add9c-9f51-4481-95cf-456dc5be9fd2", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Computing minhashes took:6115.702769517899\n" - ] - } - ], - "source": [ - "t0 = time.time()\n", - "\n", - "# Run MinHash() on input data\n", - "minhasher = MinHash(\n", - " seed=seed,\n", - " num_hashes=minhash_length,\n", - " char_ngrams=char_ngram,\n", - " use_64bit_hash=False,\n", - " logger=log_dir,\n", - " id_field=id_field,\n", - " text_field=text_field,\n", - " 
cache_dir=minshah_output_dir\n", - ")\n", - "\n", - "result = minhasher(DocumentDataset(df)).df\n", - "\n", - "print(f\"Computing minhashes took:{time.time()-t0}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "1c998d64-54f8-49b0-8e0c-2e5727596e84", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_minhash_signature
0rpv2-2023-06-0678400000[36422228, 15993596, 3538361, 16103012, 194100...
1rpv2-2023-06-0678500000[34662, 17635, 1112347, 293654, 313382, 160184...
2rpv2-2023-06-0678600000[15076006, 1801689, 3181854, 2949398, 5699436,...
3rpv2-2023-06-0678700000[13528976, 2438382, 26260517, 26187347, 249748...
4rpv2-2023-06-0678800000[2550974, 157261, 1536526, 1169030, 576861, 10...
\n", - "
" - ], "text/plain": [ - " id _minhash_signature\n", - "0 rpv2-2023-06-0678400000 [36422228, 15993596, 3538361, 16103012, 194100...\n", - "1 rpv2-2023-06-0678500000 [34662, 17635, 1112347, 293654, 313382, 160184...\n", - "2 rpv2-2023-06-0678600000 [15076006, 1801689, 3181854, 2949398, 5699436,...\n", - "3 rpv2-2023-06-0678700000 [13528976, 2438382, 26260517, 26187347, 249748...\n", - "4 rpv2-2023-06-0678800000 [2550974, 157261, 1536526, 1169030, 576861, 10..." + "849273787" ] }, - "execution_count": 25, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "result.head()" - ] - }, - { - "cell_type": "markdown", - "id": "c2e3af9e-9e03-4950-93c0-92792f9ad24b", - "metadata": {}, - "source": [ - "### 5.3.2 Minhash LSH" + "len(deduped_df)" ] }, { "cell_type": "code", "execution_count": 7, - "id": "b11c2f37-3b78-4e1b-a9ff-4a89b38f3604", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "lsh_input_dir = os.path.join(base_dir,\"rpv2-2023-06-and-14-minhash\")\n", - "id_field = 'id'\n", - "output_bucket_dir = expand_outdir_and_mkdir(os.path.join(base_dir,\"fuzzy-dedup-output-2023-06-and-14\"))\n", - "num_bands = 20\n", - "buckets_per_shuffle = 1\n", - "minhash_field = '_minhash_signature'\n", - "minhash_length = 260\n", - "log_dir = os.path.join(base_dir, \"logs\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "a243ed7a-9175-488f-8097-5b82c47c5708", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LSH took 10536.635195493698 s\n" - ] - } - ], - "source": [ - "t0 = time.time()\n", - "\n", - "#Load MinHash output\n", - "df = dask_cudf.read_parquet(lsh_input_dir, blocksize=\"2GB\", aggregate_files=True)\n", - "df = df.map_partitions(\n", - " convert_str_id_to_int,\n", - " id_field=id_field,\n", - " meta=cudf.DataFrame(\n", - " {minhash_field: [[1, 2, 3]], \"doc_id\": [1], \"dataset_id\": np.uint32(1)}\n", - " ),\n", - ")\n", - "\n", - "lsh = LSH(\n", - " cache_dir=output_bucket_dir,\n", - " num_hashes=minhash_length,\n", - " num_buckets=num_bands,\n", - " buckets_per_shuffle=buckets_per_shuffle,\n", - " id_fields=[\"dataset_id\", \"doc_id\"],\n", - " minhash_field=minhash_field,\n", - " logger=log_dir,\n", - ")\n", - "\n", - "lsh_result = lsh(DocumentDataset(df))\n", - "print(f\"LSH took {time.time()-t0} s\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "9ea85033-e275-4b62-8351-6c85ac5ac83b", + "id": "19f96f15-1ac6-40ab-9e04-1586531bb55f", "metadata": { "tags": [] }, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
dataset_iddoc_id_bucket_id
0256213913248063708574400
1256213913207920898388082
225621391311428125867198
34217914658358940171254808
4256213913182793165058134
\n", - "
" - ], + "data": { "text/plain": [ - " dataset_id doc_id _bucket_id\n", - "0 256213913 2480637085 74400\n", - "1 256213913 2079208983 88082\n", - "2 256213913 1142812586 7198\n", - "3 4217914658 3589401712 54808\n", - "4 256213913 1827931650 58134" + "1088311520" ] }, - "execution_count": 10, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "lsh_result.df.head()" + "len(input_df)" ] }, { "cell_type": "markdown", - "id": "ff8c79b8-229f-47ee-9e01-a7bf1f317250", + "id": "cdc6350e-2363-4b13-ac67-0cbc23ad981d", + "metadata": {}, + "source": [ + "## 5.3 Inter-snapshot Deduplication" + ] + }, + { + "cell_type": "markdown", + "id": "888c2b15-961f-4a73-a0a3-15474ae4134c", "metadata": {}, "source": [ - "### 5.3.3 Map Buckets" + "So far we have deduplicated a single snapshot from rpv2. A pre-training dataset can include multiple snapshots so we will often need to perform inter-snapshot deduplication. For this tutorial, we will demonstrate deduplication across two snapshots as an example.\n", + "\n", + "We first performed all the above steps for another snapshot `2023-14` and then combined the two deduped datasets into one and stored them in `rpv2-2023-06-and-14-deduped`.\n", + "\n", + "Next, we will perform the fuzzy deduplication on the combined dataset." + ] + }, + { + "cell_type": "markdown", + "id": "2a1445cc-b69c-4007-8f09-75a8eb8f699c", + "metadata": {}, + "source": [ + "### 5.3.1 Compute Minhash" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "2cc909e1-e02e-433d-b6c2-e7f82a137438", + "execution_count": null, + "id": "f1461b61-887c-4099-bd9f-32e79dc5fdbb", "metadata": { "tags": [] }, "outputs": [], "source": [ - "input_data_paths = [os.path.join(base_dir,\"rpv2-2023-06-and-14-deduped\")]\n", - "num_files = None\n", - "text_ddf_blocksize = 256 #The block size for chunking jsonl files for text ddf in mb\n", + "from nemo_curator import MinHash, LSH, BucketsToEdges, ConnectedComponents\n", + "\n", + "from nemo_curator.utils.file_utils import reshard_jsonl" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "efaed1ed-e6d1-4117-9b0b-fe0d20960b60", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "seed = 42\n", + "minhash_length = 260\n", + "char_ngram = 24\n", + "log_dir = expand_outdir_and_mkdir(os.path.join(base_dir, \"logs\"))\n", "id_field = 'id'\n", "text_field = 'raw_content'\n", - "input_bucket_path = os.path.join(base_dir,\"fuzzy-dedup-output-2023-06-and-14/_buckets.parquet\")\n", - "input_bucket_field = '_bucket_id'\n", - "shuffle_type ='tasks'\n", - "log_dir = os.path.join(base_dir, \"logs\")\n", - "output_anchor_docs_with_bk_path = expand_outdir_and_mkdir(os.path.join(base_dir,\"fuzzy-dedup-output-2023-06-and-14/anchor_docs_with_bk.parquet\"))" + "minshah_output_dir = expand_outdir_and_mkdir(os.path.join(base_dir,\"rpv2-2023-06-and-14-minhash\"))" ] }, { "cell_type": "code", - "execution_count": 7, - "id": "3effff2a-f01d-4f33-b495-97455a280a59", + "execution_count": 9, + "id": "b4bf2d09-6601-4bd2-a6f2-f738cffd8885", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "input_data_dir = os.path.join(base_dir,\"rpv2-2023-06-and-14-deduped\")\n", + "\n", + "files = []\n", + "for file in os.listdir(input_data_dir):\n", + " if file.endswith('.part'):\n", + " new_file = file.replace('.part', '.jsonl')\n", + " old_file_path = os.path.join(input_data_dir, file)\n", + " new_file_path = os.path.join(input_data_dir, new_file)\n", + " os.rename(old_file_path, new_file_path)\n", + " 
files.append(new_file_path)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "38197174-738e-42a4-a38a-1dbd7d84836d", "metadata": { "tags": [] }, @@ -3644,28 +2289,25 @@ "name": "stdout", "output_type": "stream", "text": [ - "Number of files being read for jaccard calculation = 72797\n", - "ddf_text.npartitions = 23876\n" + "Reading 72797 files\n" ] } ], "source": [ - "# Read .jsonl input data\n", - "ddf_text = get_text_ddf_from_json_path_with_blocksize(\n", - " input_data_paths=input_data_paths,\n", - " num_files=num_files,\n", - " blocksize=text_ddf_blocksize,\n", - " id_field=id_field,\n", - " text_field=text_field,\n", - ")\n", - "\n", - "print(f\"ddf_text.npartitions = {ddf_text.npartitions}\", flush=True)" + "files = [f for f in files if f.endswith(\".jsonl\")]\n", + "df = read_data(\n", + " files,\n", + " file_type=\"jsonl\",\n", + " backend=\"cudf\",\n", + " files_per_partition=2,\n", + " add_filename=False,\n", + ")[[id_field, text_field]]" ] }, { "cell_type": "code", - "execution_count": 14, - "id": "e4e00d8e-170c-4adf-bf34-1ad1b5275760", + "execution_count": 24, + "id": "439add9c-9f51-4481-95cf-456dc5be9fd2", "metadata": { "tags": [] }, @@ -3674,48 +2316,34 @@ "name": "stdout", "output_type": "stream", "text": [ - "Number of ddf_bk partitions = 54\n", - "Mapping Bucket took 1034.9348919391632 s\n" + "Computing minhashes took:6115.702769517899\n" ] } ], "source": [ "t0 = time.time()\n", - "num_workers = get_num_workers(gpu_client)\n", "\n", - "# Read \"_buckets.parquet\"\n", - "ddf_bk = get_bucket_ddf_from_parquet_path(\n", - " input_bucket_path=input_bucket_path, \n", - " num_workers=num_workers\n", - ")\n", - "\n", - "#Run _MapBuckets()\n", - "map_buckets = _MapBuckets(\n", - " id_fields=[\"dataset_id\", \"doc_id\"], \n", - " bucket_field=input_bucket_field, \n", + "# Run MinHash() on input data\n", + "minhasher = MinHash(\n", + " seed=seed,\n", + " num_hashes=minhash_length,\n", + " char_ngrams=char_ngram,\n", + " use_64bit_hash=False,\n", " logger=log_dir,\n", + " id_field=id_field,\n", " text_field=text_field,\n", + " cache_dir=minshah_output_dir\n", ")\n", "\n", - "ddf_anchor_docs_with_bk = map_buckets.map_buckets_with_anchors(\n", - " documents_df=ddf_text, \n", - " buckets_df=ddf_bk, \n", - " shuffle_type=shuffle_type\n", - ")\n", - "\n", - "#Write to disk\n", - "ddf_anchor_docs_with_bk.to_parquet(\n", - " output_anchor_docs_with_bk_path, \n", - " write_index=False\n", - ")\n", + "result = minhasher(DocumentDataset(df)).df\n", "\n", - "print(f\"Mapping Bucket took {time.time()-t0} s\")" + "print(f\"Computing minhashes took:{time.time()-t0}\")" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "f0718d8e-5143-458f-ab59-a440adcde8b8", + "execution_count": 25, + "id": "1c998d64-54f8-49b0-8e0c-2e5727596e84", "metadata": { "tags": [] }, @@ -3741,634 +2369,200 @@ " \n", " \n", " \n", - " dataset_id\n", - " doc_id\n", - " anchor_1_dataset_id\n", - " anchor_1_doc_id\n", - " anchor_0_dataset_id\n", - " anchor_0_doc_id\n", - " _output_partition_id\n", + " id\n", + " _minhash_signature\n", " \n", " \n", " \n", " \n", " 0\n", - " 4217914658\n", - " 518211850\n", - " 4217914658\n", - " 518211850\n", - " 256213913\n", - " 491920892\n", - " 2004\n", + " rpv2-2023-06-0678400000\n", + " [36422228, 15993596, 3538361, 16103012, 194100...\n", " \n", " \n", " 1\n", - " 4217914658\n", - " 6364303356\n", - " 256213913\n", - " 2308804621\n", - " 4217914658\n", - " 6364303356\n", - " 4246\n", + " rpv2-2023-06-0678500000\n", + " [34662, 17635, 1112347, 293654, 
313382, 160184...\n", " \n", " \n", " 2\n", - " 256213913\n", - " 2103535708\n", - " 4217914658\n", - " 1208111155\n", - " 256213913\n", - " 2103535708\n", - " 4003\n", + " rpv2-2023-06-0678600000\n", + " [15076006, 1801689, 3181854, 2949398, 5699436,...\n", " \n", " \n", " 3\n", - " 256213913\n", - " 1359208912\n", - " 4217914658\n", - " 6342510538\n", - " 256213913\n", - " 1359208912\n", - " 3738\n", + " rpv2-2023-06-0678700000\n", + " [13528976, 2438382, 26260517, 26187347, 249748...\n", " \n", " \n", " 4\n", - " 256213913\n", - " 162316349\n", - " 256213913\n", - " 162316349\n", - " 4217914658\n", - " 1033014280\n", - " 4258\n", + " rpv2-2023-06-0678800000\n", + " [2550974, 157261, 1536526, 1169030, 576861, 10...\n", " \n", " \n", "\n", "" ], "text/plain": [ - " dataset_id doc_id anchor_1_dataset_id anchor_1_doc_id \\\n", - "0 4217914658 518211850 4217914658 518211850 \n", - "1 4217914658 6364303356 256213913 2308804621 \n", - "2 256213913 2103535708 4217914658 1208111155 \n", - "3 256213913 1359208912 4217914658 6342510538 \n", - "4 256213913 162316349 256213913 162316349 \n", - "\n", - " anchor_0_dataset_id anchor_0_doc_id _output_partition_id \n", - "0 256213913 491920892 2004 \n", - "1 4217914658 6364303356 4246 \n", - "2 256213913 2103535708 4003 \n", - "3 256213913 1359208912 3738 \n", - "4 4217914658 1033014280 4258 " + " id _minhash_signature\n", + "0 rpv2-2023-06-0678400000 [36422228, 15993596, 3538361, 16103012, 194100...\n", + "1 rpv2-2023-06-0678500000 [34662, 17635, 1112347, 293654, 313382, 160184...\n", + "2 rpv2-2023-06-0678600000 [15076006, 1801689, 3181854, 2949398, 5699436,...\n", + "3 rpv2-2023-06-0678700000 [13528976, 2438382, 26260517, 26187347, 249748...\n", + "4 rpv2-2023-06-0678800000 [2550974, 157261, 1536526, 1169030, 576861, 10..." 
] }, - "execution_count": 15, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ddf_anchor_docs_with_bk.head()" + "result.head()" ] }, { "cell_type": "markdown", - "id": "fca84869-9991-429c-baa1-bccadd15cafa", + "id": "c2e3af9e-9e03-4950-93c0-92792f9ad24b", "metadata": {}, "source": [ - "### 6.8.4 Jaccard Shuffle" + "### 5.3.2 Minhash LSH" ] }, { "cell_type": "code", - "execution_count": 4, - "id": "8cbce892-ec9c-46e3-9fbe-09f9a243a7aa", + "execution_count": 7, + "id": "b11c2f37-3b78-4e1b-a9ff-4a89b38f3604", "metadata": { "tags": [] }, "outputs": [], "source": [ - "log_dir = os.path.join(base_dir, \"logs\")\n", - "input_anchor_docs_with_bk_path = os.path.join(base_dir,\"fuzzy-dedup-output-2023-06-and-14/anchor_docs_with_bk.parquet\")\n", - "output_shuffled_docs_path = expand_outdir_and_mkdir(\n", - " os.path.join(base_dir, \"fuzzy-dedup-output-2023-06-and-14/shuffled_docs.parquet\")\n", - ")\n", - "bucket_mapping_ddf_blocksize = 256\n", - "parts_per_worker = 16\n", - "bucket_parts_per_worker = 256\n", + "lsh_input_dir = os.path.join(base_dir,\"rpv2-2023-06-and-14-minhash\")\n", "id_field = 'id'\n", - "text_field = 'raw_content'" + "output_bucket_dir = expand_outdir_and_mkdir(os.path.join(base_dir,\"fuzzy-dedup-output-2023-06-and-14\"))\n", + "num_bands = 20\n", + "buckets_per_shuffle = 1\n", + "minhash_field = '_minhash_signature'\n", + "minhash_length = 260\n", + "log_dir = os.path.join(base_dir, \"logs\")" ] }, { "cell_type": "code", "execution_count": 8, - "id": "1acd3e4a-0310-4413-98b2-07cd7c74ee57", + "id": "a243ed7a-9175-488f-8097-5b82c47c5708", "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - }, "tags": [] }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 0%| | 0/1 [00:00=24` to use ~5 word ngrams or greater.\n", "- `use_64bit_hash`:Whether to use 64bit or 32bit hash function\n", "- `id_field`: Key in input file for identifying document ID\n", "- `text_field`: Key in input file which contains document text.\n", @@ -1109,9 +1308,9 @@ "#Relevant parameters\n", "minhash_id_field = 'id'\n", "minhash_text_field = 'text'\n", - "seed = 10\n", + "seed = 10 # Using the same value as the wrapper above for consistency\n", "minhash_length = 260\n", - "char_ngram = 5\n", + "char_ngram = 24\n", "use_64bit_hash = False\n", "files_per_partition = 2\n", "\n", @@ -1192,11 +1391,11 @@ "id": "0bce0f80", "metadata": {}, "source": [ - "### 5.2 LSH\n", + "### 5.3 LSH\n", "`LSH()` implements LSH algorithm which includes the following steps:\n", "1. Divide the minhash array into `X` different portions. \n", "2. For each portions, hash the minhash values into buckets. One document will be assigned to `X` buckets.\n", - "3. Documents within the same bucket will be deemed similar. Since every document will be assigned `X` buckets and as long as two documents share 1 or more buckets they are deemed similar, the result of LSH will have more false positive as compared to false negative. The false positive cases will be filtered in following modules, namely jaccard compute.\n", + "3. Documents within the same bucket will be deemed similar. Since every document will be assigned `X` buckets and as long as two documents share 1 or more buckets they are deemed similar.\n", "\n", "Arguments include:\n", "- `minhash_length`:Length of minhash signature. 
Must be consistent with `MinHash()`\n", @@ -1217,9 +1416,7 @@ }, "outputs": [], "source": [ - "from nemo_curator import LSH\n", - "from nemo_curator.utils.fuzzy_dedup_utils.id_mapping import \\\n", - " convert_str_id_to_int" + "from nemo_curator import LSH" ] }, { @@ -1279,13 +1476,6 @@ "\n", "#Load MinHash output\n", "df = dask_cudf.read_parquet(lsh_input_data_path, blocksize=\"2GB\", aggregate_files=True, backend = \"cudf\")\n", - "df = df.map_partitions(\n", - " convert_str_id_to_int,\n", - " id_field=lsh_id_field,\n", - " meta=cudf.DataFrame(\n", - " {minhash_field: [[1, 2, 3]], \"doc_id\": [1], \"dataset_id\": np.uint32(1)}\n", - " ),\n", - ")\n", "\n", "#Run LSH()\n", "lsh = LSH(\n", @@ -1293,7 +1483,7 @@ " num_hashes=minhash_length,\n", " num_buckets=num_bands,\n", " buckets_per_shuffle=buckets_per_shuffle,\n", - " id_fields=[\"dataset_id\", \"doc_id\"],\n", + " id_fields=[\"id\"],\n", " minhash_field=minhash_field,\n", " logger=lsh_log_dir,\n", ")\n", @@ -1329,20 +1519,14 @@ "id": "f952f074", "metadata": {}, "source": [ - "### 5.3 Jaccard Shuffle\n", - "In this section, we will be using `_MapBucket()` and `_Shuffle()`.\n", - "\n", - "For `_MapBucket()`, it is designed to take input text data in jsonl format and bucket information which is output of LSH, map the documents to their respective buckets, and write the resulting DataFrame containing the anchor documents and their associated bucket information to a parquet file. Arguments include:\n", - "- `id_field`: Key in input .jsonl file for identifying document ID\n", - "- `text_field`: Key in input .jsonl file which contains document text.\n", - "- `bucket_field`: Key in input _buckets.parquet which contains `bucket_id`.\n", - "- `num_anchors`: Number of anchors (document in the same buckets) to be output\n", + "### 5.4 Buckets to Edges\n", + "In this section, we will be using `BucketsToEdges()`\n", "\n", + "`BucketsToEdges()` is designed to take the bucket information which is output of LSH, and create an edgelist dataset where documents with the same `_bucket_id` are connected with an edge between them. This edgelist can then be passed on the connected components to identify groups of similar documents across buckets. Since the false positive check is skipped all documents within a bucket are considered to be duplicates of each other and assigned a jaccard similarity of 1.0 to avoid edge removal during the next step.\n", "\n", - "For `_Shuffle()`, it perform a shuffling operation on the documents based on their bucket assignments, output in .parquet format. This shuffling operation is a crucial step in the deduplication process, as it helps distribute similar documents across different partitions or workers, enabling efficient parallel processing and deduplication in subsequent steps. Arguments include:\n", - "- `id_fields`: Columns in `_buckets.parquet` that maps to original `id` in .jsonl data file. 
In this example, it is `[\"dataset_id\", \"doc_id\"]`\n", - "- `text_field`: Key in input .jsonl file which contains document text.\n", - "- `int_to_str_id`: Key in input .jsonl file for identifying document ID\n" + "- `id_field`: Key in input .jsonl file for identifying document ID\n", + "- `bucket_field`: Key in input _buckets.parquet which contains `bucket_id`\n", + "- `cache_dir`: If specified, the intermediate result will be output to the `cache_dir`.\n" ] }, { @@ -1354,12 +1538,7 @@ }, "outputs": [], "source": [ - "from nemo_curator.utils.fuzzy_dedup_utils.io_utils import (\n", - " get_bucket_ddf_from_parquet_path,\n", - " get_text_ddf_from_json_path_with_blocksize,\n", - ")\n", - "from nemo_curator.modules.fuzzy_dedup._mapbuckets import _MapBuckets\n", - "from nemo_curator.modules.fuzzy_dedup._shuffle import _Shuffle" + "from nemo_curator import BucketsToEdges" ] }, { @@ -1380,31 +1559,19 @@ "outputs": [], "source": [ "#Input\n", - "input_data_paths = [minhash_data_path]\n", "input_bucket_path = lsh_output_dir\n", "\n", "#Output\n", - "jaccard_shuffle_base_output_path = os.path.join(data_dir,\"fuzzy/jaccard_shuffle\")\n", - "output_anchor_docs_with_bk_path = os.path.join(jaccard_shuffle_base_output_path, \"anchor_docs_with_bk.parquet\")\n", - "input_anchor_docs_with_bk_dir = output_anchor_docs_with_bk_path\n", - "jaccard_shuffle_log_path = os.path.join(jaccard_shuffle_base_output_path,\"log\")\n", - "output_shuffled_docs_path = os.path.join(jaccard_shuffle_base_output_path, \"shuffled_docs.parquet\")\n", - "\n", - "#Relevant parameters for _MapBucket()\n", - "text_ddf_blocksize = 256\n", - "bucket_mapping_ddf_blocksize = 256\n", - "num_files = None\n", - "shuffle_type ='tasks'\n", - "input_bucket_field = '_bucket_id'\n", + "buckets_to_edges_base_output_path = os.path.join(data_dir,\"fuzzy/buckets_to_edges\")\n", + "edgelist_output_dir = os.path.join(buckets_to_edges_base_output_path, \"data\")\n", + "buckets_to_edges_log_path = os.path.join(buckets_to_edges_base_output_path,\"log\")\n", + "\n", + "#Relevant parameters for BucketsToEdges()\n", "id_field = 'id'\n", - "text_field = 'text'\n", "\n", - "#Relevant parameters for _Shuffle()\n", - "shuffle_id_fields=[\"dataset_id\", \"doc_id\"]\n", - "int_to_str_id='id'\n", "\n", - "!mkdir -p {jaccard_shuffle_base_output_path}\n", - "!mkdir -p {jaccard_shuffle_log_path}" + "!mkdir -p {edgelist_output_path}\n", + "!mkdir -p {buckets_to_edges_log_path}" ] }, { @@ -1425,27 +1592,15 @@ "outputs": [], "source": [ "t0 = time.time()\n", - "num_workers = get_num_workers(client)\n", "\n", - "# Read .jsonl input data\n", - "ddf_text = get_text_ddf_from_json_path_with_blocksize(\n", - " input_data_paths=input_data_paths,\n", - " num_files=num_files,\n", - " blocksize=text_ddf_blocksize,\n", - " id_field=id_field,\n", - " text_field=text_field,\n", - ")\n", "# Read \"_buckets.parquet\"\n", - "ddf_bk = get_bucket_ddf_from_parquet_path(input_bucket_path=input_bucket_path, num_workers=num_workers)\n", + "ddf_bk = DocumentDataset.read_parquet(input_bucket_path, backend=\"cudf\")\n", "\n", "#Run _MapBuckets()\n", - "map_buckets = _MapBuckets(id_fields=shuffle_id_fields, bucket_field=input_bucket_field, logger=jaccard_shuffle_log_path)\n", - "ddf_anchor_docs_with_bk = map_buckets.map_buckets_with_anchors(documents_df=ddf_text, buckets_df=ddf_bk, shuffle_type=shuffle_type)\n", - "\n", - "#Write to disk\n", - "ddf_anchor_docs_with_bk.to_parquet(output_anchor_docs_with_bk_path, write_index=False)\n", + "buckets_to_edges = 
BucketsToEdges(cache_dir=edgelist_output_dir, id_fields=input_id_field, logger=buckets_to_edges_log_path)\n", + "res = buckets_to_edges(ddf_bk)\n", "\n", - "print(f\"Time taken for Bucket Mapping:{time.time()-t0} s\")" + "print(f\"Time taken for Bucket->Edgelist:{time.time()-t0} s\")" ] }, { @@ -1465,205 +1620,8 @@ }, "outputs": [], "source": [ - "# map_bucket_res = pd.read_parquet(output_anchor_docs_with_bk_path)\n", - "# map_bucket_res.head()" - ] - }, - { - "cell_type": "markdown", - "id": "1487b1ad", - "metadata": {}, - "source": [ - "**[Optional]** Remove previous Jaccard Shuffle results. Run only when there are files under the Jaccard Shuffle output path" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b414f703", - "metadata": {}, - "outputs": [], - "source": [ - "#!rm -r {output_shuffled_docs_path}" - ] - }, - { - "cell_type": "markdown", - "id": "f33a6782", - "metadata": {}, - "source": [ - "Run Jaccard Shuffle" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "86d1b3e5", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "t0 = time.time()\n", - "\n", - "#Run _Shuffle() on results of _MapBucket()\n", - "shuffle = _Shuffle(\n", - " id_fields=shuffle_id_fields,\n", - " text_field=text_field,\n", - " int_to_str_id=int_to_str_id,\n", - " logger=jaccard_shuffle_log_path\n", - ")\n", - "shuffle.shuffle_docs_on_buckets(\n", - " documents_df=ddf_text,\n", - " bucket_w_anchors_path=input_anchor_docs_with_bk_dir,\n", - " output_shuffled_docs_path=output_shuffled_docs_path,\n", - " bucket_mapping_df_blocksize=bucket_mapping_ddf_blocksize,\n", - "# parts_per_worker=1,\n", - "# bucket_parts_per_worker=8,\n", - " partition_on=\"_output_partition_id\",\n", - ")\n", - "\n", - "print(f\"Time taken for Jaccard Shuffle = {time.time()-t0} s\")" - ] - }, - { - "cell_type": "markdown", - "id": "86b06cb5", - "metadata": {}, - "source": [ - "**[Optional]** Verify result" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b51a5fb", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# jaccard_shuffle_res = pd.read_parquet(os.path.join(output_shuffled_docs_path,\"_output_partition_id=0/batch_1_1.parquet\"))\n", - "# jaccard_shuffle_res.head()" - ] - }, - { - "cell_type": "markdown", - "id": "b8644e51", - "metadata": {}, - "source": [ - "### 5.4 Jaccard Compute\n", - "We will be using `JaccardSimilarity()`.This is to computes the Jaccard similarity between document pairs. Result is a parquet dataset consisting of document id pair along with their Jaccard similarity score. To compute Jaccard similarity between two documents, we first convert the document into sets of n-grams and then compute the Jaccard similarity of the two sets.\n", - "\n", - "Arguments include:\n", - "- `id_field`: Column in input .parquet file identifying document ID\n", - "- `text_field`: Column in input .parquet file identifying document text\n", - "- `anchor_id_fields`: Column in input .parquet file identifying anchors. 
This can be generated by specifying number of anchor used in `_MapBucket` whose default value is 2\n", - "- `ngram_width`: n-gram used" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b1a532a2", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from nemo_curator import JaccardSimilarity" - ] - }, - { - "cell_type": "markdown", - "id": "c9e65975", - "metadata": {}, - "source": [ - "Define parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "291d3aaa", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "#Input\n", - "shuffled_docs_path = output_shuffled_docs_path\n", - "\n", - "#Output\n", - "jaccard_compute_base_output_path = os.path.join(data_dir,\"fuzzy/jaccard_compute\")\n", - "jaccard_compute_output_results_path = os.path.join(jaccard_compute_base_output_path, \"jaccard_similarity_results.parquet\")\n", - "\n", - "#Relevant parameters\n", - "id_field = 'id'\n", - "text_field = 'text'\n", - "ngram_size = 5\n", - "num_anchors = 2\n", - "\n", - "!mkdir -p {jaccard_compute_base_output_path}" - ] - }, - { - "cell_type": "markdown", - "id": "9341b58c", - "metadata": {}, - "source": [ - "Run Jaccard Compute" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9b1b9bdd", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# enable_spilling()\n", - "# client.run(enable_spilling)\n", - "\n", - "print(\"Running jaccard compute script\", flush=True)\n", - "t0 = time.time()\n", - "\n", - "jaccard = JaccardSimilarity(\n", - " id_field=id_field,\n", - " text_field=text_field,\n", - " anchor_id_fields=[f\"anchor_{i}_{id_field}\" for i in range(num_anchors)],\n", - " ngram_width=ngram_size,\n", - ")\n", - "\n", - "#Load and run Jaccard compute\n", - "result_df = jaccard.jaccard_compute(shuffled_docs_path)\n", - "\n", - "result_df.to_parquet(jaccard_compute_output_results_path, write_index=False, write_metadata_file=False)\n", - "\n", - "print(f\"Time taken for Jaccard Computing: {time.time()-t0}\")" - ] - }, - { - "cell_type": "markdown", - "id": "bb740d30", - "metadata": {}, - "source": [ - "**[Optional]** Verify output. You might see that there are repeated `id_x` and `id_y` pairs. This is expected as a pair of similar documents is likely to share numerous same buckets." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a41d1f09", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# jaccard_compute_res = pd.read_parquet(jaccard_compute_output_results_path)\n", - "# jaccard_compute_res.head()" + "# edgelist_res = pd.read_parquet(os.path.join(edgelist_output_dir, \"_edges.parquet\"))\n", + "# edgelist_res.head()" ] }, { @@ -1672,13 +1630,13 @@ "metadata": {}, "source": [ "### 5.5 Connected Components\n", - "This section uses `ConnectedComponents()`.This section takes a dataset consisting of document pairs and their corresponding jaccard similarity to construct a non-directed graph. A edge will be form between documents whose Jaccard similarity is higher than the threshold (0.8 in this example). It will then identify the connected components in this graph. Documents within the same connected components are deemed duplicated\n", + "This section uses `ConnectedComponents()`.This section takes a dataset consisting of document pairs and their corresponding jaccard similarity to construct a non-directed graph. A edge will be formed between documents whose Jaccard similarity is higher than the threshold. 
It will then identify the connected components in this graph. Documents within the same connected component are deemed duplicates.\n", "\n", "Arguments include:\n", - "- `cache_dir`:Output path for intermediate results\n", - "- `jaccard_pairs_path`:Input path for `jaccard_similarity_results.parquet`\n", - "- `id_field`:prefix of ID column in `jaccard_similarity_results.parquet`\n", - "- `jaccard_threshold`:Threshold to determine if an edge exists between two documents" + "- `cache_dir`: Output path for intermediate results\n", + "- `jaccard_pairs_path`: Input path to the document-pair (edge list) parquet, i.e. the `BucketsToEdges` output\n", + "- `id_field`: Prefix of the ID columns in the document-pair parquet\n", + "- `jaccard_threshold`: Threshold to determine if an edge exists between two documents" ] }, { @@ -1711,7 +1669,7 @@ "outputs": [], "source": [ "#Input\n", - "jaccard_pairs_path = jaccard_compute_output_results_path\n", + "jaccard_pairs_path = edgelist_output_dir\n", "\n", "#Output\n", "connected_component_base_output_path = os.path.join(data_dir,\"fuzzy/cc\")\n", @@ -1720,7 +1678,6 @@ "\n", "#Relevant parameters\n", "id_field = 'id'\n", - "jaccard_threshold = 0.8\n", "\n", "!mkdir -p {connected_component_base_output_path}" ] }, @@ -1748,7 +1705,6 @@ " cache_dir=connected_component_cache_dir,\n", " jaccard_pairs_path=jaccard_pairs_path,\n", " id_field=id_field,\n", - " jaccard_threshold=jaccard_threshold,\n", ")\n", "\n", "#Load and run connected component\n", @@ -1794,8 +1750,7 @@ }, "outputs": [], "source": [ - "# cc_compute_res['doc_id'] = cc_compute_res['doc_id'].astype(str)\n", - "# cc_compute_res.groupby('group')['doc_id'].agg(lambda x: ', '.join(x)).reset_index()" + "# cc_compute_res.groupby('group')[input_id_field].agg(list).reset_index()" ] }, { @@ -1815,7 +1770,7 @@ }, "outputs": [], "source": [ - "#Repalce ??? with the group number you want to check\n", + "# Replace ??? 
with the group number you want to check\n", "# cc_compute_res[cc_compute_res['group']==???].head()" ] }, @@ -1847,10 +1802,10 @@ } ], "source": [ - "# Repalce 'ID1' and 'ID2' with IDs you want to check\n", - "# The output is an example of fuzzy duplicates \n", - "\n", - "# jaccard_shuffle_res[jaccard_shuffle_res['id'].isin(['ID1','ID2'])]['text'].unique()" + "# Replace 'ID1' and 'ID2' with IDs you want to check\n", + "#The output is an example of fuzzy duplicates \n", + "# df = input_dataset.df.compute()\n", + "# df[df['id'].isin(['ID1','ID2'])]['text'].unique()" ] }, { @@ -1914,126 +1869,6 @@ "```\n" ] }, - { - "cell_type": "markdown", - "id": "f36436f3", - "metadata": {}, - "source": [ - "### 5.6 Fuzzy deduplication wrapper" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eb52ec06", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from nemo_curator import FuzzyDuplicates, FuzzyDuplicatesConfig" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "625c1828", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "#Input\n", - "fuzzy_dedup_data_path = added_id_output_path\n", - "#Output\n", - "fuzzy_dedup_base_output_path = os.path.join(data_dir,\"fuzzy_wrapper\")\n", - "fuzzy_dedup_log_dir = os.path.join(fuzzy_dedup_base_output_path,'log')\n", - "fuzzy_dedup_cache_dir = os.path.join(fuzzy_dedup_base_output_path,'cache')\n", - "fuzzy_dedup_output_dir = os.path.join(fuzzy_dedup_base_output_path,'data')\n", - "#Specify dataset name\n", - "dataset_name = 'TH_wikipedia'\n", - "\n", - "#Relevant parameters\n", - "id_field = 'id'\n", - "text_field = 'text'\n", - "filetype = \"parquet\"\n", - "\n", - "!mkdir -p {fuzzy_dedup_base_output_path}\n", - "!mkdir -p {fuzzy_dedup_log_dir}\n", - "!mkdir -p {fuzzy_dedup_cache_dir}\n", - "!mkdir -p {fuzzy_dedup_output_dir}" - ] - }, - { - "cell_type": "markdown", - "id": "cb76d8e5", - "metadata": {}, - "source": [ - "**[Optional]** If the cache folder is not empty, please CLEAR the folder before proceeding" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e7fb4c4c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "#!rm -r {fuzzy_dedup_cache_dir}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2368443f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "with dask.config.set({\"dataframe.backend\": 'cudf'}):\n", - " \n", - " t0 = time.time()\n", - " \n", - " input_dataset = DocumentDataset.read_json(fuzzy_dedup_data_path, backend='cudf')\n", - "\n", - " fuzzy_dedup_config = FuzzyDuplicatesConfig(\n", - " cache_dir=fuzzy_dedup_cache_dir,\n", - " id_field=id_field,\n", - " text_field=text_field,\n", - " seed=seed, #Use the seed set in Minhash section for consistency\n", - " char_ngrams=5,\n", - " num_buckets=20,\n", - " hashes_per_bucket=13,\n", - " use_64_bit_hash=False,\n", - " buckets_per_shuffle=5,\n", - " false_positive_check=True,\n", - " num_anchors=2,\n", - " jaccard_threshold=0.8,\n", - " )\n", - " fuzzy_dup = FuzzyDuplicates(logger=fuzzy_dedup_log_dir, config=fuzzy_dedup_config)\n", - " duplicates = fuzzy_dup(dataset=input_dataset)\n", - " \n", - " duplicates.to_parquet(fuzzy_dedup_output_dir, write_to_filename=False)\n", - " \n", - " print(f\"Time taken for Connected Component: {time.time()-t0} s\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14bfe3bc", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fuzzy_dedup_res = 
pd.read_parquet(fuzzy_dedup_output_dir)\n", - "fuzzy_dedup_res.head()" - ] - }, { "cell_type": "markdown", "id": "d2726cf9", @@ -2106,51 +1941,6 @@ "]" ] }, - { - "cell_type": "markdown", - "id": "f55d6737", - "metadata": {}, - "source": [ - "For result of fuzzy deduplication, we need to first reconstructed document ID by combining `dataset_id` and `doc_id`, then use the reconstructed `ID` for removal" - ] - }, - { - "cell_type": "markdown", - "id": "3b9c122d", - "metadata": {}, - "source": [ - "**[Optional]** Uncomment the cell to use result from step by step fuzzy deduplication" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6a1bb0a", - "metadata": {}, - "outputs": [], - "source": [ - "# #List of id_prefix used in Add ID\n", - "# base_ids = [id_prefix]\n", - "\n", - "# #Obtain a mapping between `dataset_id` and `id_prefix`\n", - "# df = cudf.DataFrame()\n", - "# df['base_id'] = [base_id for base_id in base_ids]\n", - "# df['dataset_id'] = df['base_id'].hash_values()\n", - "# df_pd = df.to_pandas()\n", - "# mapping = {\n", - "# hashed_id: base_id\n", - "# for base_id, hashed_id in zip(df_pd['base_id'], df_pd['dataset_id'])\n", - "# }\n", - "\n", - "# #Load result of fuzzy deduplication \n", - "# fuzzy_duplicates = pd.read_parquet(connected_component_output_path)\n", - "# #Reconstruct the original document ID\n", - "# fuzzy_duplicates['id']=fuzzy_duplicates.apply(lambda x: f\"{mapping[x['dataset_id']]}-{x['doc_id']:010d}\", axis=1)\n", - "\n", - "# #Generate list of near duplicate document ID\n", - "# fuzzy_docs_to_remove = fuzzy_duplicates.drop_duplicates(subset=['group'], keep='first')" - ] - }, { "cell_type": "code", "execution_count": null, @@ -2299,18 +2089,10 @@ "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-08-12 06:36:51,616 - distributed.scheduler - WARNING - Removing worker 'tcp://127.0.0.1:32917' caused the cluster to lose already computed task(s), which will be recomputed elsewhere: {('getitem-5b69d236ac9974e9fb86010ffc64382a', 0), ('getitem-1a1421e1fc0bebcfdb81496a35f59d59', 0), ('getitem-a531838794cbb6793b5455275c088d56', 0), ('getitem-5a479f5a8ba45819d7bc110e6f66c5cf', 0), ('getitem-20cb1fb330d399835eab7d541c90d9ad', 0), ('getitem-ea8820d11bd559a47001726946b401f1', 0), ('getitem-dc3a1400f3d825aa608fea3f19009402', 0), ('getitem-fc7ee0a305222d3cbc86116635f8f1b7', 0), ('getitem-5a35ddcf8be5c285f2cc9e07ba4168d6', 0), ('getitem-9f6e0b039afa9a3a892b2eee42fff9ff', 0), ('getitem-aef58cc24b78e9deb456d9854d8056db', 0), ('getitem-cf46f299cd36329b1ec712d5fd751b3a', 0), ('getitem-36157dd00770b4907cf863f121981541', 0), ('getitem-2d4c129c73f6e4bd0add5175ea806475', 0)} (stimulus_id='handle-worker-cleanup-1723444611.6157267')\n" - ] - } - ], + "outputs": [], "source": [ - "cluster = LocalCluster(n_workers=10, processes=True, memory_limit='16GB')\n", - "client = Client(cluster)" + "client = get_client(cluster_type=\"cpu\", n_workers=10, processes=True, memory_limit='16GiB')\n", + "client" ] }, { @@ -2420,7 +2202,7 @@ "score_fields = get_score_fields(filter_pipeline)\n", "\n", "# Load dataset\n", - "dataset = DocumentDataset.read_parquet(HF_input_data_dir, backend='pandas', add_filename=True)\n", + "dataset = DocumentDataset.read_parquet(HF_input_data_dir, files_per_partition=1, blocksize=None, backend='pandas', add_filename=True)\n", "\n", "\n", "# Iterate through filters. 
For each filter, the low quality document will be removed from the dataset and output to corresponding folder for analysis\n", @@ -2487,7 +2269,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": null, "id": "12508f5e", "metadata": { "tags": [] @@ -2501,7 +2283,7 @@ { "cell_type": "code", "execution_count": null, - "id": "83e4aed1", + "id": "181bb75c-88fb-4c18-a010-b5df36c6c781", "metadata": {}, "outputs": [], "source": [] @@ -2523,7 +2305,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.8" } }, "nbformat": 4,
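Below is a minimal, hedged sketch of the streamlined fuzzy-deduplication flow this patch moves the notebook to (LSH buckets -> edge list -> connected components). It is not part of the patch itself; it assumes the `data_dir`, `lsh_output_dir`, `id_field`, and `input_id_field` variables defined earlier in the notebook, reuses only calls that already appear in the cells above, and leaves the final connected-components run/write step to the notebook's existing cell.

```python
import os

# Top-level imports, following the pattern of the other cells in this notebook;
# adjust if your nemo_curator version exposes ConnectedComponents elsewhere.
from nemo_curator import BucketsToEdges, ConnectedComponents
from nemo_curator.datasets import DocumentDataset

# 1) Turn the LSH "_buckets.parquet" output directly into an edge list.
#    This replaces the former _MapBuckets/_Shuffle/JaccardSimilarity stages,
#    i.e. the Jaccard-based false-positive check is skipped.
edgelist_output_dir = os.path.join(data_dir, "fuzzy/buckets_to_edges/data")
buckets = DocumentDataset.read_parquet(lsh_output_dir, backend="cudf")

buckets_to_edges = BucketsToEdges(
    cache_dir=edgelist_output_dir,
    id_fields=input_id_field,  # document ID column(s); defined earlier in the notebook
)
edges = buckets_to_edges(buckets)  # writes "_edges.parquet" under cache_dir

# 2) Group documents that share an edge; rows in the same group are near-duplicates.
#    Note that the explicit jaccard_threshold argument is no longer passed.
components = ConnectedComponents(
    cache_dir=os.path.join(data_dir, "fuzzy/cc/cache"),
    jaccard_pairs_path=edgelist_output_dir,
    id_field=id_field,
)
# Running the computation and writing the grouped result follows the notebook's
# existing "Run Connected Component" cell (not reproduced here).
```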