add more files

Signed-off-by: Sarah Yurick <[email protected]>
NVIDIA · Feb 5, 2025 · 910cbf2 · 910cbf2
1 parent 88675f6
commit 910cbf2
Show file tree

Hide file tree

Showing 17 changed files with 1,202 additions and 159 deletions.
diff --git a/docs/user-guide/api/classifiers.rst b/docs/user-guide/api/classifiers.rst
@@ -14,6 +14,12 @@ Classifiers
 .. autoclass:: nemo_curator.classifiers.FineWebEduClassifier
     :members:
 
+.. autoclass:: nemo_curator.classifiers.FineWebMixtralClassifier
+    :members:
+
+.. autoclass:: nemo_curator.classifiers.FineWebNemotronClassifier
+    :members:
+
 .. autoclass:: nemo_curator.classifiers.AegisClassifier
     :members:
 

diff --git a/docs/user-guide/cpuvsgpu.rst b/docs/user-guide/cpuvsgpu.rst
@@ -71,6 +71,7 @@ The following NeMo Curator modules are GPU based.
   * Quality Classification
   * AEGIS and Instruction-Data-Guard Safety Models
   * FineWeb Educational Content Classification
+  * FineWeb Mixtral and FineWeb Nemotron-4 Educational Models
   * Content Type Classification
   * Prompt Task/Complexity Classification
 

diff --git a/docs/user-guide/distributeddataclassification.rst b/docs/user-guide/distributeddataclassification.rst
@@ -31,6 +31,10 @@ Here, we summarize why each is useful for training an LLM:
 
 - The **FineWeb Educational Content Classifier** focuses on identifying and prioritizing educational material within datasets. This classifier is especially useful for training LLMs on specialized educational content, which can improve their performance on knowledge-intensive tasks. Models trained on high-quality educational content demonstrate enhanced capabilities on academic benchmarks such as MMLU and ARC, showcasing the classifier's impact on improving the knowledge-intensive task performance of LLMs.
 
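+- The **FineWeb Mixtral Educational Classifier** identifies and prioritizes educational content within a dataset, scoring each document from 0 to 5. It is similar to the FineWeb Educational Content Classifier, but was trained using annotations from Mixtral 8x22B-Instruct.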
+
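+- The **FineWeb Nemotron-4 Educational Classifier** identifies and prioritizes educational content within a dataset, scoring each document from 0 to 5. It is similar to the FineWeb Educational Content Classifier, but was trained using annotations from Nemotron-4-340B-Instruct.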
+
 - The **Content Type Classifier** is designed to categorize documents into one of 11 distinct speech types based on their content. It analyzes and understands the nuances of textual information, enabling accurate classification across a diverse range of content types.
 
 - The **Prompt Task/Complexity Classifier** is a multi-headed model which classifies English text prompts across task types and complexity dimensions.
@@ -236,6 +240,90 @@ For example, to create a dataset with only highly educational content (scores 4
     high_edu_dataset = result_dataset[result_dataset["fineweb-edu-score-int"] >= 4]
     high_edu_dataset.to_json("high_educational_content/")
 
+NemoCurator FineWeb Mixtral Edu Classifier
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. TODO: Expand this section with details specific to the FineWeb Mixtral Edu Classifier.
+
+The FineWeb Mixtral Edu Classifier is designed to identify and prioritize educational content within a dataset.
+Educational content classification helps identify and prioritize educational material within datasets, which is particularly useful for creating specialized datasets like FineWeb-Edu.
+These datasets can be used to train LLMs with a focus on educational content, potentially improving their performance on knowledge-intensive tasks.
+
+To use the FineWeb Mixtral Edu Classifier, you can follow this example:
+
+.. code-block:: python
+
+    from nemo_curator.classifiers import FineWebMixtralClassifier
+
+    files = get_all_files_paths_under("web_documents/")
+    input_dataset = DocumentDataset.read_json(files, backend="cudf")
+
+    classifier = FineWebMixtralClassifier(
+        batch_size=256,
+        text_field="text",
+        pred_column="fineweb-mixtral-score",
+        int_column="fineweb-mixtral-score-int",
+        label_column="fineweb-mixtral-score-label",
+    )
+    result_dataset = classifier(dataset=input_dataset)
+
+    result_dataset.to_json("educational_content/")
+
+This classifier uses a model based on the `Snowflake Arctic-embed-m <https://huggingface.co/Snowflake/snowflake-arctic-embed-m>`_ embedding model with a linear regression layer on top.
+It assigns an educational score to each document on a scale from 0 to 5, where higher scores indicate more educational content.
+
+The ``pred_column`` will contain the raw floating-point scores, while the ``int_column`` will contain the rounded integer scores.
+The ``label_column`` identifies text as high quality if it scores higher than 2.5 and low quality otherwise.
+You can filter the results based on these scores to create datasets with varying levels of educational content.
+
+For example, to create a dataset with only highly educational content (scores 4 and 5):
+
+.. code-block:: python
+
+    high_edu_dataset = result_dataset[result_dataset["fineweb-mixtral-score-int"] >= 4]
+    high_edu_dataset.to_json("high_educational_content/")
+
+NemoCurator FineWeb Nemotron-4 Edu Classifier
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. TODO: Expand this section with details specific to the FineWeb Nemotron-4 Edu Classifier.
+
+The FineWeb Nemotron-4 Edu Classifier is designed to identify and prioritize educational content within a dataset.
+Educational content classification helps identify and prioritize educational material within datasets, which is particularly useful for creating specialized datasets like FineWeb-Edu.
+These datasets can be used to train LLMs with a focus on educational content, potentially improving their performance on knowledge-intensive tasks.
+
+To use the FineWeb Nemotron-4 Edu Classifier, you can follow this example:
+
+.. code-block:: python
+
+    from nemo_curator.classifiers import FineWebNemotronClassifier
+
+    files = get_all_files_paths_under("web_documents/")
+    input_dataset = DocumentDataset.read_json(files, backend="cudf")
+
+    classifier = FineWebNemotronClassifier(
+        batch_size=256,
+        text_field="text",
+        pred_column="fineweb-nemotron-score",
+        int_column="fineweb-nemotron-score-int",
+        label_column="fineweb-nemotron-score-label",
+    )
+    result_dataset = classifier(dataset=input_dataset)
+
+    result_dataset.to_json("educational_content/")
+
+This classifier uses a model based on the `Snowflake Arctic-embed-m <https://huggingface.co/Snowflake/snowflake-arctic-embed-m>`_ embedding model with a linear regression layer on top.
+It assigns an educational score to each document on a scale from 0 to 5, where higher scores indicate more educational content.
+
+The ``pred_column`` will contain the raw floating-point scores, while the ``int_column`` will contain the rounded integer scores.
+The ``label_column`` identifies text as high quality if it scores higher than 2.5 and low quality otherwise.
+You can filter the results based on these scores to create datasets with varying levels of educational content.
+
+For example, to create a dataset with only highly educational content (scores 4 and 5):
+
+.. code-block:: python
+
+    high_edu_dataset = result_dataset[result_dataset["fineweb-nemotron-score-int"] >= 4]
+    high_edu_dataset.to_json("high_educational_content/")
+
 Content Type Classifier
 ^^^^^^^^^^^^^^^^^^^^^^^
 

diff --git a/examples/classifiers/README.md b/examples/classifiers/README.md
@@ -8,6 +8,8 @@ The Python scripts in this directory demonstrate how to run classification on yo
 - AEGIS Safety Models
 - Instruction-Data-Guard Model
 - FineWeb Educational Content Classifier
+- FineWeb Mixtral Educational Classifier
+- FineWeb Nemotron-4 Educational Classifier
 - Content Type Classifier
 - Prompt Task/Complexity Classifier
 

diff --git a/examples/classifiers/fineweb_mixtral_example.py b/examples/classifiers/fineweb_mixtral_example.py
@@ -42,8 +42,7 @@ def main(args):
 
     global_et = time.time()
     print(
-        # TODO
-        f"Total time taken for classifier inference: {global_et-global_st} s",
+        f"Total time taken for FineWeb Mixtral Edu Classifier inference: {global_et-global_st} s",
         flush=True,
     )
 

diff --git a/examples/classifiers/fineweb_nemotron_example.py b/examples/classifiers/fineweb_nemotron_example.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import time
+
+from nemo_curator.classifiers import FineWebNemotronClassifier
+from nemo_curator.datasets import DocumentDataset
+from nemo_curator.utils.distributed_utils import get_client
+from nemo_curator.utils.script_utils import ArgumentHelper
+
+
+def main(args):
+    """Run the FineWeb Nemotron-4 Edu Classifier example end to end.
+
+    Starts a GPU cluster client, reads the input JSON dataset with the cuDF
+    backend, classifies it, writes the results back out per input filename,
+    and prints the total wall-clock time.
+
+    Args:
+        args: Parsed command-line arguments; forwarded to
+            ``ArgumentHelper.parse_client_args`` to build the client options.
+    """
+    global_st = time.time()
+
+    # Input can be a string or list
+    input_file_path = "/path/to/data"
+    output_file_path = "./"
+
+    client_args = ArgumentHelper.parse_client_args(args)
+    # The classifier is GPU based, so always request a GPU cluster.
+    client_args["cluster_type"] = "gpu"
+    client = get_client(**client_args)
+
+    # Close the client even if reading, inference, or writing raises.
+    try:
+        # add_filename=True pairs with write_to_filename=True below so each
+        # output record is written under the name of the file it came from.
+        input_dataset = DocumentDataset.read_json(
+            input_file_path, backend="cudf", add_filename=True
+        )
+
+        fineweb_nemotron_classifier = FineWebNemotronClassifier()
+        result_dataset = fineweb_nemotron_classifier(dataset=input_dataset)
+        result_dataset.to_json(output_path=output_file_path, write_to_filename=True)
+
+        global_et = time.time()
+        print(
+            f"Total time taken for FineWeb Nemotron-4 Edu Classifier inference: {global_et-global_st} s",
+            flush=True,
+        )
+    finally:
+        client.close()
+
+
+def attach_args(parser=None):
+    """Register the distributed-classifier cluster arguments on a parser.
+
+    Args:
+        parser: Optional ``argparse.ArgumentParser`` to extend. When omitted,
+            a new parser using ``ArgumentDefaultsHelpFormatter`` is created on
+            every call (not once at definition time), so repeated calls never
+            re-register arguments on a shared parser.
+
+    Returns:
+        The parser held by the ``ArgumentHelper`` after the cluster arguments
+        have been added.
+    """
+    if parser is None:
+        parser = argparse.ArgumentParser(
+            formatter_class=argparse.ArgumentDefaultsHelpFormatter
+        )
+
+    argument_helper = ArgumentHelper(parser)
+    argument_helper.add_distributed_classifier_cluster_args()
+
+    return argument_helper.parser
+
+
+# Script entry point: build the CLI parser, parse the arguments, and run the example.
+if __name__ == "__main__":
+    main(attach_args().parse_args())
diff --git a/nemo_curator/classifiers/__init__.py b/nemo_curator/classifiers/__init__.py
@@ -18,8 +18,7 @@
 from .aegis import AegisClassifier, InstructionDataGuardClassifier
 from .content_type import ContentTypeClassifier
 from .domain import DomainClassifier, MultilingualDomainClassifier
-from .fineweb_edu import FineWebEduClassifier
-from .fineweb_mixtral import FineWebMixtralClassifier
+from .fineweb_edu import FineWebEduClassifier, FineWebMixtralClassifier, FineWebNemotronClassifier
 from .prompt_task_complexity import PromptTaskComplexityClassifier
 from .quality import QualityClassifier
 
@@ -31,6 +30,7 @@
     "InstructionDataGuardClassifier",
     "FineWebEduClassifier",
     "FineWebMixtralClassifier",
+    "FineWebNemotronClassifier",
     "ContentTypeClassifier",
     "PromptTaskComplexityClassifier",
 ]