From ba103ed3faf5b9dca3a0a91bacfde11438d9875c Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 4 Feb 2025 12:54:08 -0800 Subject: [PATCH 1/4] Add improved cleaning features Signed-off-by: Ryan Wolf --- nemo_curator/modifiers/__init__.py | 4 + nemo_curator/modifiers/newline_normalizer.py | 33 ++++ nemo_curator/modifiers/url_remover.py | 30 ++++ tests/test_cleaning.py | 151 +++++++++++++++++++ tests/test_unicode_reformatter.py | 59 -------- 5 files changed, 218 insertions(+), 59 deletions(-) create mode 100644 nemo_curator/modifiers/newline_normalizer.py create mode 100644 nemo_curator/modifiers/url_remover.py create mode 100644 tests/test_cleaning.py delete mode 100644 tests/test_unicode_reformatter.py diff --git a/nemo_curator/modifiers/__init__.py b/nemo_curator/modifiers/__init__.py index f6511fdb..e4b9a62a 100644 --- a/nemo_curator/modifiers/__init__.py +++ b/nemo_curator/modifiers/__init__.py @@ -15,8 +15,10 @@ from .c4 import BoilerPlateStringModifier from .doc_modifier import DocumentModifier from .fasttext import FastTextLabelModifier +from .newline_normalizer import NewlineNormalizer from .pii_modifier import PiiModifier from .unicode_reformatter import UnicodeReformatter +from .url_remover import UrlRemover __all__ = [ "DocumentModifier", @@ -24,4 +26,6 @@ "FastTextLabelModifier", "UnicodeReformatter", "PiiModifier", + "NewlineNormalizer", + "UrlRemover", ] diff --git a/nemo_curator/modifiers/newline_normalizer.py b/nemo_curator/modifiers/newline_normalizer.py new file mode 100644 index 00000000..020403c1 --- /dev/null +++ b/nemo_curator/modifiers/newline_normalizer.py @@ -0,0 +1,33 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re + +from nemo_curator.modifiers import DocumentModifier + +THREE_OR_MORE_NEWLINES_REGEX = re.compile(r"(\n){3,}") +THREE_OR_MORE_WINDOWS_NEWLINES_REGEX = re.compile(r"(\r\n){3,}") + + +class NewlineNormalizer(DocumentModifier): + """ + Replaces 3 or more consecutive newline characters with only 2 newline characters. + """ + + def __init__(self): + super().__init__() + + def modify_document(self, text): + text = THREE_OR_MORE_NEWLINES_REGEX.sub("\n\n", text) + text = THREE_OR_MORE_WINDOWS_NEWLINES_REGEX.sub("\r\n\r\n", text) + return text diff --git a/nemo_curator/modifiers/url_remover.py b/nemo_curator/modifiers/url_remover.py new file mode 100644 index 00000000..8c4fa9f2 --- /dev/null +++ b/nemo_curator/modifiers/url_remover.py @@ -0,0 +1,30 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import re + +from nemo_curator.modifiers import DocumentModifier + +URL_REGEX = re.compile(r"https?://\S+|www\.\S+", flags=re.IGNORECASE) + + +class UrlRemover(DocumentModifier): + """ + Removes all urls in a document. + """ + + def __init__(self): + super().__init__() + + def modify_document(self, text): + return URL_REGEX.sub("", text) diff --git a/tests/test_cleaning.py b/tests/test_cleaning.py new file mode 100644 index 00000000..30152a4b --- /dev/null +++ b/tests/test_cleaning.py @@ -0,0 +1,151 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dask.dataframe as dd +import pandas as pd + +from nemo_curator import Modify +from nemo_curator.datasets import DocumentDataset +from nemo_curator.modifiers import NewlineNormalizer, UnicodeReformatter, UrlRemover + + +def list_to_dataset(documents, col_name="text", npartitions=2): + data = {col_name: documents} + pdf = pd.DataFrame(data) + + return DocumentDataset(dd.from_pandas(pdf, npartitions=npartitions)) + + +class TestUnicodeReformatter: + def test_reformatting(self): + # Examples taken from ftfy documentation: + # https://ftfy.readthedocs.io/en/latest/ + dataset = list_to_dataset( + [ + "✔ No problems", + "The Mona Lisa doesn’t have eyebrows.", + "l’humanité", + "à perturber la réflexion", + "Clean document already.", + ] + ) + expected_results = [ + "✔ No problems", + "The Mona Lisa doesn't have eyebrows.", + "l'humanité", + "à perturber la réflexion", + "Clean document already.", + ] + expected_results.sort() + + modifier = Modify(UnicodeReformatter()) + fixed_dataset = modifier(dataset) + actual_results = fixed_dataset.df.compute()["text"].to_list() + actual_results.sort() + + assert ( + expected_results == actual_results + ), f"Expected: {expected_results}, but got: {actual_results}" + + +class TestNewlineNormalizer: + def test_just_newlines(self): + dataset = list_to_dataset( + [ + "The quick brown fox jumps over the lazy dog", + "The quick\nbrown fox jumps \nover the lazy dog", + "The quick\n\nbrown fox jumps \n\nover the lazy dog", + "The quick\n\n\nbrown fox jumps \n\n\nover the lazy dog", + "The quick\n\n\nbrown fox jumps \nover the lazy dog", + ] + ) + expected_results = [ + "The quick brown fox jumps over the lazy dog", + "The quick\nbrown fox jumps \nover the lazy dog", + "The quick\n\nbrown fox jumps \n\nover the lazy dog", + "The quick\n\nbrown fox jumps \n\nover the lazy dog", + "The quick\n\nbrown fox jumps \nover the lazy dog", + ] + expected_results.sort() + + modifier = Modify(NewlineNormalizer()) + fixed_dataset = modifier(dataset) + actual_results = fixed_dataset.df.compute()["text"].to_list() + actual_results.sort() + + assert ( + expected_results == actual_results + ), f"Expected: {expected_results}, but got: {actual_results}" + + def test_newlines_and_carriage_returns(self): + dataset = list_to_dataset( + [ + "The quick brown fox jumps over 
the lazy dog", + "The quick\nbrown fox jumps \nover the lazy dog", + "The quick\n\nbrown fox jumps \n\nover the lazy dog", + "The quick\n\n\nbrown fox jumps \n\n\nover the lazy dog", + "The quick\n\n\nbrown fox jumps \nover the lazy dog", + ] + ) + expected_results = [ + "The quick brown fox jumps over the lazy dog", + "The quick\r\nbrown fox jumps \r\nover the lazy dog", + "The quick\r\n\r\nbrown fox jumps \r\n\r\nover the lazy dog", + "The quick\r\n\r\n\r\nbrown fox jumps \r\n\r\n\r\nover the lazy dog", + "The quick\r\n\r\nbrown fox jumps \r\nover the lazy dog", + ] + expected_results.sort() + + modifier = Modify(NewlineNormalizer()) + fixed_dataset = modifier(dataset) + actual_results = fixed_dataset.df.compute()["text"].to_list() + actual_results.sort() + + assert ( + expected_results == actual_results + ), f"Expected: {expected_results}, but got: {actual_results}" + + +class TestUrlRemover: + def test_urls(self): + dataset = list_to_dataset( + [ + "This is a url: www.nvidia.com", + "This is a url: http://www.nvidia.com", + "This is a url: https://www.nvidia.com", + "This is a url: https://www.nvidia.gov", + "This is a url: https://nvidia.com", + "This is a url: HTTPS://WWW.NVIDIA.COM", + "This is not a url: git@github.com:NVIDIA/NeMo-Curator.git", + ] + ) + expected_results = [ + "This is a url: ", + "This is a url: ", + "This is a url: ", + "This is a url: ", + "This is a url: ", + "This is a url: ", + "This is not a url: git@github.com:NVIDIA/NeMo-Curator.git", + ] + expected_results.sort() + + modifier = Modify(UrlRemover()) + fixed_dataset = modifier(dataset) + actual_results = fixed_dataset.df.compute()["text"].to_list() + actual_results.sort() + + assert ( + expected_results == actual_results + ), f"Expected: {expected_results}, but got: {actual_results}" diff --git a/tests/test_unicode_reformatter.py b/tests/test_unicode_reformatter.py deleted file mode 100644 index 01ac716b..00000000 --- a/tests/test_unicode_reformatter.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import dask.dataframe as dd -import pandas as pd - -import nemo_curator -from nemo_curator.datasets import DocumentDataset -from nemo_curator.modifiers import UnicodeReformatter - - -def list_to_dataset(documents, col_name="text", npartitions=2): - data = {col_name: documents} - pdf = pd.DataFrame(data) - - return DocumentDataset(dd.from_pandas(pdf, npartitions=npartitions)) - - -class TestUnicodeReformatter: - def test_reformatting(self): - # Examples taken from ftfy documentation: - # https://ftfy.readthedocs.io/en/latest/ - dataset = list_to_dataset( - [ - "✔ No problems", - "The Mona Lisa doesn’t have eyebrows.", - "l’humanité", - "à perturber la réflexion", - "Clean document already.", - ] - ) - expected_results = [ - "✔ No problems", - "The Mona Lisa doesn't have eyebrows.", - "l'humanité", - "à perturber la réflexion", - "Clean document already.", - ] - expected_results.sort() - - modifier = nemo_curator.Modify(UnicodeReformatter()) - fixed_dataset = modifier(dataset) - actual_results = fixed_dataset.df.compute()["text"].to_list() - actual_results.sort() - - assert ( - expected_results == actual_results - ), f"Expected: {expected_results}, but got: {actual_results}" From 46767931a25f87f99fcac9f06fbda7d7149d6bba Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 4 Feb 2025 13:01:24 -0800 Subject: [PATCH 2/4] Fix cleaning tests Signed-off-by: Ryan Wolf --- tests/test_cleaning.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_cleaning.py b/tests/test_cleaning.py index 30152a4b..906da391 100644 --- a/tests/test_cleaning.py +++ b/tests/test_cleaning.py @@ -92,17 +92,17 @@ def test_newlines_and_carriage_returns(self): dataset = list_to_dataset( [ "The quick brown fox jumps over the lazy dog", - "The quick\nbrown fox jumps \nover the lazy dog", - "The quick\n\nbrown fox jumps \n\nover the lazy dog", - "The quick\n\n\nbrown fox jumps \n\n\nover the lazy dog", - "The quick\n\n\nbrown fox jumps \nover the lazy dog", + "The quick\r\nbrown fox jumps \r\nover the lazy dog", + "The quick\r\n\r\nbrown fox jumps \r\n\r\nover the lazy dog", + "The quick\r\n\r\n\r\nbrown fox jumps \r\n\r\n\r\nover the lazy dog", + "The quick\r\n\r\n\r\nbrown fox jumps \r\nover the lazy dog", ] ) expected_results = [ "The quick brown fox jumps over the lazy dog", "The quick\r\nbrown fox jumps \r\nover the lazy dog", "The quick\r\n\r\nbrown fox jumps \r\n\r\nover the lazy dog", - "The quick\r\n\r\n\r\nbrown fox jumps \r\n\r\n\r\nover the lazy dog", + "The quick\r\n\r\nbrown fox jumps \r\n\r\nover the lazy dog", "The quick\r\n\r\nbrown fox jumps \r\nover the lazy dog", ] expected_results.sort() From ddc4cb3c65139b072db352ba0ad8095abff0d0f2 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 4 Feb 2025 14:32:03 -0800 Subject: [PATCH 3/4] Update documentation and CLI scripts Signed-off-by: Ryan Wolf --- README.md | 4 +- docs/user-guide/index.rst | 7 +- ...matting.rst => languageidentification.rst} | 40 +------- docs/user-guide/text-curation.rst | 10 +- docs/user-guide/textcleaning.rst | 95 +++++++++++++++++++ ...d_fix_unicode.py => identify_languages.py} | 19 +--- nemo_curator/scripts/text_cleaning.py | 24 ++++- 7 files changed, 133 insertions(+), 66 deletions(-) rename docs/user-guide/{languageidentificationunicodeformatting.rst => languageidentification.rst} (60%) create mode 100644 docs/user-guide/textcleaning.rst rename examples/{identify_languages_and_fix_unicode.py => identify_languages.py} (79%) diff --git a/README.md b/README.md index d52129f4..77b32836 100644 --- 
a/README.md +++ b/README.md @@ -23,8 +23,8 @@ All of our text pipelines have great multilingual support. - [Download and Extraction](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/download.html) - Default implementations for Common Crawl, Wikipedia, and ArXiv sources - Easily customize and extend to other sources -- [Language Identification](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/languageidentificationunicodeformatting.html) -- [Unicode Reformatting](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/languageidentificationunicodeformatting.html) +- [Language Identification](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/languageidentification.html) +- [Text Cleaning](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/textcleaning.html) - [Heuristic Filtering](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/qualityfiltering.html) - Classifier Filtering - [fastText](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/qualityfiltering.html) diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index a9c589ac..ad8eb68c 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -16,8 +16,11 @@ Text Curation :ref:`Document Filtering ` This section describes how to use the 30+ heuristic and classifier filters available within the NeMo Curator and implement custom filters to apply to the documents within the corpora. -:ref:`Language Identification and Unicode Fixing ` - Large, unlabeled text corpora often contain a variety of languages. The NeMo Curator provides utilities to identify languages and fix improperly decoded Unicode characters. +:ref:`Language Identification ` + Large, unlabeled text corpora often contain a variety of languages. NeMo Curator provides utilities to identify languages. + +:ref:`Text Cleaning ` + Many parts of the internet contained malformed or poorly formatted text. NeMo Curator can fix many of these issues with text. :ref:`GPU Accelerated Exact and Fuzzy Deduplication ` Both exact and fuzzy deduplication functionalities are supported in NeMo Curator and accelerated using RAPIDS cuDF. diff --git a/docs/user-guide/languageidentificationunicodeformatting.rst b/docs/user-guide/languageidentification.rst similarity index 60% rename from docs/user-guide/languageidentificationunicodeformatting.rst rename to docs/user-guide/languageidentification.rst index 3e61f8f7..561f14c3 100644 --- a/docs/user-guide/languageidentificationunicodeformatting.rst +++ b/docs/user-guide/languageidentification.rst @@ -11,10 +11,8 @@ Background Large unlabeled text corpora often contain a variety of languages. However, data curation usually includes steps that are language specific (e.g. using language-tuned heuristics for quality filtering) and many curators are only interested in curating a monolingual dataset. -Datasets also may have improperly decoded unicode characters (e.g. "The Mona Lisa doesn't have eyebrows." decoding as "The Mona Lisa doesn’t have eyebrows."). -NeMo Curator provides utilities to identify languages and fix improperly decoded unicode characters. -The language identification is performed using `fastText `_ and unicode fixing is performed using `ftfy `_. +NeMo Curator provides utilities to identify languages using `fastText `_. 
Even though a preliminary language identification may have been performed on the unextracted text (as is the case in our Common Crawl pipeline using pyCLD2), `fastText `_ is more accurate so it can be used for a second pass. @@ -22,29 +20,8 @@ using pyCLD2), `fastText str: - return ftfy.fix_text(text) - -Also like the ``DocumentFilter`` functions, ``modify_document`` can be annotated with ``batched`` to take in a pandas series of documents instead of a single document. ----------------------------------------- Related Scripts @@ -79,15 +56,4 @@ within that file. Below is an example run command for :code:`separate_by_metadat --output-metadata-distribution=./data/lang_distro.json After running this module, the output directory will consist of one directory per language present within the corpus and all documents -within those directories will contain text that originates from the same language. Finally, the text within a specific language can have -its unicode fixed using the :code:`text_cleaning` module - -.. code-block:: bash - - text_cleaning \ - --input-data-dir=/EN \ - --output-clean-dir= - - -The above :code:`text_cleaning` module uses the heuristics defined within the :code:`ftfy` package that is commonly used for fixing -improperly decoded unicode. +within those directories will contain text that originates from the same language. diff --git a/docs/user-guide/text-curation.rst b/docs/user-guide/text-curation.rst index 4d2e1ddb..c015d4c8 100644 --- a/docs/user-guide/text-curation.rst +++ b/docs/user-guide/text-curation.rst @@ -13,8 +13,11 @@ Text Curation :ref:`Document Filtering ` This section describes how to use the 30+ heuristic and classifier filters available within the NeMo Curator and implement custom filters to apply to the documents within the corpora. -:ref:`Language Identification and Unicode Fixing ` - Large, unlabeled text corpora often contain a variety of languages. The NeMo Curator provides utilities to identify languages and fix improperly decoded Unicode characters. +:ref:`Language Identification ` + Large, unlabeled text corpora often contain a variety of languages. NeMo Curator provides utilities to identify languages. + +:ref:`Text Cleaning ` + Many parts of the internet contained malformed or poorly formatted text. NeMo Curator can fix many of these issues with text. :ref:`GPU Accelerated Exact and Fuzzy Deduplication ` Both exact and fuzzy deduplication functionalities are supported in NeMo Curator and accelerated using RAPIDS cuDF. @@ -43,7 +46,8 @@ Text Curation documentdataset.rst cpuvsgpu.rst qualityfiltering.rst - languageidentificationunicodeformatting.rst + languageidentification.rst + textcleaning.rst gpudeduplication.rst semdedup.rst syntheticdata.rst diff --git a/docs/user-guide/textcleaning.rst b/docs/user-guide/textcleaning.rst new file mode 100644 index 00000000..9bb022ee --- /dev/null +++ b/docs/user-guide/textcleaning.rst @@ -0,0 +1,95 @@ +.. _data-curator-text-cleaning: + +========================= +Text Cleaning +========================= + +-------------------- +Overview +-------------------- +Documents in datasets may contain improperly decoded characters (e.g. "The Mona Lisa doesn't have eyebrows." decoding as "The Mona Lisa doesn’t have eyebrows."), inconsistent line spacing, and many urls. +NeMo Curator provides a few modules that can help remove undesirable text from within individual documents. + +-------------------- +Use Cases +-------------------- +* Fixing improperly decoded unicode characters from webpages. 
+* Standardizing document layout by removing excessive newlines.
+* Removing URLs in documents.
+
+--------------------
+Modules
+--------------------
+NeMo Curator provides a collection of easy to use modules for cleaning text.
+
+.. code-block:: python
+
+    from nemo_curator import Sequential, Modify, get_client
+    from nemo_curator.datasets import DocumentDataset
+    from nemo_curator.modifiers import UnicodeReformatter, UrlRemover, NewlineNormalizer
+
+    def main():
+        client = get_client(cluster_type="cpu")
+
+        dataset = DocumentDataset.read_json("books.jsonl")
+        cleaning_pipeline = Sequential([
+            Modify(UnicodeReformatter()),
+            Modify(NewlineNormalizer()),
+            Modify(UrlRemover()),
+        ])
+
+        cleaned_dataset = cleaning_pipeline(dataset)
+
+        cleaned_dataset.to_json("cleaned_books.jsonl")
+
+    if __name__ == "__main__":
+        main()
+
+Here, we load a dataset and perform all of the cleaning operations that NeMo Curator supports.
+* ``Modify(UnicodeReformatter())``: Uses `ftfy <https://ftfy.readthedocs.io/en/latest/>`_ to fix broken Unicode characters. Modifies the `"text"` field of the dataset by default. This can be changed by setting ``Modify(UnicodeReformatter(), text_field="my_field")``.
+* ``Modify(NewlineNormalizer())``: Uses regex to replace 3 or more consecutive newline characters in each document with only 2 newline characters.
+* ``Modify(UrlRemover())``: Uses regex to remove all urls in each document
+
+Any subset of these steps can be run at a time.
+
+Additionally, NeMo Curator has the ``text_cleaning`` CLI command that can perform the same functions:
+
+.. code-block:: bash
+
+    text_cleaning \
+        --input-data-dir=/path/to/input/ \
+        --output-clean-dir=/path/to/output/ \
+        --normalize-newlines \
+        --remove-urls
+
+By default, the CLI will only perform unicode reformatting. Adding the ``--normalize-newlines`` and ``--remove-urls`` options enables the other text cleaning steps.
+
+------------------------
+Custom Text Cleaner
+------------------------
+It's easy to write your own custom text cleaner. The implementation of ``UnicodeReformatter`` can be used as an example.
+
+.. code-block:: python
+
+    import ftfy
+
+    from nemo_curator.modifiers import DocumentModifier
+
+
+    class UnicodeReformatter(DocumentModifier):
+        def __init__(self):
+            super().__init__()
+
+        def modify_document(self, text: str) -> str:
+            return ftfy.fix_text(text)
+
+Simply define a new class that inherits from ``DocumentModifier`` and define the constructor and ``modify_document`` method.
+Also, like the ``DocumentFilter`` class, ``modify_document`` can be annotated with ``batched`` to take in a pandas series of documents instead of a single document.
+See the :ref:`document filtering page ` for more information.
+
+---------------------------
+Additional Resources
+---------------------------
+* `Single GPU Tutorial `_
+* `ftfy <https://ftfy.readthedocs.io/en/latest/>`_
+* `Refined Web Paper `_
+* `Nemotron-CC Paper `_
\ No newline at end of file
diff --git a/examples/identify_languages_and_fix_unicode.py b/examples/identify_languages.py
similarity index 79%
rename from examples/identify_languages_and_fix_unicode.py
rename to examples/identify_languages.py
index 92f628e3..2a090da0 100644
--- a/examples/identify_languages_and_fix_unicode.py
+++ b/examples/identify_languages.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,13 +13,11 @@
 # limitations under the License.
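+
+"""Example: classify each document's language with a fastText model and split the dataset by language."""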
 
 import argparse
-import os
 
 import nemo_curator as nc
 from nemo_curator.datasets import DocumentDataset
 from nemo_curator.filters import FastTextLangId
-from nemo_curator.modifiers import UnicodeReformatter
-from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk
+from nemo_curator.utils.distributed_utils import get_client, read_data
 from nemo_curator.utils.file_utils import (
     get_all_files_paths_under,
     separate_by_metadata,
@@ -45,7 +43,6 @@ def main(args):
     # and see a list of supported languages here:
     # https://fasttext.cc/docs/en/language-identification.html
     model_path = "/path/to/model.bin"
-    target_language = "EN"
     language_field = "language"
 
     # Prepare samples for the classifier
@@ -70,18 +67,6 @@ def main(args):
         metadata_field=language_field,
     ).compute()
 
-    # Read the language specific data and fix the unicode in it
-    lang_data_path = os.path.join(language_separated_output_path, target_language)
-    if not os.path.exists(lang_data_path):
-        raise RuntimeError(f"Dataset did not have language: {target_language}")
-    lang_data = load_dataset(lang_data_path)
-
-    cleaner = nc.Modify(UnicodeReformatter())
-    cleaned_data = cleaner(lang_data)
-
-    # Write the cleaned_data
-    write_to_disk(cleaned_data.df, cleaned_data_output_path, write_to_filename=True)
-
 
 def attach_args(
     parser=argparse.ArgumentParser(
diff --git a/nemo_curator/scripts/text_cleaning.py b/nemo_curator/scripts/text_cleaning.py
index f05a3843..762a063c 100644
--- a/nemo_curator/scripts/text_cleaning.py
+++ b/nemo_curator/scripts/text_cleaning.py
@@ -14,9 +14,9 @@
 
 import argparse
 
-import nemo_curator
+from nemo_curator import Modify, Sequential
 from nemo_curator.datasets import DocumentDataset
-from nemo_curator.modifiers import UnicodeReformatter
+from nemo_curator.modifiers import NewlineNormalizer, UnicodeReformatter, UrlRemover
 from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk
 from nemo_curator.utils.file_utils import expand_outdir_and_mkdir, get_batched_files
 from nemo_curator.utils.script_utils import ArgumentHelper
@@ -28,9 +28,14 @@ def main(args):
     # Make the output directories
     output_clean_dir = expand_outdir_and_mkdir(args.output_clean_dir)
 
-    cleaner = nemo_curator.Modify(
-        UnicodeReformatter(), text_field=args.input_text_field
-    )
+    # Unicode reformatting always runs; the other stages are enabled by CLI flags.
+    stages = [Modify(UnicodeReformatter(), text_field=args.input_text_field)]
+
+    if args.normalize_newlines:
+        stages.append(Modify(NewlineNormalizer(), text_field=args.input_text_field))
+    if args.remove_urls:
+        stages.append(Modify(UrlRemover(), text_field=args.input_text_field))
+
+    cleaner = Sequential(stages)
 
     for files in get_batched_files(
         args.input_data_dir,
@@ -79,6 +84,15 @@ def attach_args(
     argumentHelper.add_arg_input_text_field()
     argumentHelper.add_arg_output_file_type()
     argumentHelper.add_distributed_args()
+    argumentHelper.attach_bool_arg(
+        parser,
+        "normalize-newlines",
+        default=False,
+        help="Replace 3 or more consecutive newline characters in each document with only 2 newline characters.",
+    )
+    argumentHelper.attach_bool_arg(
+        parser, "remove-urls", default=False, help="Removes all urls in each document."
+    )
     parser.add_argument(
         "--output-clean-dir",
         type=str,

From abcf93f6f259e28a8fe6555751a41278dea3abaa Mon Sep 17 00:00:00 2001
From: Ryan Wolf
Date: Wed, 5 Feb 2025 09:26:16 -0800
Subject: [PATCH 4/4] Address Sarah and Lawrence's reviews

Signed-off-by: Ryan Wolf

---
 docs/user-guide/index.rst                     |  2 +-
 .../{textcleaning.rst => text-cleaning.rst}   | 31 ++++++++++---------
 docs/user-guide/text-curation.rst             |  2 +-
 examples/README.md                            |  2 +-
 nemo_curator/modifiers/url_remover.py         |  2 +-
 nemo_curator/scripts/text_cleaning.py         |  2 +-
 6 files changed, 22 insertions(+), 19 deletions(-)
 rename docs/user-guide/{textcleaning.rst => text-cleaning.rst} (60%)

diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst
index ad8eb68c..b63e1b93 100644
--- a/docs/user-guide/index.rst
+++ b/docs/user-guide/index.rst
@@ -20,7 +20,7 @@ Text Curation
    Large, unlabeled text corpora often contain a variety of languages. NeMo Curator provides utilities to identify languages.
 
 :ref:`Text Cleaning `
-   Many parts of the internet contained malformed or poorly formatted text. NeMo Curator can fix many of these issues with text.
+   Many parts of the Internet contain malformed or poorly formatted text. NeMo Curator can fix many of these issues in text.
diff --git a/docs/user-guide/textcleaning.rst b/docs/user-guide/text-cleaning.rst
similarity index 60%
rename from docs/user-guide/textcleaning.rst
rename to docs/user-guide/text-cleaning.rst
index 9bb022ee..b9ffaa2f 100644
--- a/docs/user-guide/textcleaning.rst
+++ b/docs/user-guide/text-cleaning.rst
@@ -7,20 +7,30 @@ Text Cleaning
 --------------------
 Overview
 --------------------
-Documents in datasets may contain improperly decoded characters (e.g. "The Mona Lisa doesn't have eyebrows." decoding as "The Mona Lisa doesn’t have eyebrows."), inconsistent line spacing, and many urls.
-NeMo Curator provides a few modules that can help remove undesirable text from within individual documents.
+Use NeMo Curator's text cleaning modules to remove undesirable text such as improperly decoded unicode characters, inconsistent line spacing, or excessive URLs from documents being pre-processed for a dataset.
+
+For example, the input sentence ``"The Mona Lisa doesn't have eyebrows."`` from a given document may not have included a properly encoded apostrophe (``'``), resulting in the sentence decoding as ``"The Mona Lisa doesn’t have eyebrows."`` NeMo Curator enables you to easily run this document through the default ``UnicodeReformatter()`` module to detect and fix the improperly decoded text, or you can define your own custom unicode text cleaner tailored to your needs.
 
 --------------------
 Use Cases
 --------------------
-* Fixing improperly decoded unicode characters from webpages.
-* Standardizing document layout by removing excessive newlines.
-* Removing URLs in documents.
+* Fix improperly decoded Unicode characters from webpages.
+* Standardize document layout by removing excessive newlines.
+* Remove URLs in documents.
 
 --------------------
 Modules
 --------------------
-NeMo Curator provides a collection of easy to use modules for cleaning text.
+NeMo Curator provides the following modules for cleaning text:
+
+- ``UnicodeReformatter()``: Uses `ftfy <https://ftfy.readthedocs.io/en/latest/>`_ to fix broken Unicode characters. Modifies the ``"text"`` field of the dataset by default.
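+  To clean a different field, pass ``text_field`` to ``Modify``, for example ``Modify(UnicodeReformatter(), text_field="my_field")``.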
+- ``NewlineNormalizer()``: Uses regex to replace 3 or more consecutive newline characters in each document with only 2 newline characters.
+- ``UrlRemover()``: Uses regex to remove all URLs in each document.
+
+You can use these modules individually or sequentially in a cleaning pipeline.
+
+Consider the following example, which loads a dataset (``books.jsonl``), steps through each module in a cleaning pipeline, and outputs the processed dataset as ``cleaned_books.jsonl``:
+
 
 .. code-block:: python
 
@@ -45,14 +55,7 @@ NeMo Curator provides a collection of easy to use modules for cleaning text.
     if __name__ == "__main__":
         main()
 
-Here, we load a dataset and perform all of the cleaning operations that NeMo Curator supports.
-* ``Modify(UnicodeReformatter())``: Uses `ftfy <https://ftfy.readthedocs.io/en/latest/>`_ to fix broken Unicode characters. Modifies the `"text"` field of the dataset by default. This can be changed by setting ``Modify(UnicodeReformatter(), text_field="my_field")``.
-* ``Modify(NewlineNormalizer())``: Uses regex to replace 3 or more consecutive newline characters in each document with only 2 newline characters.
-* ``Modify(UrlRemover())``: Uses regex to remove all urls in each document
-
-Any subset of these steps can be run at a time.
-
-Additionally, NeMo Curator has the ``text_cleaning`` CLI command that can perform the same functions:
+You can also perform text cleaning operations using the CLI by running the ``text_cleaning`` command:
 
 .. code-block:: bash
 
diff --git a/docs/user-guide/text-curation.rst b/docs/user-guide/text-curation.rst
index c015d4c8..a4cc83b0 100644
--- a/docs/user-guide/text-curation.rst
+++ b/docs/user-guide/text-curation.rst
@@ -17,7 +17,7 @@ Text Curation
    Large, unlabeled text corpora often contain a variety of languages. NeMo Curator provides utilities to identify languages.
 
 :ref:`Text Cleaning `
-   Many parts of the internet contained malformed or poorly formatted text. NeMo Curator can fix many of these issues with text.
+   Many parts of the Internet contain malformed or poorly formatted text. NeMo Curator can fix many of these issues in text.
diff --git a/examples/README.md b/examples/README.md
index 3e101a1e..29545978 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -14,7 +14,7 @@ These include:
 | exact_deduplication.py | Use the `ExactDuplicates` class to perform exact deduplication on text data. |
 | find_pii_and_deidentify.py | Use the `PiiModifier` and `Modify` classes to remove personally identifiable information from text data. |
 | fuzzy_deduplication.py | Use the `FuzzyDuplicatesConfig` and `FuzzyDuplicates` classes to perform fuzzy deduplication on text data. |
-| identify_languages_and_fix_unicode.py | Use `FastTextLangId` to filter data by language, then fix the unicode in it. |
+| identify_languages.py | Use `FastTextLangId` to filter data by language. |
 | raw_download_common_crawl.py | Download the raw compressed WARC files from Common Crawl without extracting them. |
 | semdedup_example.py | Use the `SemDedup` class to perform semantic deduplication on text data. |
 | task_decontamination.py | Remove segments of downstream evaluation tasks from a dataset. |
diff --git a/nemo_curator/modifiers/url_remover.py b/nemo_curator/modifiers/url_remover.py
index 8c4fa9f2..85ebe4b6 100644
--- a/nemo_curator/modifiers/url_remover.py
+++ b/nemo_curator/modifiers/url_remover.py
@@ -20,7 +20,7 @@
 
 class UrlRemover(DocumentModifier):
     """
-    Removes all urls in a document.
+    Removes all URLs in a document.
     """
 
     def __init__(self):
diff --git a/nemo_curator/scripts/text_cleaning.py b/nemo_curator/scripts/text_cleaning.py
index 762a063c..87d99099 100644
--- a/nemo_curator/scripts/text_cleaning.py
+++ b/nemo_curator/scripts/text_cleaning.py
@@ -91,7 +91,7 @@ def attach_args(
         help="Replace 3 or more consecutive newline characters in each document with only 2 newline characters.",
     )
     argumentHelper.attach_bool_arg(
-        parser, "remove-urls", default=False, help="Removes all urls in each document."
+        parser, "remove-urls", default=False, help="Removes all URLs in each document."
     )
     parser.add_argument(
         "--output-clean-dir",