From ba103ed3faf5b9dca3a0a91bacfde11438d9875c Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 4 Feb 2025 12:54:08 -0800 Subject: [PATCH 1/4] Add improved cleaning features Signed-off-by: Ryan Wolf --- nemo_curator/modifiers/__init__.py | 4 + nemo_curator/modifiers/newline_normalizer.py | 33 ++++ nemo_curator/modifiers/url_remover.py | 30 ++++ tests/test_cleaning.py | 151 +++++++++++++++++++ tests/test_unicode_reformatter.py | 59 -------- 5 files changed, 218 insertions(+), 59 deletions(-) create mode 100644 nemo_curator/modifiers/newline_normalizer.py create mode 100644 nemo_curator/modifiers/url_remover.py create mode 100644 tests/test_cleaning.py delete mode 100644 tests/test_unicode_reformatter.py diff --git a/nemo_curator/modifiers/__init__.py b/nemo_curator/modifiers/__init__.py index f6511fdb..e4b9a62a 100644 --- a/nemo_curator/modifiers/__init__.py +++ b/nemo_curator/modifiers/__init__.py @@ -15,8 +15,10 @@ from .c4 import BoilerPlateStringModifier from .doc_modifier import DocumentModifier from .fasttext import FastTextLabelModifier +from .newline_normalizer import NewlineNormalizer from .pii_modifier import PiiModifier from .unicode_reformatter import UnicodeReformatter +from .url_remover import UrlRemover __all__ = [ "DocumentModifier", @@ -24,4 +26,6 @@ "FastTextLabelModifier", "UnicodeReformatter", "PiiModifier", + "NewlineNormalizer", + "UrlRemover", ] diff --git a/nemo_curator/modifiers/newline_normalizer.py b/nemo_curator/modifiers/newline_normalizer.py new file mode 100644 index 00000000..020403c1 --- /dev/null +++ b/nemo_curator/modifiers/newline_normalizer.py @@ -0,0 +1,33 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re + +from nemo_curator.modifiers import DocumentModifier + +THREE_OR_MORE_NEWLINES_REGEX = re.compile(r"(\n){3,}") +THREE_OR_MORE_WINDOWS_NEWLINES_REGEX = re.compile(r"(\r\n){3,}") + + +class NewlineNormalizer(DocumentModifier): + """ + Replaces 3 or more consecutive newline characters with only 2 newline characters. + """ + + def __init__(self): + super().__init__() + + def modify_document(self, text): + text = THREE_OR_MORE_NEWLINES_REGEX.sub("\n\n", text) + text = THREE_OR_MORE_WINDOWS_NEWLINES_REGEX.sub("\r\n\r\n", text) + return text diff --git a/nemo_curator/modifiers/url_remover.py b/nemo_curator/modifiers/url_remover.py new file mode 100644 index 00000000..8c4fa9f2 --- /dev/null +++ b/nemo_curator/modifiers/url_remover.py @@ -0,0 +1,30 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import re + +from nemo_curator.modifiers import DocumentModifier + +URL_REGEX = re.compile(r"https?://\S+|www\.\S+", flags=re.IGNORECASE) + + +class UrlRemover(DocumentModifier): + """ + Removes all urls in a document. + """ + + def __init__(self): + super().__init__() + + def modify_document(self, text): + return URL_REGEX.sub("", text) diff --git a/tests/test_cleaning.py b/tests/test_cleaning.py new file mode 100644 index 00000000..30152a4b --- /dev/null +++ b/tests/test_cleaning.py @@ -0,0 +1,151 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dask.dataframe as dd +import pandas as pd + +from nemo_curator import Modify +from nemo_curator.datasets import DocumentDataset +from nemo_curator.modifiers import NewlineNormalizer, UnicodeReformatter, UrlRemover + + +def list_to_dataset(documents, col_name="text", npartitions=2): + data = {col_name: documents} + pdf = pd.DataFrame(data) + + return DocumentDataset(dd.from_pandas(pdf, npartitions=npartitions)) + + +class TestUnicodeReformatter: + def test_reformatting(self): + # Examples taken from ftfy documentation: + # https://ftfy.readthedocs.io/en/latest/ + dataset = list_to_dataset( + [ + "✔ No problems", + "The Mona Lisa doesn’t have eyebrows.", + "l’humanité", + "à perturber la réflexion", + "Clean document already.", + ] + ) + expected_results = [ + "✔ No problems", + "The Mona Lisa doesn't have eyebrows.", + "l'humanité", + "à perturber la réflexion", + "Clean document already.", + ] + expected_results.sort() + + modifier = Modify(UnicodeReformatter()) + fixed_dataset = modifier(dataset) + actual_results = fixed_dataset.df.compute()["text"].to_list() + actual_results.sort() + + assert ( + expected_results == actual_results + ), f"Expected: {expected_results}, but got: {actual_results}" + + +class TestNewlineNormalizer: + def test_just_newlines(self): + dataset = list_to_dataset( + [ + "The quick brown fox jumps over the lazy dog", + "The quick\nbrown fox jumps \nover the lazy dog", + "The quick\n\nbrown fox jumps \n\nover the lazy dog", + "The quick\n\n\nbrown fox jumps \n\n\nover the lazy dog", + "The quick\n\n\nbrown fox jumps \nover the lazy dog", + ] + ) + expected_results = [ + "The quick brown fox jumps over the lazy dog", + "The quick\nbrown fox jumps \nover the lazy dog", + "The quick\n\nbrown fox jumps \n\nover the lazy dog", + "The quick\n\nbrown fox jumps \n\nover the lazy dog", + "The quick\n\nbrown fox jumps \nover the lazy dog", + ] + expected_results.sort() + + modifier = Modify(NewlineNormalizer()) + fixed_dataset = modifier(dataset) + actual_results = fixed_dataset.df.compute()["text"].to_list() + actual_results.sort() + + assert ( + expected_results == actual_results + ), f"Expected: {expected_results}, but got: {actual_results}" + + def test_newlines_and_carriage_returns(self): + dataset = list_to_dataset( + [ + "The quick brown fox jumps over 
the lazy dog", + "The quick\nbrown fox jumps \nover the lazy dog", + "The quick\n\nbrown fox jumps \n\nover the lazy dog", + "The quick\n\n\nbrown fox jumps \n\n\nover the lazy dog", + "The quick\n\n\nbrown fox jumps \nover the lazy dog", + ] + ) + expected_results = [ + "The quick brown fox jumps over the lazy dog", + "The quick\r\nbrown fox jumps \r\nover the lazy dog", + "The quick\r\n\r\nbrown fox jumps \r\n\r\nover the lazy dog", + "The quick\r\n\r\n\r\nbrown fox jumps \r\n\r\n\r\nover the lazy dog", + "The quick\r\n\r\nbrown fox jumps \r\nover the lazy dog", + ] + expected_results.sort() + + modifier = Modify(NewlineNormalizer()) + fixed_dataset = modifier(dataset) + actual_results = fixed_dataset.df.compute()["text"].to_list() + actual_results.sort() + + assert ( + expected_results == actual_results + ), f"Expected: {expected_results}, but got: {actual_results}" + + +class TestUrlRemover: + def test_urls(self): + dataset = list_to_dataset( + [ + "This is a url: www.nvidia.com", + "This is a url: http://www.nvidia.com", + "This is a url: https://www.nvidia.com", + "This is a url: https://www.nvidia.gov", + "This is a url: https://nvidia.com", + "This is a url: HTTPS://WWW.NVIDIA.COM", + "This is not a url: git@github.com:NVIDIA/NeMo-Curator.git", + ] + ) + expected_results = [ + "This is a url: ", + "This is a url: ", + "This is a url: ", + "This is a url: ", + "This is a url: ", + "This is a url: ", + "This is not a url: git@github.com:NVIDIA/NeMo-Curator.git", + ] + expected_results.sort() + + modifier = Modify(UrlRemover()) + fixed_dataset = modifier(dataset) + actual_results = fixed_dataset.df.compute()["text"].to_list() + actual_results.sort() + + assert ( + expected_results == actual_results + ), f"Expected: {expected_results}, but got: {actual_results}" diff --git a/tests/test_unicode_reformatter.py b/tests/test_unicode_reformatter.py deleted file mode 100644 index 01ac716b..00000000 --- a/tests/test_unicode_reformatter.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import dask.dataframe as dd -import pandas as pd - -import nemo_curator -from nemo_curator.datasets import DocumentDataset -from nemo_curator.modifiers import UnicodeReformatter - - -def list_to_dataset(documents, col_name="text", npartitions=2): - data = {col_name: documents} - pdf = pd.DataFrame(data) - - return DocumentDataset(dd.from_pandas(pdf, npartitions=npartitions)) - - -class TestUnicodeReformatter: - def test_reformatting(self): - # Examples taken from ftfy documentation: - # https://ftfy.readthedocs.io/en/latest/ - dataset = list_to_dataset( - [ - "✔ No problems", - "The Mona Lisa doesn’t have eyebrows.", - "l’humanité", - "à perturber la réflexion", - "Clean document already.", - ] - ) - expected_results = [ - "✔ No problems", - "The Mona Lisa doesn't have eyebrows.", - "l'humanité", - "à perturber la réflexion", - "Clean document already.", - ] - expected_results.sort() - - modifier = nemo_curator.Modify(UnicodeReformatter()) - fixed_dataset = modifier(dataset) - actual_results = fixed_dataset.df.compute()["text"].to_list() - actual_results.sort() - - assert ( - expected_results == actual_results - ), f"Expected: {expected_results}, but got: {actual_results}" From 46767931a25f87f99fcac9f06fbda7d7149d6bba Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 4 Feb 2025 13:01:24 -0800 Subject: [PATCH 2/4] Fix cleaning tests Signed-off-by: Ryan Wolf --- tests/test_cleaning.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_cleaning.py b/tests/test_cleaning.py index 30152a4b..906da391 100644 --- a/tests/test_cleaning.py +++ b/tests/test_cleaning.py @@ -92,17 +92,17 @@ def test_newlines_and_carriage_returns(self): dataset = list_to_dataset( [ "The quick brown fox jumps over the lazy dog", - "The quick\nbrown fox jumps \nover the lazy dog", - "The quick\n\nbrown fox jumps \n\nover the lazy dog", - "The quick\n\n\nbrown fox jumps \n\n\nover the lazy dog", - "The quick\n\n\nbrown fox jumps \nover the lazy dog", + "The quick\r\nbrown fox jumps \r\nover the lazy dog", + "The quick\r\n\r\nbrown fox jumps \r\n\r\nover the lazy dog", + "The quick\r\n\r\n\r\nbrown fox jumps \r\n\r\n\r\nover the lazy dog", + "The quick\r\n\r\n\r\nbrown fox jumps \r\nover the lazy dog", ] ) expected_results = [ "The quick brown fox jumps over the lazy dog", "The quick\r\nbrown fox jumps \r\nover the lazy dog", "The quick\r\n\r\nbrown fox jumps \r\n\r\nover the lazy dog", - "The quick\r\n\r\n\r\nbrown fox jumps \r\n\r\n\r\nover the lazy dog", + "The quick\r\n\r\nbrown fox jumps \r\n\r\nover the lazy dog", "The quick\r\n\r\nbrown fox jumps \r\nover the lazy dog", ] expected_results.sort() From ddc4cb3c65139b072db352ba0ad8095abff0d0f2 Mon Sep 17 00:00:00 2001 From: Ryan Wolf Date: Tue, 4 Feb 2025 14:32:03 -0800 Subject: [PATCH 3/4] Update documentation and CLI scripts Signed-off-by: Ryan Wolf --- README.md | 4 +- docs/user-guide/index.rst | 7 +- ...matting.rst => languageidentification.rst} | 40 +------- docs/user-guide/text-curation.rst | 10 +- docs/user-guide/textcleaning.rst | 95 +++++++++++++++++++ ...d_fix_unicode.py => identify_languages.py} | 19 +--- nemo_curator/scripts/text_cleaning.py | 24 ++++- 7 files changed, 133 insertions(+), 66 deletions(-) rename docs/user-guide/{languageidentificationunicodeformatting.rst => languageidentification.rst} (60%) create mode 100644 docs/user-guide/textcleaning.rst rename examples/{identify_languages_and_fix_unicode.py => identify_languages.py} (79%) diff --git a/README.md b/README.md index d52129f4..77b32836 100644 --- 
a/README.md +++ b/README.md @@ -23,8 +23,8 @@ All of our text pipelines have great multilingual support. - [Download and Extraction](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/download.html) - Default implementations for Common Crawl, Wikipedia, and ArXiv sources - Easily customize and extend to other sources -- [Language Identification](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/languageidentificationunicodeformatting.html) -- [Unicode Reformatting](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/languageidentificationunicodeformatting.html) +- [Language Identification](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/languageidentification.html) +- [Text Cleaning](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/textcleaning.html) - [Heuristic Filtering](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/qualityfiltering.html) - Classifier Filtering - [fastText](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/qualityfiltering.html) diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index a9c589ac..ad8eb68c 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -16,8 +16,11 @@ Text Curation :ref:`Document Filtering ` This section describes how to use the 30+ heuristic and classifier filters available within the NeMo Curator and implement custom filters to apply to the documents within the corpora. -:ref:`Language Identification and Unicode Fixing ` - Large, unlabeled text corpora often contain a variety of languages. The NeMo Curator provides utilities to identify languages and fix improperly decoded Unicode characters. +:ref:`Language Identification ` + Large, unlabeled text corpora often contain a variety of languages. NeMo Curator provides utilities to identify languages. + +:ref:`Text Cleaning ` + Many parts of the internet contained malformed or poorly formatted text. NeMo Curator can fix many of these issues with text. :ref:`GPU Accelerated Exact and Fuzzy Deduplication ` Both exact and fuzzy deduplication functionalities are supported in NeMo Curator and accelerated using RAPIDS cuDF. diff --git a/docs/user-guide/languageidentificationunicodeformatting.rst b/docs/user-guide/languageidentification.rst similarity index 60% rename from docs/user-guide/languageidentificationunicodeformatting.rst rename to docs/user-guide/languageidentification.rst index 3e61f8f7..561f14c3 100644 --- a/docs/user-guide/languageidentificationunicodeformatting.rst +++ b/docs/user-guide/languageidentification.rst @@ -11,10 +11,8 @@ Background Large unlabeled text corpora often contain a variety of languages. However, data curation usually includes steps that are language specific (e.g. using language-tuned heuristics for quality filtering) and many curators are only interested in curating a monolingual dataset. -Datasets also may have improperly decoded unicode characters (e.g. "The Mona Lisa doesn't have eyebrows." decoding as "The Mona Lisa doesn’t have eyebrows."). -NeMo Curator provides utilities to identify languages and fix improperly decoded unicode characters. -The language identification is performed using `fastText `_ and unicode fixing is performed using `ftfy `_. +NeMo Curator provides utilities to identify languages using `fastText `_. 
Even though a preliminary language identification may have been performed on the unextracted text (as is the case in our Common Crawl pipeline using pyCLD2), `fastText `_ is more accurate so it can be used for a second pass. @@ -22,29 +20,8 @@ using pyCLD2), `fastText str: - return ftfy.fix_text(text) - -Also like the ``DocumentFilter`` functions, ``modify_document`` can be annotated with ``batched`` to take in a pandas series of documents instead of a single document. ----------------------------------------- Related Scripts @@ -79,15 +56,4 @@ within that file. Below is an example run command for :code:`separate_by_metadat --output-metadata-distribution=./data/lang_distro.json After running this module, the output directory will consist of one directory per language present within the corpus and all documents -within those directories will contain text that originates from the same language. Finally, the text within a specific language can have -its unicode fixed using the :code:`text_cleaning` module - -.. code-block:: bash - - text_cleaning \ - --input-data-dir=/EN \ - --output-clean-dir= - - -The above :code:`text_cleaning` module uses the heuristics defined within the :code:`ftfy` package that is commonly used for fixing -improperly decoded unicode. +within those directories will contain text that originates from the same language. diff --git a/docs/user-guide/text-curation.rst b/docs/user-guide/text-curation.rst index 4d2e1ddb..c015d4c8 100644 --- a/docs/user-guide/text-curation.rst +++ b/docs/user-guide/text-curation.rst @@ -13,8 +13,11 @@ Text Curation :ref:`Document Filtering ` This section describes how to use the 30+ heuristic and classifier filters available within the NeMo Curator and implement custom filters to apply to the documents within the corpora. -:ref:`Language Identification and Unicode Fixing ` - Large, unlabeled text corpora often contain a variety of languages. The NeMo Curator provides utilities to identify languages and fix improperly decoded Unicode characters. +:ref:`Language Identification ` + Large, unlabeled text corpora often contain a variety of languages. NeMo Curator provides utilities to identify languages. + +:ref:`Text Cleaning ` + Many parts of the internet contained malformed or poorly formatted text. NeMo Curator can fix many of these issues with text. :ref:`GPU Accelerated Exact and Fuzzy Deduplication ` Both exact and fuzzy deduplication functionalities are supported in NeMo Curator and accelerated using RAPIDS cuDF. @@ -43,7 +46,8 @@ Text Curation documentdataset.rst cpuvsgpu.rst qualityfiltering.rst - languageidentificationunicodeformatting.rst + languageidentification.rst + textcleaning.rst gpudeduplication.rst semdedup.rst syntheticdata.rst diff --git a/docs/user-guide/textcleaning.rst b/docs/user-guide/textcleaning.rst new file mode 100644 index 00000000..9bb022ee --- /dev/null +++ b/docs/user-guide/textcleaning.rst @@ -0,0 +1,95 @@ +.. _data-curator-text-cleaning: + +========================= +Text Cleaning +========================= + +-------------------- +Overview +-------------------- +Documents in datasets may contain improperly decoded characters (e.g. "The Mona Lisa doesn't have eyebrows." decoding as "The Mona Lisa doesn’t have eyebrows."), inconsistent line spacing, and many urls. +NeMo Curator provides a few modules that can help remove undesirable text from within individual documents. + +-------------------- +Use Cases +-------------------- +* Fixing improperly decoded unicode characters from webpages. 
+* Standardizing document layout by removing excessive newlines.
+* Removing URLs in documents.
+
+--------------------
+Modules
+--------------------
+NeMo Curator provides a collection of easy to use modules for cleaning text.
+
+.. code-block:: python
+
+    from nemo_curator import Sequential, Modify, get_client
+    from nemo_curator.datasets import DocumentDataset
+    from nemo_curator.modifiers import UnicodeReformatter, UrlRemover, NewlineNormalizer
+
+    def main():
+        client = get_client(cluster_type="cpu")
+
+        dataset = DocumentDataset.read_json("books.jsonl")
+        cleaning_pipeline = Sequential([
+            Modify(UnicodeReformatter()),
+            Modify(NewlineNormalizer()),
+            Modify(UrlRemover()),
+        ])
+
+        cleaned_dataset = cleaning_pipeline(dataset)
+
+        cleaned_dataset.to_json("cleaned_books.jsonl")
+
+    if __name__ == "__main__":
+        main()
+
+Here, we load a dataset and perform all of the cleaning operations that NeMo Curator supports.
+* ``Modify(UnicodeReformatter())``: Uses `ftfy <https://ftfy.readthedocs.io/en/latest/>`_ to fix broken Unicode characters. Modifies the `"text"` field of the dataset by default. This can be changed by setting ``Modify(UnicodeReformatter(), text_field="my_field")``.
+* ``Modify(NewlineNormalizer())``: Uses regex to replace 3 or more consecutive newline characters in each document with only 2 newline characters.
+* ``Modify(UrlRemover())``: Uses regex to remove all urls in each document
+
+Any subset of these steps can be run at a time.
+
+Additionally, NeMo Curator has the ``text_cleaning`` CLI command that can perform the same functions:
+
+.. code-block:: bash
+
+    text_cleaning \
+        --input-data-dir=/path/to/input/ \
+        --output-clean-dir=/path/to/output/ \
+        --normalize-newlines \
+        --remove-urls
+
+By default, the CLI will only perform unicode reformatting. Adding the ``--normalize-newlines`` and ``--remove-urls`` options enables the other text cleaning steps.
+
+------------------------
+Custom Text Cleaner
+------------------------
+It's easy to write your own custom text cleaner. The implementation of ``UnicodeReformatter`` can be used as an example.
+
+.. code-block:: python
+
+    import ftfy
+
+    from nemo_curator.modifiers import DocumentModifier
+
+
+    class UnicodeReformatter(DocumentModifier):
+        def __init__(self):
+            super().__init__()
+
+        def modify_document(self, text: str) -> str:
+            return ftfy.fix_text(text)
+
+Simply define a new class that inherits from ``DocumentModifier`` and define the constructor and ``modify_document`` method.
+Also, like the ``DocumentFilter`` class, ``modify_document`` can be annotated with ``batched`` to take in a pandas series of documents instead of a single document.
+See the :ref:`document filtering page ` for more information.
+
+---------------------------
+Additional Resources
+---------------------------
+* `Single GPU Tutorial `_
+* `ftfy <https://ftfy.readthedocs.io/en/latest/>`_
+* `Refined Web Paper `_
+* `Nemotron-CC Paper `_
\ No newline at end of file
diff --git a/examples/identify_languages_and_fix_unicode.py b/examples/identify_languages.py
similarity index 79%
rename from examples/identify_languages_and_fix_unicode.py
rename to examples/identify_languages.py
index 92f628e3..2a090da0 100644
--- a/examples/identify_languages_and_fix_unicode.py
+++ b/examples/identify_languages.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,13 +13,11 @@
 # limitations under the License.
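+
+"""Example: classify each document's language with a fastText model and split the dataset by language."""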
 
 import argparse
-import os
 
 import nemo_curator as nc
 from nemo_curator.datasets import DocumentDataset
 from nemo_curator.filters import FastTextLangId
-from nemo_curator.modifiers import UnicodeReformatter
-from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk
+from nemo_curator.utils.distributed_utils import get_client, read_data
 from nemo_curator.utils.file_utils import (
     get_all_files_paths_under,
     separate_by_metadata,
@@ -45,7 +43,6 @@ def main(args):
     # and see a list of supported languages here:
     # https://fasttext.cc/docs/en/language-identification.html
     model_path = "/path/to/model.bin"
-    target_language = "EN"
     language_field = "language"
 
     # Prepare samples for the classifier
@@ -70,18 +67,6 @@ def main(args):
         metadata_field=language_field,
     ).compute()
 
-    # Read the language specific data and fix the unicode in it
-    lang_data_path = os.path.join(language_separated_output_path, target_language)
-    if not os.path.exists(lang_data_path):
-        raise RuntimeError(f"Dataset did not have language: {target_language}")
-    lang_data = load_dataset(lang_data_path)
-
-    cleaner = nc.Modify(UnicodeReformatter())
-    cleaned_data = cleaner(lang_data)
-
-    # Write the cleaned_data
-    write_to_disk(cleaned_data.df, cleaned_data_output_path, write_to_filename=True)
-
 
 def attach_args(
     parser=argparse.ArgumentParser(
diff --git a/nemo_curator/scripts/text_cleaning.py b/nemo_curator/scripts/text_cleaning.py
index f05a3843..762a063c 100644
--- a/nemo_curator/scripts/text_cleaning.py
+++ b/nemo_curator/scripts/text_cleaning.py
@@ -14,9 +14,9 @@
 
 import argparse
 
-import nemo_curator
+from nemo_curator import Modify, Sequential
 from nemo_curator.datasets import DocumentDataset
-from nemo_curator.modifiers import UnicodeReformatter
+from nemo_curator.modifiers import NewlineNormalizer, UnicodeReformatter, UrlRemover
 from nemo_curator.utils.distributed_utils import get_client, read_data, write_to_disk
 from nemo_curator.utils.file_utils import expand_outdir_and_mkdir, get_batched_files
 from nemo_curator.utils.script_utils import ArgumentHelper
@@ -28,9 +28,14 @@ def main(args):
     # Make the output directories
     output_clean_dir = expand_outdir_and_mkdir(args.output_clean_dir)
 
-    cleaner = nemo_curator.Modify(
-        UnicodeReformatter(), text_field=args.input_text_field
-    )
+    # Unicode reformatting always runs; the other stages are enabled by CLI flags.
+    stages = [Modify(UnicodeReformatter(), text_field=args.input_text_field)]
+
+    if args.normalize_newlines:
+        stages.append(Modify(NewlineNormalizer(), text_field=args.input_text_field))
+    if args.remove_urls:
+        stages.append(Modify(UrlRemover(), text_field=args.input_text_field))
+
+    cleaner = Sequential(stages)
 
     for files in get_batched_files(
         args.input_data_dir,
@@ -79,6 +84,15 @@ def attach_args(
     argumentHelper.add_arg_input_text_field()
     argumentHelper.add_arg_output_file_type()
     argumentHelper.add_distributed_args()
+    argumentHelper.attach_bool_arg(
+        parser,
+        "normalize-newlines",
+        default=False,
+        help="Replace 3 or more consecutive newline characters in each document with only 2 newline characters.",
+    )
+    argumentHelper.attach_bool_arg(
+        parser, "remove-urls", default=False, help="Removes all urls in each document."
+    )
     parser.add_argument(
         "--output-clean-dir",
         type=str,

From abcf93f6f259e28a8fe6555751a41278dea3abaa Mon Sep 17 00:00:00 2001
From: Ryan Wolf
Date: Wed, 5 Feb 2025 09:26:16 -0800
Subject: [PATCH 4/4] Address Sarah and Lawrence's reviews

Signed-off-by: Ryan Wolf

---
 docs/user-guide/index.rst                     |  2 +-
 .../{textcleaning.rst => text-cleaning.rst}   | 31 ++++++++++---------
 docs/user-guide/text-curation.rst             |  2 +-
 examples/README.md                            |  2 +-
 nemo_curator/modifiers/url_remover.py         |  2 +-
 nemo_curator/scripts/text_cleaning.py         |  2 +-
 6 files changed, 22 insertions(+), 19 deletions(-)
 rename docs/user-guide/{textcleaning.rst => text-cleaning.rst} (60%)

diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst
index ad8eb68c..b63e1b93 100644
--- a/docs/user-guide/index.rst
+++ b/docs/user-guide/index.rst
@@ -20,7 +20,7 @@ Text Curation
    Large, unlabeled text corpora often contain a variety of languages. NeMo Curator provides utilities to identify languages.
 
 :ref:`Text Cleaning `
-   Many parts of the internet contained malformed or poorly formatted text. NeMo Curator can fix many of these issues with text.
+   Many parts of the Internet contain malformed or poorly formatted text. NeMo Curator can fix many of these issues in text.
diff --git a/docs/user-guide/textcleaning.rst b/docs/user-guide/text-cleaning.rst
similarity index 60%
rename from docs/user-guide/textcleaning.rst
rename to docs/user-guide/text-cleaning.rst
index 9bb022ee..b9ffaa2f 100644
--- a/docs/user-guide/textcleaning.rst
+++ b/docs/user-guide/text-cleaning.rst
@@ -7,20 +7,30 @@ Text Cleaning
 --------------------
 Overview
 --------------------
-Documents in datasets may contain improperly decoded characters (e.g. "The Mona Lisa doesn't have eyebrows." decoding as "The Mona Lisa doesn’t have eyebrows."), inconsistent line spacing, and many urls.
-NeMo Curator provides a few modules that can help remove undesirable text from within individual documents.
+Use NeMo Curator's text cleaning modules to remove undesirable text such as improperly decoded unicode characters, inconsistent line spacing, or excessive URLs from documents being pre-processed for a dataset.
+
+For example, the input sentence ``"The Mona Lisa doesn't have eyebrows."`` from a given document may not have included a properly encoded apostrophe (``'``), resulting in the sentence decoding as ``"The Mona Lisa doesn’t have eyebrows."`` NeMo Curator enables you to easily run this document through the default ``UnicodeReformatter()`` module to detect and fix the improperly decoded text, or you can define your own custom unicode text cleaner tailored to your needs.
 
 --------------------
 Use Cases
 --------------------
-* Fixing improperly decoded unicode characters from webpages.
-* Standardizing document layout by removing excessive newlines.
-* Removing URLs in documents.
+* Fix improperly decoded Unicode characters from webpages.
+* Standardize document layout by removing excessive newlines.
+* Remove URLs in documents.
 
 --------------------
 Modules
 --------------------
-NeMo Curator provides a collection of easy to use modules for cleaning text.
+NeMo Curator provides the following modules for cleaning text:
+
+- ``UnicodeReformatter()``: Uses `ftfy <https://ftfy.readthedocs.io/en/latest/>`_ to fix broken Unicode characters. Modifies the ``"text"`` field of the dataset by default.
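+  To clean a different field, pass ``text_field`` to ``Modify``, for example ``Modify(UnicodeReformatter(), text_field="my_field")``.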
+- ``NewlineNormalizer()``: Uses regex to replace 3 or more consecutive newline characters in each document with only 2 newline characters.
+- ``UrlRemover()``: Uses regex to remove all URLs in each document.
+
+You can use these modules individually or sequentially in a cleaning pipeline.
+
+Consider the following example, which loads a dataset (``books.jsonl``), steps through each module in a cleaning pipeline, and outputs the processed dataset as ``cleaned_books.jsonl``:
+
 
 .. code-block:: python
 
@@ -45,14 +55,7 @@ NeMo Curator provides a collection of easy to use modules for cleaning text.
     if __name__ == "__main__":
         main()
 
-Here, we load a dataset and perform all of the cleaning operations that NeMo Curator supports.
-* ``Modify(UnicodeReformatter())``: Uses `ftfy <https://ftfy.readthedocs.io/en/latest/>`_ to fix broken Unicode characters. Modifies the `"text"` field of the dataset by default. This can be changed by setting ``Modify(UnicodeReformatter(), text_field="my_field")``.
-* ``Modify(NewlineNormalizer())``: Uses regex to replace 3 or more consecutive newline characters in each document with only 2 newline characters.
-* ``Modify(UrlRemover())``: Uses regex to remove all urls in each document
-
-Any subset of these steps can be run at a time.
-
-Additionally, NeMo Curator has the ``text_cleaning`` CLI command that can perform the same functions:
+You can also perform text cleaning operations using the CLI by running the ``text_cleaning`` command:
 
 .. code-block:: bash
 
diff --git a/docs/user-guide/text-curation.rst b/docs/user-guide/text-curation.rst
index c015d4c8..a4cc83b0 100644
--- a/docs/user-guide/text-curation.rst
+++ b/docs/user-guide/text-curation.rst
@@ -17,7 +17,7 @@ Text Curation
    Large, unlabeled text corpora often contain a variety of languages. NeMo Curator provides utilities to identify languages.
 
 :ref:`Text Cleaning `
-   Many parts of the internet contained malformed or poorly formatted text. NeMo Curator can fix many of these issues with text.
+   Many parts of the Internet contain malformed or poorly formatted text. NeMo Curator can fix many of these issues in text.
diff --git a/examples/README.md b/examples/README.md
index 3e101a1e..29545978 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -14,7 +14,7 @@ These include:
 | exact_deduplication.py | Use the `ExactDuplicates` class to perform exact deduplication on text data. |
 | find_pii_and_deidentify.py | Use the `PiiModifier` and `Modify` classes to remove personally identifiable information from text data. |
 | fuzzy_deduplication.py | Use the `FuzzyDuplicatesConfig` and `FuzzyDuplicates` classes to perform fuzzy deduplication on text data. |
-| identify_languages_and_fix_unicode.py | Use `FastTextLangId` to filter data by language, then fix the unicode in it. |
+| identify_languages.py | Use `FastTextLangId` to filter data by language. |
 | raw_download_common_crawl.py | Download the raw compressed WARC files from Common Crawl without extracting them. |
 | semdedup_example.py | Use the `SemDedup` class to perform semantic deduplication on text data. |
 | task_decontamination.py | Remove segments of downstream evaluation tasks from a dataset. |
diff --git a/nemo_curator/modifiers/url_remover.py b/nemo_curator/modifiers/url_remover.py
index 8c4fa9f2..85ebe4b6 100644
--- a/nemo_curator/modifiers/url_remover.py
+++ b/nemo_curator/modifiers/url_remover.py
@@ -20,7 +20,7 @@
 
 class UrlRemover(DocumentModifier):
     """
-    Removes all urls in a document.
+    Removes all URLs in a document.
     """
 
     def __init__(self):
diff --git a/nemo_curator/scripts/text_cleaning.py b/nemo_curator/scripts/text_cleaning.py
index 762a063c..87d99099 100644
--- a/nemo_curator/scripts/text_cleaning.py
+++ b/nemo_curator/scripts/text_cleaning.py
@@ -91,7 +91,7 @@ def attach_args(
         help="Replace 3 or more consecutive newline characters in each document with only 2 newline characters.",
     )
     argumentHelper.attach_bool_arg(
-        parser, "remove-urls", default=False, help="Removes all urls in each document."
+        parser, "remove-urls", default=False, help="Removes all URLs in each document."
     )
     parser.add_argument(
         "--output-clean-dir",