# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# NOTE: this content was reconstructed from a unified diff whose line
# structure had been collapsed. The patch touches three files; each section
# below is labelled with the path it belongs to.

# --- nemo_curator/modifiers/__init__.py (lines added by the patch) ---------
#   from .regex_modifier import RegexModifier   # after the QuotationRemover import
#   "RegexModifier",                            # in __all__, after "MarkdownRemover"

# --- nemo_curator/modifiers/regex_modifier.py ------------------------------

import re
from typing import Dict, List

from nemo_curator.modifiers import DocumentModifier

__all__ = ["RegexModifier"]


class RegexModifier(DocumentModifier):
    """
    A class for modifying documents using regular expressions.

    This class applies a series of regex-based substitutions to an input text.
    Each substitution rule is defined by a dictionary containing at least a
    'pattern' and a 'repl' key, and optionally a 'count' key to limit the number
    of substitutions. Rules are applied in list order, each one operating on
    the output of the previous rule.
    """

    def __init__(self, regex_params_list: List[Dict]):
        """
        Initialize the RegexModifier with a list of regex parameter dictionaries.

        Args:
            regex_params_list (List[Dict]): List of dictionaries where each dictionary
                contains keys 'pattern' and 'repl' (and optionally 'count'). Each
                dictionary defines a regex substitution rule. An empty list is
                accepted; the modifier then only normalizes whitespace.

        Raises:
            ValueError: If any dictionary in the list is missing the 'pattern' or 'repl' key.
        """
        super().__init__()
        self.regex_params_list = regex_params_list

        # Fail fast at construction time: every rule needs both mandatory keys.
        # 'pattern' is checked before 'repl' for each rule.
        for regex_params in self.regex_params_list:
            for required_key in ("pattern", "repl"):
                if required_key not in regex_params:
                    raise ValueError(
                        f"Need to have key '{required_key}' in all entries of `regex_params_list`: {self.regex_params_list}"
                    )

    def modify_document(self, text: str) -> str:
        """
        Modify the given text by applying regex substitutions as defined in regex_params_list.

        The process includes:
          1. Collapsing whitespace and adding a space at the beginning and end of
             the text, so patterns that require surrounding spaces also match
             words at the boundaries of the text.
          2. Iteratively applying each regex substitution using the rules in regex_params_list.
          3. Removing any extra spaces that might result from the substitutions.

        Args:
            text (str): The input text to modify.

        Returns:
            str: The modified text after all regex substitutions have been applied.
                With an empty rule list this is the whitespace-normalized input.
        """
        # A single accumulator keeps the result defined even when there are no
        # rules to apply (previously that case raised UnboundLocalError).
        modified = RegexModifier._add_start_end_spaces(text)
        for regex_params in self.regex_params_list:
            modified = re.sub(
                pattern=regex_params["pattern"],
                repl=regex_params["repl"],
                string=modified,
                # `count` is the maximum number of pattern occurrences to be
                # replaced; 0 (the default) means "replace all occurrences".
                count=regex_params.get("count", 0),
            )

        return RegexModifier._remove_extra_spaces(modified)

    @staticmethod
    def _remove_extra_spaces(input_string: str) -> str:
        """
        Remove extra spaces between words and trim spaces at the start and end of the string.

        This method splits the input string by any whitespace and rejoins the tokens with
        a single space, effectively collapsing multiple spaces into one and removing any
        leading or trailing spaces.

        Args:
            input_string (str): The string from which extra spaces should be removed.

        Returns:
            str: The cleaned string with extra spaces removed.

        Examples:
            >>> RegexModifier._remove_extra_spaces("abc  xyz   abc xyz")
            'abc xyz abc xyz'
            >>> RegexModifier._remove_extra_spaces("  abc xyz  ")
            'abc xyz'
        """
        # str.split() without arguments splits on runs of any whitespace and
        # discards empty leading/trailing tokens.
        return " ".join(input_string.split())

    @staticmethod
    def _add_start_end_spaces(input_string: str) -> str:
        """
        Add a single space at the beginning and end of the input string.

        This is useful when specifying regex patterns that require a word to have spaces
        on both sides, ensuring that words at the boundaries of the string are correctly matched.
        The method first normalizes the string by removing extra spaces, then pads it with a
        space at both the start and the end.

        Args:
            input_string (str): The original string to pad.

        Returns:
            str: The padded string with a leading and trailing space.

        Examples:
            >>> RegexModifier._add_start_end_spaces("abc xyz")
            ' abc xyz '
            >>> RegexModifier._add_start_end_spaces("  abc   xyz  ")
            ' abc xyz '
        """
        # Normalize first so the result has exactly one space on each side.
        normalized = RegexModifier._remove_extra_spaces(input_string)
        return f" {normalized} "


# --- tests/test_regex_modifier.py ------------------------------------------
# In its own file this section starts with:
#   from nemo_curator.modifiers.regex_modifier import RegexModifier


class TestRegexModifier:
    """Unit tests for RegexModifier.modify_document."""

    def test_modify_document(self):
        # Test a simple replacement: "ö" should be normalized into "o",
        # then remove anything other than alphanumeric characters and punctuations.
        regex_params = [
            {"pattern": "ö", "repl": "o"},
            {
                "pattern": "[^ !$%',-.0123456789;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz/:]",
                "repl": "",
            },
        ]
        modifier = RegexModifier(regex_params)
        # "Nein, es ist möglich🙃 " -> replacement ö ->
        # "Nein, es ist moglich🙃 " -> remove anything other than alphanumeric characters and punctuations
        # "Nein, es ist moglich " -> remove extra spaces -> "Nein, es ist moglich"
        assert (
            modifier.modify_document("Nein, es ist möglich🙃 ")
            == "Nein, es ist moglich"
        )

    def test_modify_document_with_count(self):
        # Test using the count parameter: only one occurrence should be replaced.
        regex_params = [{"pattern": r"\ba\b", "repl": "b", "count": 1}]
        modifier = RegexModifier(regex_params)
        # Input "a a a" becomes " a a a " with extra spaces.
        # With count=1, only the first "a" is replaced: " b a a "
        # Removing extra spaces yields "b a a".
        assert modifier.modify_document("a a a") == "b a a"

    def test_modify_document_without_rules(self):
        # An empty rule list must not crash; only whitespace is normalized.
        modifier = RegexModifier([])
        assert modifier.modify_document("  a   b  ") == "a b"