-
Notifications
You must be signed in to change notification settings - Fork 112
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Shuoyang Ding <[email protected]>
- Loading branch information
Showing
3 changed files
with
187 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import re | ||
from typing import Dict, List | ||
|
||
from nemo_curator.modifiers import DocumentModifier | ||
|
||
__all__ = ["RegexModifier"] | ||
|
||
|
||
class RegexModifier(DocumentModifier):
    """
    Modify documents by applying an ordered list of regex substitutions.

    Each substitution rule is a dictionary containing at least a 'pattern' and
    a 'repl' key, and optionally a 'count' key limiting how many occurrences of
    the pattern are replaced. Rules are applied in list order, each one
    operating on the output of the previous one.
    """

    def __init__(self, regex_params_list: List[Dict]):
        """
        Initialize the RegexModifier with a list of regex parameter dictionaries.

        Args:
            regex_params_list (List[Dict]): List of dictionaries where each dictionary
                contains keys 'pattern' and 'repl' (and optionally 'count'). Each
                dictionary defines one regex substitution rule. An empty list is
                allowed and results in whitespace normalization only.

        Raises:
            ValueError: If any dictionary in the list is missing the 'pattern' or 'repl' key.
        """
        super().__init__()
        self.regex_params_list = regex_params_list

        # Fail fast at construction time instead of with a KeyError in the
        # middle of processing a document.
        for regex_params_dict in self.regex_params_list:
            if "pattern" not in regex_params_dict:
                raise ValueError(
                    f"Need to have key 'pattern' in all entries of `regex_params_list`: {self.regex_params_list}"
                )
            if "repl" not in regex_params_dict:
                raise ValueError(
                    f"Need to have key 'repl' in all entries of `regex_params_list`: {self.regex_params_list}"
                )

    def modify_document(self, text: str) -> str:
        """
        Modify the given text by applying the regex substitutions in regex_params_list.

        The process is:
            1. Collapse whitespace and pad the text with one leading and one
               trailing space, so patterns that expect a space on both sides of
               a word also match at the text boundaries.
            2. Apply every substitution rule, in order.
            3. Collapse whitespace again and strip the padding.

        Args:
            text (str): The input text to modify.

        Returns:
            str: The modified text after all regex substitutions have been applied.
                With an empty rule list this is the whitespace-normalized input.
        """
        # A single variable is threaded through the loop so the result is
        # defined even when there are no rules to apply.
        text_out = RegexModifier._add_start_end_spaces(text)
        for regex_params in self.regex_params_list:
            text_out = re.sub(
                pattern=regex_params["pattern"],
                repl=regex_params["repl"],
                string=text_out,
                # count is the maximum number of pattern occurrences to be
                # replaced; 0 (the default) replaces all of them.
                count=regex_params.get("count", 0),
            )

        return RegexModifier._remove_extra_spaces(text_out)

    @staticmethod
    def _remove_extra_spaces(input_string: str) -> str:
        """
        Collapse runs of whitespace into single spaces and trim both ends.

        The input is split on any whitespace and the resulting tokens are
        rejoined with a single space.

        Args:
            input_string (str): The string from which extra spaces should be removed.

        Returns:
            str: The cleaned string with extra spaces removed.

        Examples:
            >>> RegexModifier._remove_extra_spaces("abc   xyz  abc xyz")
            'abc xyz abc xyz'
            >>> RegexModifier._remove_extra_spaces("  abc xyz  ")
            'abc xyz'
        """
        return " ".join(input_string.split())

    @staticmethod
    def _add_start_end_spaces(input_string: str) -> str:
        """
        Return the whitespace-normalized input padded with one space on each side.

        This is useful for regex patterns that require a word to have spaces on
        both sides, so that words at the boundaries of the string are matched
        as well. The string is first normalized with `_remove_extra_spaces`,
        then padded.

        Args:
            input_string (str): The original string to pad.

        Returns:
            str: The padded string with a leading and trailing space.

        Examples:
            >>> RegexModifier._add_start_end_spaces("abc xyz")
            ' abc xyz '
            >>> RegexModifier._add_start_end_spaces("  abc   xyz  ")
            ' abc xyz '
        """
        # Normalize first so the result never has more than one space at either end.
        no_extra_spaces_string = RegexModifier._remove_extra_spaces(input_string)
        return f" {no_extra_spaces_string} "
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from nemo_curator.modifiers.regex_modifier import RegexModifier | ||
|
||
|
||
class TestRegexModifier:
    """Tests for RegexModifier.modify_document."""

    def test_modify_document(self):
        # Two rules applied in order: first map "ö" to "o", then delete every
        # character that is not in the allowed set of letters, digits, space
        # and punctuation.
        normalize_umlaut = {"pattern": "ö", "repl": "o"}
        strip_disallowed = {
            "pattern": "[^ !$%',-.0123456789;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz/:]",
            "repl": "",
        }
        modifier = RegexModifier([normalize_umlaut, strip_disallowed])

        # "Nein, es ist möglich🙃 "
        #   -> "Nein, es ist moglich🙃 "   (ö replaced)
        #   -> "Nein, es ist moglich "     (emoji removed by the second rule)
        #   -> "Nein, es ist moglich"      (extra spaces removed)
        result = modifier.modify_document("Nein, es ist möglich🙃 ")
        assert result == "Nein, es ist moglich"

    def test_modify_document_with_count(self):
        # With count=1 only the first standalone "a" may be replaced.
        single_replacement = {"pattern": r"\ba\b", "repl": "b", "count": 1}
        modifier = RegexModifier([single_replacement])

        # "a a a" is padded to " a a a ", the first "a" becomes "b"
        # (" b a a "), and trimming the padding leaves "b a a".
        result = modifier.modify_document("a a a")
        assert result == "b a a"