From 2abbc16d9cea879c2889ab5558b4c31a213b5b4c Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Thu, 30 Jan 2025 15:09:11 -0800 Subject: [PATCH 1/5] quick fix Signed-off-by: Sarah Yurick --- .../modules/semantic_dedup/embeddings.py | 1 + .../modules/semantic_dedup/semdedup.py | 2 ++ tutorials/dapt-curation/README.md | 5 +++-- tutorials/dapt-curation/code/main.py | 7 ++----- tutorials/dapt-curation/code/utils.py | 20 ++++++------------- 5 files changed, 14 insertions(+), 21 deletions(-) diff --git a/nemo_curator/modules/semantic_dedup/embeddings.py b/nemo_curator/modules/semantic_dedup/embeddings.py index 4a0b638b..7c607b63 100644 --- a/nemo_curator/modules/semantic_dedup/embeddings.py +++ b/nemo_curator/modules/semantic_dedup/embeddings.py @@ -217,6 +217,7 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset: ) ) else: + embedding_ddf = self.create_embeddings(dataset.df, self.input_column) ddf = DocumentDataset(embedding_ddf) self.logger.info( diff --git a/nemo_curator/modules/semantic_dedup/semdedup.py b/nemo_curator/modules/semantic_dedup/semdedup.py index a03d152b..0afc2cbc 100644 --- a/nemo_curator/modules/semantic_dedup/semdedup.py +++ b/nemo_curator/modules/semantic_dedup/semdedup.py @@ -33,6 +33,7 @@ def __init__( input_column: str = "text", id_column: str = "id", id_column_type: str = "int", + write_embeddings_to_disk: str = True, logger: Union[logging.Logger, str] = "./", ) -> None: """ @@ -50,6 +51,7 @@ def __init__( embedding_batch_size=config.embedding_batch_size, input_column=input_column, embedding_output_dir=os.path.join(cache_dir, config.embeddings_save_loc), + write_embeddings_to_disk=write_embeddings_to_disk, logger=logger, profile_dir=self.config.profile_dir, ) diff --git a/tutorials/dapt-curation/README.md b/tutorials/dapt-curation/README.md index 4f67c616..2da754fb 100755 --- a/tutorials/dapt-curation/README.md +++ b/tutorials/dapt-curation/README.md @@ -47,9 +47,10 @@ The tutorial follows the steps below:
After installing the NeMo Curator package, install the dependencies and run: ```bash -pip install -r code/requirements.txt cd code +pip install -r requirements.txt python main.py +# or python main.py --device "gpu" ``` -This will download chip-design related datasets and begin the data curation pipeline. +This will download chip-design related datasets and begin the data curation pipeline. Please use `--device "gpu"` to enable semantic and fuzzy deduplication, which require the GPU. diff --git a/tutorials/dapt-curation/code/main.py b/tutorials/dapt-curation/code/main.py index 3ae5fe17..5f51ead8 100755 --- a/tutorials/dapt-curation/code/main.py +++ b/tutorials/dapt-curation/code/main.py @@ -37,11 +37,8 @@ ) import nemo_curator as nc -from nemo_curator import ExactDuplicates, Modify, ScoreFilter, Sequential +from nemo_curator import ScoreFilter, Sequential from nemo_curator.datasets import DocumentDataset -from nemo_curator.filters import RepeatingTopNGramsFilter, WordCountFilter -from nemo_curator.modifiers.pii_modifier import PiiModifier -from nemo_curator.modifiers.unicode_reformatter import UnicodeReformatter from nemo_curator.utils.distributed_utils import get_client from nemo_curator.utils.file_utils import ( get_all_files_paths_under, @@ -191,7 +188,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: duplicates = semantic_dedupe( dataset=gpu_dataset_text, sem_dedupe_config_yaml_path=sem_dedupe_config_yaml_path, - cache=CACHE_DIR, + cache_dir=CACHE_DIR, ) unique_ids = duplicates.df.to_backend("pandas").compute()["id"] semantic_dataset_text = DocumentDataset( diff --git a/tutorials/dapt-curation/code/utils.py b/tutorials/dapt-curation/code/utils.py index dc91b225..aebdfff4 100755 --- a/tutorials/dapt-curation/code/utils.py +++ b/tutorials/dapt-curation/code/utils.py @@ -12,13 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json import os -import re - -import dask.dataframe as dd -import pandas as pd -import yaml from nemo_curator import ( ExactDuplicates, @@ -33,7 +27,6 @@ from nemo_curator.datasets import DocumentDataset from nemo_curator.filters import ( DocumentFilter, - RepeatedLinesFilter, RepeatedParagraphsFilter, RepeatingTopNGramsFilter, UrlsFilter, @@ -46,12 +39,7 @@ from nemo_curator.modifiers import DocumentModifier from nemo_curator.modifiers.pii_modifier import PiiModifier from nemo_curator.modifiers.unicode_reformatter import UnicodeReformatter -from nemo_curator.pii.constants import DEFAULT_LANGUAGE, DEFAULT_MAX_DOC_SIZE -from nemo_curator.utils.distributed_utils import get_client -from nemo_curator.utils.file_utils import ( - expand_outdir_and_mkdir, - get_all_files_paths_under, -) +from nemo_curator.utils.file_utils import expand_outdir_and_mkdir class QuotationUnifier(DocumentModifier): @@ -356,7 +344,11 @@ def semantic_dedupe( semdedup_config = SemDedupConfig.from_yaml(sem_dedupe_config_yaml_path) expand_outdir_and_mkdir(semdedup_config.cache_dir) - semdup = SemDedup(config=semdedup_config, id_column_type="str") + semdup = SemDedup( + config=semdedup_config, + id_column_type="str", + write_embeddings_to_disk=False, + ) duplicates = semdup(dataset) return duplicates From 474a5ddd27d415e6c6b282984b995483e133450b Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Thu, 30 Jan 2025 15:14:45 -0800 Subject: [PATCH 2/5] fix typehint Signed-off-by: Sarah Yurick --- nemo_curator/modules/semantic_dedup/semdedup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_curator/modules/semantic_dedup/semdedup.py b/nemo_curator/modules/semantic_dedup/semdedup.py index 0afc2cbc..2498e8d1 100644 --- a/nemo_curator/modules/semantic_dedup/semdedup.py +++ b/nemo_curator/modules/semantic_dedup/semdedup.py @@ -33,7 +33,7 @@ def __init__( input_column: str = "text", id_column: str = "id", id_column_type: str = "int", - write_embeddings_to_disk: str = True, + write_embeddings_to_disk: bool = True, logger: Union[logging.Logger, str] = "./", ) -> None: """ From 8aadcb12c1be478165e3037e653bf898b87ef3b0 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 31 Jan 2025 11:06:29 -0800 Subject: [PATCH 3/5] add write_embeddings_to_disk to config file Signed-off-by: Sarah Yurick --- config/sem_dedup_config.yaml | 1 + docs/user-guide/semdedup.rst | 1 + nemo_curator/modules/config.py | 4 ++++ nemo_curator/modules/semantic_dedup/semdedup.py | 2 +- nemo_curator/scripts/semdedup/compute_embeddings.py | 1 + .../code/configs/text_semantic_dedupe_config.yaml | 1 + tutorials/dapt-curation/code/utils.py | 6 +----- 7 files changed, 10 insertions(+), 6 deletions(-) diff --git a/config/sem_dedup_config.yaml b/config/sem_dedup_config.yaml index 39787d2f..08366d43 100644 --- a/config/sem_dedup_config.yaml +++ b/config/sem_dedup_config.yaml @@ -6,6 +6,7 @@ num_files: 16 embeddings_save_loc: "embeddings" embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2" embedding_batch_size: 128 +write_embeddings_to_disk: true # Clustering configuration clustering_save_loc: "clustering_results" diff --git a/docs/user-guide/semdedup.rst b/docs/user-guide/semdedup.rst index 31ba6efd..172b79d0 100644 --- a/docs/user-guide/semdedup.rst +++ b/docs/user-guide/semdedup.rst @@ -45,6 +45,7 @@ Semantic deduplication in NeMo Curator can be configured using a YAML file. Here embeddings_save_loc: "embeddings" embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2" embedding_batch_size: 128 + write_embeddings_to_disk: true # Clustering configuration clustering_save_loc: "clustering_results" diff --git a/nemo_curator/modules/config.py b/nemo_curator/modules/config.py index b43eb8fd..d29f02f4 100644 --- a/nemo_curator/modules/config.py +++ b/nemo_curator/modules/config.py @@ -145,6 +145,9 @@ class SemDedupConfig(BaseConfig): embeddings_save_loc (str): Location to save embeddings. embedding_model_name_or_path (str): Model name or path for embeddings. embedding_batch_size (int): Inital Batch size for processing embeddings. + write_embeddings_to_disk (bool): If True, saves the embeddings to disk, defaults to True. + We recommend setting this to False when you have a delayed pipeline. + Setting it to False can lead to more memory overhead. clustering_save_loc (str): Location to save clustering results. n_clusters (int): Number of clusters. seed (int): Seed for clustering. @@ -165,6 +168,7 @@ class SemDedupConfig(BaseConfig): embeddings_save_loc: str = "embeddings" embedding_model_name_or_path: str = "sentence-transformers/all-MiniLM-L6-v2" embedding_batch_size: int = 128 + write_embeddings_to_disk: bool = True # Clustering config clustering_save_loc: str = "clustering_results" diff --git a/nemo_curator/modules/semantic_dedup/semdedup.py b/nemo_curator/modules/semantic_dedup/semdedup.py index 2498e8d1..d4108579 100644 --- a/nemo_curator/modules/semantic_dedup/semdedup.py +++ b/nemo_curator/modules/semantic_dedup/semdedup.py @@ -51,7 +51,7 @@ def __init__( embedding_batch_size=config.embedding_batch_size, input_column=input_column, embedding_output_dir=os.path.join(cache_dir, config.embeddings_save_loc), - write_embeddings_to_disk=write_embeddings_to_disk, + write_embeddings_to_disk=config.write_embeddings_to_disk, logger=logger, profile_dir=self.config.profile_dir, ) diff --git a/nemo_curator/scripts/semdedup/compute_embeddings.py b/nemo_curator/scripts/semdedup/compute_embeddings.py index e46c9d01..1712b2cf 100644 --- a/nemo_curator/scripts/semdedup/compute_embeddings.py +++ b/nemo_curator/scripts/semdedup/compute_embeddings.py @@ -80,6 +80,7 @@ def main(args): semdedup_config.cache_dir, semdedup_config.embeddings_save_loc ), input_column=args.input_text_field, + write_embeddings_to_disk=args.write_embeddings_to_disk, logger=logger, write_to_filename=True, ) diff --git a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml index 17e4c17c..5b8e63b7 100644 --- a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml +++ b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml @@ -6,6 +6,7 @@ num_files: 16 embeddings_save_loc: "embeddings" embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2" embedding_batch_size: 128 +write_embeddings_to_disk: false # Clustering configuration clustering_save_loc: "clustering_results" diff --git a/tutorials/dapt-curation/code/utils.py b/tutorials/dapt-curation/code/utils.py index aebdfff4..2d601688 100755 --- a/tutorials/dapt-curation/code/utils.py +++ b/tutorials/dapt-curation/code/utils.py @@ -344,11 +344,7 @@ def semantic_dedupe( semdedup_config = SemDedupConfig.from_yaml(sem_dedupe_config_yaml_path) expand_outdir_and_mkdir(semdedup_config.cache_dir) - semdup = SemDedup( - config=semdedup_config, - id_column_type="str", - write_embeddings_to_disk=False, - ) + semdup = SemDedup(config=semdedup_config, id_column_type="str") duplicates = semdup(dataset) return duplicates From 48639c2de122f87ed3970c73d452964ee4b0874b Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 31 Jan 2025 11:15:29 -0800 Subject: [PATCH 4/5] minor fixes Signed-off-by: Sarah Yurick --- nemo_curator/modules/semantic_dedup/semdedup.py | 1 - nemo_curator/scripts/semdedup/compute_embeddings.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/nemo_curator/modules/semantic_dedup/semdedup.py b/nemo_curator/modules/semantic_dedup/semdedup.py index d4108579..eff5e2ec 100644 --- a/nemo_curator/modules/semantic_dedup/semdedup.py +++ b/nemo_curator/modules/semantic_dedup/semdedup.py @@ -33,7 +33,6 @@ def __init__( input_column: str = "text", id_column: str = "id", id_column_type: str = "int", - write_embeddings_to_disk: bool = True, logger: Union[logging.Logger, str] = "./", ) -> None: """ diff --git a/nemo_curator/scripts/semdedup/compute_embeddings.py b/nemo_curator/scripts/semdedup/compute_embeddings.py index 1712b2cf..63fe3731 100644 --- a/nemo_curator/scripts/semdedup/compute_embeddings.py +++ b/nemo_curator/scripts/semdedup/compute_embeddings.py @@ -80,7 +80,7 @@ def main(args): semdedup_config.cache_dir, semdedup_config.embeddings_save_loc ), input_column=args.input_text_field, - write_embeddings_to_disk=args.write_embeddings_to_disk, + write_embeddings_to_disk=semdedup_config.write_embeddings_to_disk, logger=logger, write_to_filename=True, ) From 6feafc2565053af3c18a3baad7ce208c58564560 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 31 Jan 2025 12:26:27 -0800 Subject: [PATCH 5/5] fix readme Signed-off-by: Sarah Yurick --- tutorials/dapt-curation/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tutorials/dapt-curation/README.md b/tutorials/dapt-curation/README.md index 2da754fb..55ddb2b5 100755 --- a/tutorials/dapt-curation/README.md +++ b/tutorials/dapt-curation/README.md @@ -49,8 +49,7 @@ After installing the NeMo Curator package, install the dependencies and run: ```bash cd code pip install -r requirements.txt -python main.py -# or python main.py --device "gpu" +python main.py --device "gpu" ``` This will download chip-design related datasets and begin the data curation pipeline. Please use `--device "gpu"` to enable semantic and fuzzy deduplication, which require the GPU.