diff --git a/config/sem_dedup_config.yaml b/config/sem_dedup_config.yaml index 39787d2f..08366d43 100644 --- a/config/sem_dedup_config.yaml +++ b/config/sem_dedup_config.yaml @@ -6,6 +6,7 @@ num_files: 16 embeddings_save_loc: "embeddings" embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2" embedding_batch_size: 128 +write_embeddings_to_disk: true # Clustering configuration clustering_save_loc: "clustering_results" diff --git a/docs/user-guide/semdedup.rst b/docs/user-guide/semdedup.rst index 31ba6efd..172b79d0 100644 --- a/docs/user-guide/semdedup.rst +++ b/docs/user-guide/semdedup.rst @@ -45,6 +45,7 @@ Semantic deduplication in NeMo Curator can be configured using a YAML file. Here embeddings_save_loc: "embeddings" embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2" embedding_batch_size: 128 + write_embeddings_to_disk: true # Clustering configuration clustering_save_loc: "clustering_results" diff --git a/nemo_curator/modules/config.py b/nemo_curator/modules/config.py index b43eb8fd..d29f02f4 100644 --- a/nemo_curator/modules/config.py +++ b/nemo_curator/modules/config.py @@ -145,6 +145,9 @@ class SemDedupConfig(BaseConfig): embeddings_save_loc (str): Location to save embeddings. embedding_model_name_or_path (str): Model name or path for embeddings. embedding_batch_size (int): Inital Batch size for processing embeddings. + write_embeddings_to_disk (bool): If True, saves the embeddings to disk, defaults to True. + We recommend setting this to False when you have a delayed pipeline. + Setting it to False can lead to more memory overhead. clustering_save_loc (str): Location to save clustering results. n_clusters (int): Number of clusters. seed (int): Seed for clustering. 
@@ -165,6 +168,7 @@ class SemDedupConfig(BaseConfig): embeddings_save_loc: str = "embeddings" embedding_model_name_or_path: str = "sentence-transformers/all-MiniLM-L6-v2" embedding_batch_size: int = 128 + write_embeddings_to_disk: bool = True # Clustering config clustering_save_loc: str = "clustering_results" diff --git a/nemo_curator/modules/semantic_dedup/embeddings.py b/nemo_curator/modules/semantic_dedup/embeddings.py index 4a0b638b..7c607b63 100644 --- a/nemo_curator/modules/semantic_dedup/embeddings.py +++ b/nemo_curator/modules/semantic_dedup/embeddings.py @@ -217,6 +217,7 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset: ) ) else: + embedding_ddf = self.create_embeddings(dataset.df, self.input_column) ddf = DocumentDataset(embedding_ddf) self.logger.info( diff --git a/nemo_curator/modules/semantic_dedup/semdedup.py b/nemo_curator/modules/semantic_dedup/semdedup.py index a03d152b..eff5e2ec 100644 --- a/nemo_curator/modules/semantic_dedup/semdedup.py +++ b/nemo_curator/modules/semantic_dedup/semdedup.py @@ -50,6 +50,7 @@ def __init__( embedding_batch_size=config.embedding_batch_size, input_column=input_column, embedding_output_dir=os.path.join(cache_dir, config.embeddings_save_loc), + write_embeddings_to_disk=config.write_embeddings_to_disk, logger=logger, profile_dir=self.config.profile_dir, ) diff --git a/nemo_curator/scripts/semdedup/compute_embeddings.py b/nemo_curator/scripts/semdedup/compute_embeddings.py index e46c9d01..63fe3731 100644 --- a/nemo_curator/scripts/semdedup/compute_embeddings.py +++ b/nemo_curator/scripts/semdedup/compute_embeddings.py @@ -80,6 +80,7 @@ def main(args): semdedup_config.cache_dir, semdedup_config.embeddings_save_loc ), input_column=args.input_text_field, + write_embeddings_to_disk=semdedup_config.write_embeddings_to_disk, logger=logger, write_to_filename=True, ) diff --git a/tutorials/dapt-curation/README.md b/tutorials/dapt-curation/README.md index 4f67c616..55ddb2b5 100755 --- 
a/tutorials/dapt-curation/README.md +++ b/tutorials/dapt-curation/README.md @@ -47,9 +47,9 @@ The tutorial follows the steps below:
After installing the NeMo Curator package, install the dependencies and run: ```bash -pip install -r code/requirements.txt cd code -python main.py +pip install -r requirements.txt +python main.py --device "gpu" ``` -This will download chip-design related datasets and begin the data curation pipeline. +This will download chip-design related datasets and begin the data curation pipeline. Please use `--device "gpu"` to enable semantic and fuzzy deduplication, which require a GPU. diff --git a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml index 17e4c17c..5b8e63b7 100644 --- a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml +++ b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml @@ -6,6 +6,7 @@ num_files: 16 embeddings_save_loc: "embeddings" embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2" embedding_batch_size: 128 +write_embeddings_to_disk: false # Clustering configuration clustering_save_loc: "clustering_results" diff --git a/tutorials/dapt-curation/code/main.py b/tutorials/dapt-curation/code/main.py index 3ae5fe17..5f51ead8 100755 --- a/tutorials/dapt-curation/code/main.py +++ b/tutorials/dapt-curation/code/main.py @@ -37,11 +37,8 @@ ) import nemo_curator as nc -from nemo_curator import ExactDuplicates, Modify, ScoreFilter, Sequential +from nemo_curator import ScoreFilter, Sequential from nemo_curator.datasets import DocumentDataset -from nemo_curator.filters import RepeatingTopNGramsFilter, WordCountFilter -from nemo_curator.modifiers.pii_modifier import PiiModifier -from nemo_curator.modifiers.unicode_reformatter import UnicodeReformatter from nemo_curator.utils.distributed_utils import get_client from nemo_curator.utils.file_utils import ( get_all_files_paths_under, @@ -191,7 +188,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None: duplicates = semantic_dedupe( 
dataset=gpu_dataset_text, sem_dedupe_config_yaml_path=sem_dedupe_config_yaml_path, - cache=CACHE_DIR, + cache_dir=CACHE_DIR, ) unique_ids = duplicates.df.to_backend("pandas").compute()["id"] semantic_dataset_text = DocumentDataset( diff --git a/tutorials/dapt-curation/code/utils.py b/tutorials/dapt-curation/code/utils.py index dc91b225..2d601688 100755 --- a/tutorials/dapt-curation/code/utils.py +++ b/tutorials/dapt-curation/code/utils.py @@ -12,13 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json import os -import re - -import dask.dataframe as dd -import pandas as pd -import yaml from nemo_curator import ( ExactDuplicates, @@ -33,7 +27,6 @@ from nemo_curator.datasets import DocumentDataset from nemo_curator.filters import ( DocumentFilter, - RepeatedLinesFilter, RepeatedParagraphsFilter, RepeatingTopNGramsFilter, UrlsFilter, @@ -46,12 +39,7 @@ from nemo_curator.modifiers import DocumentModifier from nemo_curator.modifiers.pii_modifier import PiiModifier from nemo_curator.modifiers.unicode_reformatter import UnicodeReformatter -from nemo_curator.pii.constants import DEFAULT_LANGUAGE, DEFAULT_MAX_DOC_SIZE -from nemo_curator.utils.distributed_utils import get_client -from nemo_curator.utils.file_utils import ( - expand_outdir_and_mkdir, - get_all_files_paths_under, -) +from nemo_curator.utils.file_utils import expand_outdir_and_mkdir class QuotationUnifier(DocumentModifier):