From 006a3cf17c5b7b82ef392b32c8190127b59bddb6 Mon Sep 17 00:00:00 2001
From: Sarah Yurick
Date: Sun, 23 Feb 2025 16:03:20 -0800
Subject: [PATCH] update config to include all params

Signed-off-by: Sarah Yurick
---
 config/sem_dedup_config.yaml                  | 19 ++---
 docs/user-guide/semdedup.rst                  | 19 ++---
 nemo_curator/modules/config.py                | 72 ++++++++++++-------
 .../modules/semantic_dedup/clusteringmodel.py | 18 ++---
 .../semanticclusterleveldedup.py              | 10 +--
 .../modules/semantic_dedup/semdedup.py        | 14 +++-
 nemo_curator/scripts/semdedup/clustering.py   |  3 +-
 .../scripts/semdedup/extract_dedup_data.py    |  1 -
 tests/test_semdedup.py                        |  2 -
 .../configs/text_semantic_dedupe_config.yaml  | 19 ++---
 .../config/sem_dedup_config.yaml              | 20 +++---
 11 files changed, 114 insertions(+), 83 deletions(-)

diff --git a/config/sem_dedup_config.yaml b/config/sem_dedup_config.yaml
index 08366d43..2b1fdd88 100644
--- a/config/sem_dedup_config.yaml
+++ b/config/sem_dedup_config.yaml
@@ -3,22 +3,23 @@ cache_dir: "semdedup_cache"
 num_files: 16
 
 # Embeddings configuration
-embeddings_save_loc: "embeddings"
 embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
 embedding_batch_size: 128
+embeddings_save_loc: "embeddings"
+embedding_pooling_strategy: "mean_pooling"
+embedding_column: "embeddings"
 write_embeddings_to_disk: true
+write_to_filename: false
 
 # Clustering configuration
-clustering_save_loc: "clustering_results"
-n_clusters: 1000
-seed: 1234
 max_iter: 100
-kmeans_with_cos_dist: false
-
-# Semdedup configuration
-which_to_keep: "hard"
-largest_cluster_size_to_process: 100000
+n_clusters: 1000
+clustering_save_loc: "clustering_results"
 sim_metric: "cosine"
+which_to_keep: "hard"
+sort_clusters: true
+kmeans_with_cos_dist: false
+partition_size: "2gb"
 
 # Extract dedup configuration
 eps_thresholds:
diff --git a/docs/user-guide/semdedup.rst b/docs/user-guide/semdedup.rst
index 172b79d0..79ffcb57 100644
--- a/docs/user-guide/semdedup.rst
+++ b/docs/user-guide/semdedup.rst
@@ -42,22 +42,23 @@ Semantic deduplication in NeMo Curator can be configured using a YAML file. Here
     num_files: -1
 
     # Embeddings configuration
-    embeddings_save_loc: "embeddings"
     embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
     embedding_batch_size: 128
+    embeddings_save_loc: "embeddings"
+    embedding_pooling_strategy: "mean_pooling"
+    embedding_column: "embeddings"
     write_embeddings_to_disk: true
+    write_to_filename: false
 
     # Clustering configuration
-    clustering_save_loc: "clustering_results"
-    n_clusters: 1000
-    seed: 1234
     max_iter: 100
-    kmeans_with_cos_dist: false
-
-    # Semdedup configuration
-    which_to_keep: "hard"
-    largest_cluster_size_to_process: 100000
+    n_clusters: 1000
+    clustering_save_loc: "clustering_results"
     sim_metric: "cosine"
+    which_to_keep: "hard"
+    sort_clusters: true
+    kmeans_with_cos_dist: false
+    partition_size: "2gb"
 
     # Extract dedup configuration
     eps_thresholds:
diff --git a/nemo_curator/modules/config.py b/nemo_curator/modules/config.py
index 67bf06af..2c9e409d 100644
--- a/nemo_curator/modules/config.py
+++ b/nemo_curator/modules/config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -147,26 +147,46 @@ class SemDedupConfig(BaseConfig):
 
     Attributes:
         cache_dir (str): Directory to store cache.
-        profile_dir (Optional[str]): If specified directory to write dask profile. Default is None.
-        cache_dir (str): Directory to store cache.
+        profile_dir (Optional[str]): If specified, directory to write Dask profile.
+            Default is None.
         num_files (int): Number of files. Default is -1, meaning all files.
-        embeddings_save_loc (str): Location to save embeddings.
+
         embedding_model_name_or_path (str): Model name or path for embeddings.
-        embedding_batch_size (int): Inital Batch size for processing embeddings.
-        embedding_pooling_strategy (str): Strategy for pooling embeddings, either "mean_pooling" or "last_token". Defaults to "mean_pooling".
-        write_embeddings_to_disk (bool): If True, saves the embeddings to disk, defaults to True.
+            Default is "sentence-transformers/all-MiniLM-L6-v2".
+        embedding_batch_size (int): Initial batch size for processing embeddings.
+            Default is 128.
+        embeddings_save_loc (str): Location to save embeddings.
+            Default is "embeddings".
+        embedding_max_mem_gb (Optional[int]): Maximum memory usage in GB for the embedding process.
+            If None, it defaults to the available GPU memory minus 4 GB.
+        embedding_pooling_strategy (str): Strategy for pooling embeddings, either
+            "mean_pooling" or "last_token". Default is "mean_pooling".
+        embedding_column (str): The column name that stores the embeddings.
+            Default is "embeddings".
+        write_embeddings_to_disk (bool): If True, saves the embeddings to disk.
             We recommend setting this to False when you have a delayed pipeline.
-            Setting it to False can lead to more memory overhead.
+            Setting it to False can lead to more memory overhead. Default is True.
+        write_to_filename (bool): If True, saves the embeddings to the same filename as the input files.
+            Default is False.
+
+        max_iter (int): Maximum iterations for clustering. Default is 100.
+        n_clusters (int): Number of clusters. Default is 1000.
         clustering_save_loc (str): Location to save clustering results.
-        n_clusters (int): Number of clusters.
-        seed (int): Seed for clustering.
-        max_iter (int): Maximum iterations for clustering.
-        kmeans_with_cos_dist (bool): Use KMeans with cosine distance.
-        which_to_keep (str): Which duplicates to keep.
-        largest_cluster_size_to_process (int): Largest cluster size to process.
+            Default is "clustering_results".
         sim_metric (str): Similarity metric for deduplication.
-        eps_thresholds (List[float]): Epsilon thresholds to calculate if semantically similar or not.
+            Default is "cosine".
+        which_to_keep (str): Method to determine which duplicates to keep.
+            Default is "hard".
+        sort_clusters (bool): Whether to sort clusters. Default is True.
+        kmeans_with_cos_dist (bool): Whether to use KMeans with cosine distance.
+            Default is False.
+        partition_size (str): The size of each data partition with which to run KMeans.
+            Default is "2gb".
+
+        eps_thresholds (List[float]): Epsilon thresholds used to determine whether
+            two documents are semantically similar. Default is [0.01, 0.001].
         eps_to_extract (float): Epsilon value to extract deduplicated data.
+            Default is 0.01.
""" cache_dir: str @@ -174,24 +194,24 @@ class SemDedupConfig(BaseConfig): num_files: int = -1 # Embeddings - embeddings_save_loc: str = "embeddings" embedding_model_name_or_path: str = "sentence-transformers/all-MiniLM-L6-v2" embedding_batch_size: int = 128 - # Options: "mean_pooling", "last_token" + embeddings_save_loc: str = "embeddings" + embedding_max_mem_gb: Optional[int] = None embedding_pooling_strategy: str = "mean_pooling" + embedding_column: str = "embeddings" write_embeddings_to_disk: bool = True + write_to_filename: bool = False - # Clustering config - clustering_save_loc: str = "clustering_results" - n_clusters: int = 1000 - seed: int = 1234 + # Clustering max_iter: int = 100 - kmeans_with_cos_dist: bool = False - - # Semdedup config - which_to_keep: str = "hard" - largest_cluster_size_to_process: int = 100000 + n_clusters: int = 1000 + clustering_save_loc: str = "clustering_results" sim_metric: str = "cosine" + which_to_keep: str = "hard" + sort_clusters: bool = True + kmeans_with_cos_dist: bool = False + partition_size: str = "2gb" # Extract dedup config eps_thresholds: List[float] = field(default_factory=lambda: [0.01, 0.001]) diff --git a/nemo_curator/modules/semantic_dedup/clusteringmodel.py b/nemo_curator/modules/semantic_dedup/clusteringmodel.py index e9100255..2595465e 100644 --- a/nemo_curator/modules/semantic_dedup/clusteringmodel.py +++ b/nemo_curator/modules/semantic_dedup/clusteringmodel.py @@ -54,7 +54,7 @@ def __init__( max_iter: int, n_clusters: int, clustering_output_dir: str, - embedding_col: str = "embeddings", + embedding_column: str = "embeddings", sim_metric: str = "cosine", which_to_keep: str = "hard", sort_clusters: bool = True, @@ -71,7 +71,7 @@ def __init__( max_iter (int): Maximum number of iterations for the clustering algorithm. n_clusters (int): The number of clusters to form. clustering_output_dir (str): Directory path where clustering results will be saved. - embedding_col (str): Column name where the embeddings are stored. + embedding_column (str): Column name where the embeddings are stored. sim_metric (str): Similarity metric to use for clustering, default is "cosine". which_to_keep (str): Strategy to decide which duplicates to keep; default is "hard". sort_clusters (bool): Whether to sort clusters, default is True. @@ -86,7 +86,7 @@ def __init__( self.max_iter = max_iter self.n_clusters = n_clusters self.clustering_output_dir = clustering_output_dir - self.embedding_col = embedding_col + self.embedding_column = embedding_column self.sim_metric = sim_metric self.keep_hard = which_to_keep == "hard" self.kmeans_with_cos_dist = kmeans_with_cos_dist @@ -117,14 +117,14 @@ def _setup_logger(self, logger): def __call__(self, embeddings_dataset: DocumentDataset): embeddings_df = embeddings_dataset.df - if self.embedding_col not in embeddings_df.columns: + if self.embedding_column not in embeddings_df.columns: raise ValueError( - f"Expected embedding column '{self.embedding_col}'" + f"Expected embedding column '{self.embedding_column}'" f" to be in dataset. 
             )
 
         with performance_report_if_with_ts_suffix(self.profile_dir, "clustering-model"):
-            embeddings_df = embeddings_df[[self.id_col, self.embedding_col]]
+            embeddings_df = embeddings_df[[self.id_col, self.embedding_column]]
             embeddings_df = embeddings_df.repartition(
                 partition_size=self.partition_size
             )
@@ -147,7 +147,7 @@ def __call__(self, embeddings_dataset: DocumentDataset):
             embeddings_df = embeddings_df.to_backend("cudf")
 
             cupy_darr = embeddings_df.map_partitions(
-                get_embedding_ar, self.embedding_col, meta=cp.ndarray([1, 1])
+                get_embedding_ar, self.embedding_column, meta=cp.ndarray([1, 1])
             )
             cupy_darr.compute_chunk_sizes()
             t0 = time.time()
@@ -171,7 +171,7 @@ def __call__(self, embeddings_dataset: DocumentDataset):
                 meta_df["dist_to_cent"] = cp.zeros(1)
                 embeddings_df = embeddings_df.map_partitions(
                     add_dist_to_cents,
-                    embedding_col=self.embedding_col,
+                    embedding_col=self.embedding_column,
                     centroids=kmeans.cluster_centers_,
                     meta=meta_df,
                 )
@@ -213,7 +213,7 @@ def __call__(self, embeddings_dataset: DocumentDataset):
                 output_sorted_clusters_dir=os.path.join(
                     self.clustering_output_dir, "sorted"
                 ),
-                embedding_col=self.embedding_col,
+                embedding_col=self.embedding_column,
                 sim_metric=self.sim_metric,
                 keep_hard=self.keep_hard,
                 kmeans_with_cos_dist=self.kmeans_with_cos_dist,
diff --git a/nemo_curator/modules/semantic_dedup/semanticclusterleveldedup.py b/nemo_curator/modules/semantic_dedup/semanticclusterleveldedup.py
index 4329c2b0..65ff9b68 100644
--- a/nemo_curator/modules/semantic_dedup/semanticclusterleveldedup.py
+++ b/nemo_curator/modules/semantic_dedup/semanticclusterleveldedup.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -42,7 +42,7 @@ def __init__(
         id_column_type: str,
         which_to_keep: str,
         output_dir: str,
-        embedding_col: str = "embeddings",
+        embedding_column: str = "embeddings",
         logger: Union[logging.Logger, str] = "./",
         profile_dir: Optional[str] = None,
     ) -> None:
@@ -57,7 +57,7 @@ def __init__(
             id_column_type (str): Data type of the ID column.
             which_to_keep (str): Strategy for which duplicate to keep.
             output_dir (str): Directory to save output files.
-            embedding_col (str): Column where the embeddings are stored.
+            embedding_column (str): Column where the embeddings are stored.
             logger (Union[logging.Logger, str]): Logger instance or path to the log file directory.
             profile_dir (str): If specified directory to write dask profile. Default is None.
         """
@@ -72,7 +72,7 @@ def __init__(
             output_dir, "semdedup_pruning_tables"
         )
         self.computed_semantic_match_dfs = False
-        self.embedding_col = embedding_col
+        self.embedding_column = embedding_column
         self.logger = self._setup_logger(logger)
         self.profile_dir = profile_dir
 
@@ -132,7 +132,7 @@ def compute_semantic_match_dfs(
                 id_col_type=self.id_col_type,
                 eps_list=eps_list,
                 output_dir=self.semdedup_pruning_tables_dir,
-                embedding_col=self.embedding_col,
+                embedding_col=self.embedding_column,
                 which_to_keep=self.which_to_keep,
             )
         )
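
The rename from embedding_col to embedding_column in the two classes above is a breaking change for anyone constructing them directly. As a reference, a minimal sketch of creating ClusteringModel with the post-patch keyword; the id_column value and output path are illustrative assumptions, not taken from this patch, and the remaining keywords mirror the defaults shown in the hunks above:

# Illustrative sketch only; paths and the id_column value are assumptions.
from nemo_curator.modules.semantic_dedup.clusteringmodel import ClusteringModel

clustering_model = ClusteringModel(
    id_column="id",  # assumed ID column name
    max_iter=100,
    n_clusters=1000,
    clustering_output_dir="semdedup_cache/clustering_results",  # illustrative path
    embedding_column="embeddings",  # post-patch keyword (formerly embedding_col)
    sim_metric="cosine",
    which_to_keep="hard",
    sort_clusters=True,
    kmeans_with_cos_dist=False,
    partition_size="2gb",
)
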
diff --git a/nemo_curator/modules/semantic_dedup/semdedup.py b/nemo_curator/modules/semantic_dedup/semdedup.py
index a8c66e31..f27f6a03 100644
--- a/nemo_curator/modules/semantic_dedup/semdedup.py
+++ b/nemo_curator/modules/semantic_dedup/semdedup.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -50,10 +50,13 @@ def __init__(
         self.embedding_creator = EmbeddingCreator(
             embedding_model_name_or_path=config.embedding_model_name_or_path,
             embedding_batch_size=config.embedding_batch_size,
+            embedding_output_dir=os.path.join(cache_dir, config.embeddings_save_loc),
+            embedding_max_mem_gb=config.embedding_max_mem_gb,
             embedding_pooling_strategy=config.embedding_pooling_strategy,
             input_column=input_column,
-            embedding_output_dir=os.path.join(cache_dir, config.embeddings_save_loc),
+            embedding_column=config.embedding_column,
             write_embeddings_to_disk=config.write_embeddings_to_disk,
+            write_to_filename=config.write_to_filename,
             logger=logger,
             profile_dir=self.config.profile_dir,
         )
@@ -62,6 +65,12 @@ def __init__(
             max_iter=config.max_iter,
             n_clusters=config.n_clusters,
             clustering_output_dir=os.path.join(cache_dir, config.clustering_save_loc),
+            embedding_column=config.embedding_column,
+            sim_metric=config.sim_metric,
+            which_to_keep=config.which_to_keep,
+            sort_clusters=config.sort_clusters,
+            kmeans_with_cos_dist=config.kmeans_with_cos_dist,
+            partition_size=config.partition_size,
             logger=logger,
             profile_dir=self.config.profile_dir,
         )
@@ -77,6 +86,7 @@ def __init__(
             id_column_type=id_column_type,
             which_to_keep=config.which_to_keep,
             output_dir=os.path.join(cache_dir, config.clustering_save_loc),
+            embedding_column=config.embedding_column,
             logger=logger,
             profile_dir=self.config.profile_dir,
         )
diff --git a/nemo_curator/scripts/semdedup/clustering.py b/nemo_curator/scripts/semdedup/clustering.py
index db4885c3..7f970336 100644
--- a/nemo_curator/scripts/semdedup/clustering.py
+++ b/nemo_curator/scripts/semdedup/clustering.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -93,7 +93,6 @@ def attach_args():
             " cache_dir for the directory to store cache,"
             " clustering_save_loc for the location to save clustering results,"
             " n_clusters for the number of clusters,"
-            " seed for the seed for clustering,"
            " max_iter for the maximum iterations for clustering,"
            " kmeans_with_cos_dist for using K-Means with cosine distance."
         ),
diff --git a/nemo_curator/scripts/semdedup/extract_dedup_data.py b/nemo_curator/scripts/semdedup/extract_dedup_data.py
index b6ffaebc..788c02bd 100755
--- a/nemo_curator/scripts/semdedup/extract_dedup_data.py
+++ b/nemo_curator/scripts/semdedup/extract_dedup_data.py
@@ -72,7 +72,6 @@ def attach_args():
         "Important configuration parameters include:"
         " cache_dir for the directory to store cache"
         " which_to_keep for specifying which duplicates to keep,"
-        " largest_cluster_size_to_process for the largest cluster size to process,"
         " sim_metric for the similarity metric for deduplication,"
         " eps_thresholds for epsilon thresholds to calculate if semantically similar or not"
         " and eps_to_extract for the epsilon value to extract deduplicated data."
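
With the wiring above, every SemDedupConfig field now reaches the stage that consumes it. As a minimal sketch of driving the pipeline from the updated YAML (the dataset path is a placeholder; SemDedupConfig.from_yaml is inherited from BaseConfig, and the exact read/call pattern here is an assumption, not part of this patch):

# Illustrative sketch; "my_dataset/" is a placeholder path.
import logging

from nemo_curator.datasets import DocumentDataset
from nemo_curator.modules.config import SemDedupConfig
from nemo_curator.modules.semantic_dedup.semdedup import SemDedup

# Load the expanded config from YAML (e.g., config/sem_dedup_config.yaml).
config = SemDedupConfig.from_yaml("config/sem_dedup_config.yaml")

# Read the input documents; a GPU backend is assumed for clustering.
dataset = DocumentDataset.read_json("my_dataset/", backend="cudf")

# SemDedup forwards embedding_column, sort_clusters, partition_size, etc.
# to EmbeddingCreator, ClusteringModel, and SemanticClusterLevelDedup,
# as shown in the semdedup.py hunks above.
sem_dedup = SemDedup(
    config=config,
    input_column="text",
    id_column="id",
    logger=logging.getLogger(__name__),
)
duplicates = sem_dedup(dataset)  # duplicate IDs at the config's eps_to_extract
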
diff --git a/tests/test_semdedup.py b/tests/test_semdedup.py
index 06d633f3..344e5469 100644
--- a/tests/test_semdedup.py
+++ b/tests/test_semdedup.py
@@ -81,7 +81,6 @@ def test_sem_dedup(
         cache_dir = os.path.join(tmpdir, "test_sem_dedup_cache")
         config = SemDedupConfig(
             cache_dir=cache_dir,
-            seed=42,
             n_clusters=n_clusters,
             eps_thresholds=[0.10],
             eps_to_extract=0.10,
@@ -120,7 +119,6 @@ def test_no_sem_dedup(
         cache_dir = os.path.join(tmpdir, "test_no_sem_dedup")
         config = SemDedupConfig(
             cache_dir=cache_dir,
-            seed=42,
             n_clusters=n_clusters,
             eps_thresholds=[0.10],
             eps_to_extract=0.10,
diff --git a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml
index 2e7ddad5..4887a5f9 100644
--- a/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml
+++ b/tutorials/dapt-curation/code/configs/text_semantic_dedupe_config.yaml
@@ -3,22 +3,23 @@ cache_dir: "workspace/semdedup_cache/text"
 num_files: 16
 
 # Embeddings configuration
-embeddings_save_loc: "embeddings"
 embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
 embedding_batch_size: 128
+embeddings_save_loc: "embeddings"
+embedding_pooling_strategy: "mean_pooling"
+embedding_column: "embeddings"
 write_embeddings_to_disk: true
+write_to_filename: false
 
# Clustering configuration
-clustering_save_loc: "clustering_results"
-n_clusters: 20
-seed: 1234
 max_iter: 100
-kmeans_with_cos_dist: false
-
-# Semdedup configuration
-which_to_keep: "hard"
-largest_cluster_size_to_process: 100000
+n_clusters: 20
+clustering_save_loc: "clustering_results"
 sim_metric: "cosine"
+which_to_keep: "hard"
+sort_clusters: true
+kmeans_with_cos_dist: false
+partition_size: "2gb"
 
 # Extract dedup configuration
 eps_thresholds:
diff --git a/tutorials/peft-curation-with-sdg/config/sem_dedup_config.yaml b/tutorials/peft-curation-with-sdg/config/sem_dedup_config.yaml
index 93ec29cb..6f8004c4 100644
--- a/tutorials/peft-curation-with-sdg/config/sem_dedup_config.yaml
+++ b/tutorials/peft-curation-with-sdg/config/sem_dedup_config.yaml
@@ -3,21 +3,23 @@ cache_dir: "_temp/semdedup_cache"
 num_files: 16
 
 # Embeddings configuration
-embeddings_save_loc: "embeddings"
 embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
 embedding_batch_size: 128
+embeddings_save_loc: "embeddings"
+embedding_pooling_strategy: "mean_pooling"
+embedding_column: "embeddings"
+write_embeddings_to_disk: true
+write_to_filename: false
 
 # Clustering configuration
-clustering_save_loc: "clustering_results"
-n_clusters: 20
-seed: 1234
 max_iter: 100
-kmeans_with_cos_dist: false
-
-# Semdedup configuration
-which_to_keep: "hard"
-largest_cluster_size_to_process: 100000
+n_clusters: 20
+clustering_save_loc: "clustering_results"
 sim_metric: "cosine"
+which_to_keep: "hard"
+sort_clusters: true
+kmeans_with_cos_dist: false
+partition_size: "2gb"
 
 # Extract dedup configuration
 eps_thresholds:
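
For reference, the full keyword surface of the post-patch SemDedupConfig, written out as a direct construction in Python with the documented defaults (only cache_dir is required; values mirror the dataclass shown above, so treat this as a sketch rather than canonical usage):

from nemo_curator.modules.config import SemDedupConfig

config = SemDedupConfig(
    cache_dir="semdedup_cache",
    profile_dir=None,  # optionally, a directory for Dask profiles
    num_files=-1,      # -1 means all files
    # Embeddings
    embedding_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
    embedding_batch_size=128,
    embeddings_save_loc="embeddings",
    embedding_max_mem_gb=None,  # None -> available GPU memory minus 4 GB
    embedding_pooling_strategy="mean_pooling",  # or "last_token"
    embedding_column="embeddings",
    write_embeddings_to_disk=True,
    write_to_filename=False,
    # Clustering
    max_iter=100,
    n_clusters=1000,
    clustering_save_loc="clustering_results",
    sim_metric="cosine",
    which_to_keep="hard",
    sort_clusters=True,
    kmeans_with_cos_dist=False,
    partition_size="2gb",
    # Extract dedup (eps_to_extract should be one of eps_thresholds)
    eps_thresholds=[0.01, 0.001],
    eps_to_extract=0.01,
)
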