From a218b09dc6ccfd779c2345d9a9205812058edb3f Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Wed, 30 Oct 2024 15:58:09 -0700 Subject: [PATCH] remove embedding_max_mem_gb Signed-off-by: Sarah Yurick --- config/sem_dedup_config.yaml | 1 - docs/user-guide/semdedup.rst | 5 +---- nemo_curator/modules/config.py | 2 -- nemo_curator/modules/semantic_dedup.py | 9 +-------- nemo_curator/scripts/semdedup/compute_embeddings.py | 4 +--- nemo_curator/utils/file_utils.py | 7 +++++-- .../peft-curation-with-sdg/config/sem_dedup_config.yaml | 1 - 7 files changed, 8 insertions(+), 21 deletions(-) diff --git a/config/sem_dedup_config.yaml b/config/sem_dedup_config.yaml index 75d1d656..39787d2f 100644 --- a/config/sem_dedup_config.yaml +++ b/config/sem_dedup_config.yaml @@ -6,7 +6,6 @@ num_files: 16 embeddings_save_loc: "embeddings" embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2" embedding_batch_size: 128 -embedding_max_mem_gb: 25 # Clustering configuration clustering_save_loc: "clustering_results" diff --git a/docs/user-guide/semdedup.rst b/docs/user-guide/semdedup.rst index 893f13cd..31ba6efd 100644 --- a/docs/user-guide/semdedup.rst +++ b/docs/user-guide/semdedup.rst @@ -45,7 +45,6 @@ Semantic deduplication in NeMo Curator can be configured using a YAML file. Here embeddings_save_loc: "embeddings" embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2" embedding_batch_size: 128 - embedding_max_mem_gb: 25 # Clustering configuration clustering_save_loc: "clustering_results" @@ -96,7 +95,7 @@ The module supports various types of models, including: When changing the model, ensure that: 1. The model is compatible with the data type you're working with (primarily text for this module). -2. You adjust the ``embedding_batch_size`` and ``embedding_max_mem_gb`` parameters as needed, as different models may have different memory requirements. +2. You adjust the ``embedding_batch_size`` parameter as needed, as different models may have different memory requirements. 3. The chosen model is appropriate for the language or domain of your dataset. By selecting an appropriate embedding model, you can optimize the semantic deduplication process for your specific use case and potentially improve the quality of the deduplicated dataset. @@ -169,7 +168,6 @@ Use Individual Components # Step 1: Embedding Creation embedding_creator = EmbeddingCreator( embedding_model_name_or_path="path/to/pretrained/model", - embedding_max_mem_gb=32, embedding_batch_size=128, embedding_output_dir="path/to/output/embeddings", input_column="text", @@ -252,7 +250,6 @@ Parameters Key parameters in the configuration file include: - ``embedding_model_name_or_path``: Path or identifier for the pre-trained model used for embedding generation. -- ``embedding_max_mem_gb``: Maximum memory usage for the embedding process. - ``embedding_batch_size``: Number of samples to process in each embedding batch. - ``n_clusters``: Number of clusters for k-means clustering. - ``eps_to_extract``: Deduplication threshold. Higher values result in more aggressive deduplication. diff --git a/nemo_curator/modules/config.py b/nemo_curator/modules/config.py index 048a1c5a..551f261e 100644 --- a/nemo_curator/modules/config.py +++ b/nemo_curator/modules/config.py @@ -123,7 +123,6 @@ class SemDedupConfig(BaseConfig): embeddings_save_loc (str): Location to save embeddings. embedding_model_name_or_path (str): Model name or path for embeddings. embedding_batch_size (int): Inital Batch size for processing embeddings. - embedding_max_mem_gb (int): Maximum memory in GB for embeddings. clustering_save_loc (str): Location to save clustering results. n_clusters (int): Number of clusters. seed (int): Seed for clustering. @@ -144,7 +143,6 @@ class SemDedupConfig(BaseConfig): embeddings_save_loc: str = "embeddings" embedding_model_name_or_path: str = "sentence-transformers/all-MiniLM-L6-v2" embedding_batch_size: int = 128 - embedding_max_mem_gb: int = 25 # Clustering config clustering_save_loc: str = "clustering_results" diff --git a/nemo_curator/modules/semantic_dedup.py b/nemo_curator/modules/semantic_dedup.py index 5303d20e..0056cad6 100644 --- a/nemo_curator/modules/semantic_dedup.py +++ b/nemo_curator/modules/semantic_dedup.py @@ -53,7 +53,6 @@ @dataclass class EmbeddingConfig: model_name_or_path: str - max_mem_gb: int max_seq_length: int = None def __post_init__(self): @@ -99,9 +98,7 @@ def _mean_pooling(self, model_output, attention_mask): class EmbeddingCrossFitModel(HFModel): def __init__(self, config: EmbeddingConfig): self.config = config - super().__init__( - self.config.model_name_or_path, max_mem_gb=self.config.max_mem_gb - ) + super().__init__(self.config.model_name_or_path) def load_model(self, device="cuda"): model = EmbeddingPytorchModel(self.config) @@ -123,7 +120,6 @@ class EmbeddingCreator: def __init__( self, embedding_model_name_or_path: str, - embedding_max_mem_gb: str, embedding_batch_size: int, embedding_output_dir: str, input_column: str = "text", @@ -138,7 +134,6 @@ def __init__( Args: embedding_model_name_or_path (str): The path or identifier for the model used to generate embeddings. - embedding_max_mem_gb (str): Maximum memory usage for the embedding process. embedding_batch_size (int): Number of samples to process in each batch. embedding_output_dir (str): Directory path where embeddings will be saved. input_column (str): Column name from the data to be used for embedding generation, defaults to "text". @@ -161,7 +156,6 @@ def __init__( self.embeddings_config = EmbeddingConfig( model_name_or_path=embedding_model_name_or_path, - max_mem_gb=embedding_max_mem_gb, ) self.batch_size = embedding_batch_size self.logger = self._setup_logger(logger) @@ -595,7 +589,6 @@ def __init__( cache_dir = config.cache_dir self.embedding_creator = EmbeddingCreator( embedding_model_name_or_path=config.embedding_model_name_or_path, - embedding_max_mem_gb=config.embedding_max_mem_gb, embedding_batch_size=config.embedding_batch_size, input_column=input_column, embedding_output_dir=os.path.join(cache_dir, config.embeddings_save_loc), diff --git a/nemo_curator/scripts/semdedup/compute_embeddings.py b/nemo_curator/scripts/semdedup/compute_embeddings.py index af5c8655..a50cf47b 100644 --- a/nemo_curator/scripts/semdedup/compute_embeddings.py +++ b/nemo_curator/scripts/semdedup/compute_embeddings.py @@ -75,7 +75,6 @@ def main(args): # ddf = ddf.repartition(partition_size="64MB") embedding_creator = EmbeddingCreator( embedding_model_name_or_path=semdedup_config.embedding_model_name_or_path, - embedding_max_mem_gb=semdedup_config.embedding_max_mem_gb, embedding_batch_size=semdedup_config.embedding_batch_size, embedding_output_dir=os.path.join( semdedup_config.cache_dir, semdedup_config.embeddings_save_loc @@ -110,8 +109,7 @@ def attach_args(): " input_column for specifying the input column for embeddings," " embeddings_save_loc for the location to save embeddings," " embedding_model_name_or_path for the model name or path for embeddings," - " embedding_batch_size for the batch size for processing embeddings," - " embedding_max_mem_gb for the maximum memory in GB for embeddings" + " embedding_batch_size for the batch size for processing embeddings" ), add_input_args=True, ) diff --git a/nemo_curator/utils/file_utils.py b/nemo_curator/utils/file_utils.py index 55364f12..4bc6120e 100644 --- a/nemo_curator/utils/file_utils.py +++ b/nemo_curator/utils/file_utils.py @@ -134,8 +134,11 @@ def _update_filetype(file_set, old_file_type, new_file_type): new_file_type = "." + new_file_type updated_file_set = { - f"{os.path.splitext(file)[0]}{new_file_type}" - if file.endswith(old_file_type) else file + ( + f"{os.path.splitext(file)[0]}{new_file_type}" + if file.endswith(old_file_type) + else file + ) for file in file_set } return updated_file_set diff --git a/tutorials/peft-curation-with-sdg/config/sem_dedup_config.yaml b/tutorials/peft-curation-with-sdg/config/sem_dedup_config.yaml index 205d5fcd..93ec29cb 100644 --- a/tutorials/peft-curation-with-sdg/config/sem_dedup_config.yaml +++ b/tutorials/peft-curation-with-sdg/config/sem_dedup_config.yaml @@ -6,7 +6,6 @@ num_files: 16 embeddings_save_loc: "embeddings" embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2" embedding_batch_size: 128 -embedding_max_mem_gb: 20 # Clustering configuration clustering_save_loc: "clustering_results"