Commit a218b09
remove embedding_max_mem_gb
Signed-off-by: Sarah Yurick <[email protected]>
sarahyurick committed Oct 30, 2024
1 parent 579ce27 commit a218b09
Showing 7 changed files with 8 additions and 21 deletions.
1 change: 0 additions & 1 deletion config/sem_dedup_config.yaml
@@ -6,7 +6,6 @@ num_files: 16
 embeddings_save_loc: "embeddings"
 embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
 embedding_batch_size: 128
-embedding_max_mem_gb: 25
 
 # Clustering configuration
 clustering_save_loc: "clustering_results"
5 changes: 1 addition & 4 deletions docs/user-guide/semdedup.rst
@@ -45,7 +45,6 @@ Semantic deduplication in NeMo Curator can be configured using a YAML file. Here
 embeddings_save_loc: "embeddings"
 embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
 embedding_batch_size: 128
-embedding_max_mem_gb: 25
 
 # Clustering configuration
 clustering_save_loc: "clustering_results"
@@ -96,7 +95,7 @@ The module supports various types of models, including:
 When changing the model, ensure that:
 
 1. The model is compatible with the data type you're working with (primarily text for this module).
-2. You adjust the ``embedding_batch_size`` and ``embedding_max_mem_gb`` parameters as needed, as different models may have different memory requirements.
+2. You adjust the ``embedding_batch_size`` parameter as needed, as different models may have different memory requirements.
 3. The chosen model is appropriate for the language or domain of your dataset.
 
 By selecting an appropriate embedding model, you can optimize the semantic deduplication process for your specific use case and potentially improve the quality of the deduplicated dataset.
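
To make that batch-size guidance concrete, here is a minimal sketch of swapping in a larger embedding model. The model name and batch size are illustrative, not from this commit; the constructor matches the post-change EmbeddingCreator signature shown in this diff:

    from nemo_curator.modules.semantic_dedup import EmbeddingCreator

    # Larger models typically need a smaller batch size to fit in GPU memory.
    # "intfloat/e5-large-v2" is a hypothetical substitute for the default
    # "sentence-transformers/all-MiniLM-L6-v2".
    embedding_creator = EmbeddingCreator(
        embedding_model_name_or_path="intfloat/e5-large-v2",
        embedding_batch_size=32,  # reduced from 128 to offset the larger model
        embedding_output_dir="path/to/output/embeddings",
        input_column="text",
    )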
@@ -169,7 +168,6 @@ Use Individual Components
 # Step 1: Embedding Creation
 embedding_creator = EmbeddingCreator(
     embedding_model_name_or_path="path/to/pretrained/model",
-    embedding_max_mem_gb=32,
     embedding_batch_size=128,
     embedding_output_dir="path/to/output/embeddings",
     input_column="text",
@@ -252,7 +250,6 @@ Parameters
 Key parameters in the configuration file include:
 
 - ``embedding_model_name_or_path``: Path or identifier for the pre-trained model used for embedding generation.
-- ``embedding_max_mem_gb``: Maximum memory usage for the embedding process.
 - ``embedding_batch_size``: Number of samples to process in each embedding batch.
 - ``n_clusters``: Number of clusters for k-means clustering.
 - ``eps_to_extract``: Deduplication threshold. Higher values result in more aggressive deduplication.
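
As a sketch of how these parameters are consumed after this commit (assuming SemDedupConfig inherits a from_yaml constructor from BaseConfig, which this page does not show):

    from nemo_curator.modules.config import SemDedupConfig

    # A leftover embedding_max_mem_gb key in the YAML would now raise a
    # TypeError, assuming from_yaml forwards keys as dataclass keyword arguments.
    config = SemDedupConfig.from_yaml("config/sem_dedup_config.yaml")
    print(config.embedding_model_name_or_path, config.embedding_batch_size)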
2 changes: 0 additions & 2 deletions nemo_curator/modules/config.py
@@ -123,7 +123,6 @@ class SemDedupConfig(BaseConfig):
         embeddings_save_loc (str): Location to save embeddings.
         embedding_model_name_or_path (str): Model name or path for embeddings.
         embedding_batch_size (int): Initial batch size for processing embeddings.
-        embedding_max_mem_gb (int): Maximum memory in GB for embeddings.
         clustering_save_loc (str): Location to save clustering results.
         n_clusters (int): Number of clusters.
         seed (int): Seed for clustering.
@@ -144,7 +143,6 @@ class SemDedupConfig(BaseConfig):
     embeddings_save_loc: str = "embeddings"
     embedding_model_name_or_path: str = "sentence-transformers/all-MiniLM-L6-v2"
     embedding_batch_size: int = 128
-    embedding_max_mem_gb: int = 25
 
     # Clustering config
     clustering_save_loc: str = "clustering_results"
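
A hedged instantiation sketch against the updated dataclass (the cache_dir value is illustrative; the other values are the defaults shown above):

    from nemo_curator.modules.config import SemDedupConfig

    # Passing embedding_max_mem_gb=... here now raises a TypeError,
    # since the field was removed from the dataclass.
    config = SemDedupConfig(
        cache_dir="./semdedup_cache",
        embedding_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
        embedding_batch_size=128,
    )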
9 changes: 1 addition & 8 deletions nemo_curator/modules/semantic_dedup.py
@@ -53,7 +53,6 @@
 @dataclass
 class EmbeddingConfig:
     model_name_or_path: str
-    max_mem_gb: int
     max_seq_length: int = None
 
     def __post_init__(self):
@@ -99,9 +98,7 @@ def _mean_pooling(self, model_output, attention_mask):
 class EmbeddingCrossFitModel(HFModel):
     def __init__(self, config: EmbeddingConfig):
         self.config = config
-        super().__init__(
-            self.config.model_name_or_path, max_mem_gb=self.config.max_mem_gb
-        )
+        super().__init__(self.config.model_name_or_path)
 
     def load_model(self, device="cuda"):
         model = EmbeddingPytorchModel(self.config)
@@ -123,7 +120,6 @@ class EmbeddingCreator:
     def __init__(
         self,
         embedding_model_name_or_path: str,
-        embedding_max_mem_gb: str,
         embedding_batch_size: int,
         embedding_output_dir: str,
         input_column: str = "text",
@@ -138,7 +134,6 @@ def __init__(
 
         Args:
             embedding_model_name_or_path (str): The path or identifier for the model used to generate embeddings.
-            embedding_max_mem_gb (str): Maximum memory usage for the embedding process.
             embedding_batch_size (int): Number of samples to process in each batch.
             embedding_output_dir (str): Directory path where embeddings will be saved.
             input_column (str): Column name from the data to be used for embedding generation, defaults to "text".
@@ -161,7 +156,6 @@ def __init__(
 
         self.embeddings_config = EmbeddingConfig(
             model_name_or_path=embedding_model_name_or_path,
-            max_mem_gb=embedding_max_mem_gb,
         )
         self.batch_size = embedding_batch_size
         self.logger = self._setup_logger(logger)
@@ -595,7 +589,6 @@ def __init__(
         cache_dir = config.cache_dir
         self.embedding_creator = EmbeddingCreator(
             embedding_model_name_or_path=config.embedding_model_name_or_path,
-            embedding_max_mem_gb=config.embedding_max_mem_gb,
             embedding_batch_size=config.embedding_batch_size,
             input_column=input_column,
             embedding_output_dir=os.path.join(cache_dir, config.embeddings_save_loc),
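
For downstream callers the migration is a one-line deletion, since the underlying HFModel call now receives only the model path. A before/after sketch based on the signatures in this diff:

    # Before this commit:
    creator = EmbeddingCreator(
        embedding_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
        embedding_max_mem_gb=25,  # removed in this commit
        embedding_batch_size=128,
        embedding_output_dir="./embeddings",
    )

    # After this commit:
    creator = EmbeddingCreator(
        embedding_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
        embedding_batch_size=128,
        embedding_output_dir="./embeddings",
    )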
4 changes: 1 addition & 3 deletions nemo_curator/scripts/semdedup/compute_embeddings.py
@@ -75,7 +75,6 @@ def main(args):
     # ddf = ddf.repartition(partition_size="64MB")
     embedding_creator = EmbeddingCreator(
         embedding_model_name_or_path=semdedup_config.embedding_model_name_or_path,
-        embedding_max_mem_gb=semdedup_config.embedding_max_mem_gb,
         embedding_batch_size=semdedup_config.embedding_batch_size,
         embedding_output_dir=os.path.join(
             semdedup_config.cache_dir, semdedup_config.embeddings_save_loc
@@ -110,8 +109,7 @@ def attach_args():
            " input_column for specifying the input column for embeddings,"
            " embeddings_save_loc for the location to save embeddings,"
            " embedding_model_name_or_path for the model name or path for embeddings,"
-           " embedding_batch_size for the batch size for processing embeddings,"
-           " embedding_max_mem_gb for the maximum memory in GB for embeddings"
+           " embedding_batch_size for the batch size for processing embeddings"
        ),
        add_input_args=True,
    )
7 changes: 5 additions & 2 deletions nemo_curator/utils/file_utils.py
@@ -134,8 +134,11 @@ def _update_filetype(file_set, old_file_type, new_file_type):
         new_file_type = "." + new_file_type
 
     updated_file_set = {
-        f"{os.path.splitext(file)[0]}{new_file_type}"
-        if file.endswith(old_file_type) else file
+        (
+            f"{os.path.splitext(file)[0]}{new_file_type}"
+            if file.endswith(old_file_type)
+            else file
+        )
         for file in file_set
     }
     return updated_file_set
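
This reformat does not change behavior; a quick usage sketch of the helper (it is private, so the import is shown for illustration only):

    from nemo_curator.utils.file_utils import _update_filetype

    files = {"a.jsonl", "b.parquet", "notes.txt"}
    print(_update_filetype(files, "jsonl", "parquet"))
    # {'a.parquet', 'b.parquet', 'notes.txt'}  (set ordering may vary)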
@@ -6,7 +6,6 @@ num_files: 16
 embeddings_save_loc: "embeddings"
 embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
 embedding_batch_size: 128
-embedding_max_mem_gb: 20
 
 # Clustering configuration
 clustering_save_loc: "clustering_results"
