From a218b09dc6ccfd779c2345d9a9205812058edb3f Mon Sep 17 00:00:00 2001
From: Sarah Yurick <sarahyurick@gmail.com>
Date: Wed, 30 Oct 2024 15:58:09 -0700
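Subject: [PATCH] Remove embedding_max_mem_gb from semantic deduplication

Drop the embedding_max_mem_gb option from SemDedupConfig,
EmbeddingConfig (max_mem_gb), and EmbeddingCreator, and stop passing
max_mem_gb to the HFModel constructor in EmbeddingCrossFitModel.
Remove the option from the sample sem_dedup_config.yaml files, the
semdedup user guide, and the compute_embeddings script help text.

Also wrap the conditional expression in _update_filetype in
parentheses; this is a formatting-only change with no change in
behavior.

Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>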
---
 config/sem_dedup_config.yaml                             | 1 -
 docs/user-guide/semdedup.rst                             | 5 +----
 nemo_curator/modules/config.py                           | 2 --
 nemo_curator/modules/semantic_dedup.py                   | 9 +--------
 nemo_curator/scripts/semdedup/compute_embeddings.py      | 4 +---
 nemo_curator/utils/file_utils.py                         | 7 +++++--
 .../peft-curation-with-sdg/config/sem_dedup_config.yaml  | 1 -
 7 files changed, 8 insertions(+), 21 deletions(-)

diff --git a/config/sem_dedup_config.yaml b/config/sem_dedup_config.yaml
index 75d1d656..39787d2f 100644
--- a/config/sem_dedup_config.yaml
+++ b/config/sem_dedup_config.yaml
@@ -6,7 +6,6 @@ num_files: 16
 embeddings_save_loc: "embeddings"
 embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
 embedding_batch_size: 128
-embedding_max_mem_gb: 25
 
 # Clustering configuration
 clustering_save_loc: "clustering_results"
diff --git a/docs/user-guide/semdedup.rst b/docs/user-guide/semdedup.rst
index 893f13cd..31ba6efd 100644
--- a/docs/user-guide/semdedup.rst
+++ b/docs/user-guide/semdedup.rst
@@ -45,7 +45,6 @@ Semantic deduplication in NeMo Curator can be configured using a YAML file. Here
     embeddings_save_loc: "embeddings"
     embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
     embedding_batch_size: 128
-    embedding_max_mem_gb: 25
 
     # Clustering configuration
     clustering_save_loc: "clustering_results"
@@ -96,7 +95,7 @@ The module supports various types of models, including:
 When changing the model, ensure that:
 
 1. The model is compatible with the data type you're working with (primarily text for this module).
-2. You adjust the ``embedding_batch_size`` and ``embedding_max_mem_gb`` parameters as needed, as different models may have different memory requirements.
+2. You adjust the ``embedding_batch_size`` parameter as needed, as different models may have different memory requirements.
 3. The chosen model is appropriate for the language or domain of your dataset.
 
 By selecting an appropriate embedding model, you can optimize the semantic deduplication process for your specific use case and potentially improve the quality of the deduplicated dataset.
@@ -169,7 +168,6 @@ Use Individual Components
     # Step 1: Embedding Creation
     embedding_creator = EmbeddingCreator(
         embedding_model_name_or_path="path/to/pretrained/model",
-        embedding_max_mem_gb=32,
         embedding_batch_size=128,
         embedding_output_dir="path/to/output/embeddings",
         input_column="text",
@@ -252,7 +250,6 @@ Parameters
 Key parameters in the configuration file include:
 
 - ``embedding_model_name_or_path``: Path or identifier for the pre-trained model used for embedding generation.
-- ``embedding_max_mem_gb``: Maximum memory usage for the embedding process.
 - ``embedding_batch_size``: Number of samples to process in each embedding batch.
 - ``n_clusters``: Number of clusters for k-means clustering.
 - ``eps_to_extract``: Deduplication threshold. Higher values result in more aggressive deduplication.
diff --git a/nemo_curator/modules/config.py b/nemo_curator/modules/config.py
index 048a1c5a..551f261e 100644
--- a/nemo_curator/modules/config.py
+++ b/nemo_curator/modules/config.py
@@ -123,7 +123,6 @@ class SemDedupConfig(BaseConfig):
         embeddings_save_loc (str): Location to save embeddings.
         embedding_model_name_or_path (str): Model name or path for embeddings.
         embedding_batch_size (int): Inital Batch size for processing embeddings.
-        embedding_max_mem_gb (int): Maximum memory in GB for embeddings.
         clustering_save_loc (str): Location to save clustering results.
         n_clusters (int): Number of clusters.
         seed (int): Seed for clustering.
@@ -144,7 +143,6 @@ class SemDedupConfig(BaseConfig):
     embeddings_save_loc: str = "embeddings"
     embedding_model_name_or_path: str = "sentence-transformers/all-MiniLM-L6-v2"
     embedding_batch_size: int = 128
-    embedding_max_mem_gb: int = 25
 
     # Clustering config
     clustering_save_loc: str = "clustering_results"
diff --git a/nemo_curator/modules/semantic_dedup.py b/nemo_curator/modules/semantic_dedup.py
index 5303d20e..0056cad6 100644
--- a/nemo_curator/modules/semantic_dedup.py
+++ b/nemo_curator/modules/semantic_dedup.py
@@ -53,7 +53,6 @@
 @dataclass
 class EmbeddingConfig:
     model_name_or_path: str
-    max_mem_gb: int
     max_seq_length: int = None
 
     def __post_init__(self):
@@ -99,9 +98,7 @@ def _mean_pooling(self, model_output, attention_mask):
 class EmbeddingCrossFitModel(HFModel):
     def __init__(self, config: EmbeddingConfig):
         self.config = config
-        super().__init__(
-            self.config.model_name_or_path, max_mem_gb=self.config.max_mem_gb
-        )
+        super().__init__(self.config.model_name_or_path)
 
     def load_model(self, device="cuda"):
         model = EmbeddingPytorchModel(self.config)
@@ -123,7 +120,6 @@ class EmbeddingCreator:
     def __init__(
         self,
         embedding_model_name_or_path: str,
-        embedding_max_mem_gb: str,
         embedding_batch_size: int,
         embedding_output_dir: str,
         input_column: str = "text",
@@ -138,7 +134,6 @@ def __init__(
 
         Args:
             embedding_model_name_or_path (str): The path or identifier for the model used to generate embeddings.
-            embedding_max_mem_gb (str): Maximum memory usage for the embedding process.
             embedding_batch_size (int): Number of samples to process in each batch.
             embedding_output_dir (str): Directory path where embeddings will be saved.
             input_column (str): Column name from the data to be used for embedding generation, defaults to "text".
@@ -161,7 +156,6 @@ def __init__(
 
         self.embeddings_config = EmbeddingConfig(
             model_name_or_path=embedding_model_name_or_path,
-            max_mem_gb=embedding_max_mem_gb,
         )
         self.batch_size = embedding_batch_size
         self.logger = self._setup_logger(logger)
@@ -595,7 +589,6 @@ def __init__(
         cache_dir = config.cache_dir
         self.embedding_creator = EmbeddingCreator(
             embedding_model_name_or_path=config.embedding_model_name_or_path,
-            embedding_max_mem_gb=config.embedding_max_mem_gb,
             embedding_batch_size=config.embedding_batch_size,
             input_column=input_column,
             embedding_output_dir=os.path.join(cache_dir, config.embeddings_save_loc),
diff --git a/nemo_curator/scripts/semdedup/compute_embeddings.py b/nemo_curator/scripts/semdedup/compute_embeddings.py
index af5c8655..a50cf47b 100644
--- a/nemo_curator/scripts/semdedup/compute_embeddings.py
+++ b/nemo_curator/scripts/semdedup/compute_embeddings.py
@@ -75,7 +75,6 @@ def main(args):
     # ddf = ddf.repartition(partition_size="64MB")
     embedding_creator = EmbeddingCreator(
         embedding_model_name_or_path=semdedup_config.embedding_model_name_or_path,
-        embedding_max_mem_gb=semdedup_config.embedding_max_mem_gb,
         embedding_batch_size=semdedup_config.embedding_batch_size,
         embedding_output_dir=os.path.join(
             semdedup_config.cache_dir, semdedup_config.embeddings_save_loc
@@ -110,8 +109,7 @@ def attach_args():
             " input_column for specifying the input column for embeddings,"
             " embeddings_save_loc for the location to save embeddings,"
             " embedding_model_name_or_path for the model name or path for embeddings,"
-            " embedding_batch_size for the batch size for processing embeddings,"
-            " embedding_max_mem_gb for the maximum memory in GB for embeddings"
+            " embedding_batch_size for the batch size for processing embeddings"
         ),
         add_input_args=True,
     )
diff --git a/nemo_curator/utils/file_utils.py b/nemo_curator/utils/file_utils.py
index 55364f12..4bc6120e 100644
--- a/nemo_curator/utils/file_utils.py
+++ b/nemo_curator/utils/file_utils.py
@@ -134,8 +134,11 @@ def _update_filetype(file_set, old_file_type, new_file_type):
         new_file_type = "." + new_file_type
 
     updated_file_set = {
-        f"{os.path.splitext(file)[0]}{new_file_type}"
-        if file.endswith(old_file_type) else file
+        (
+            f"{os.path.splitext(file)[0]}{new_file_type}"
+            if file.endswith(old_file_type)
+            else file
+        )
         for file in file_set
     }
     return updated_file_set
diff --git a/tutorials/peft-curation-with-sdg/config/sem_dedup_config.yaml b/tutorials/peft-curation-with-sdg/config/sem_dedup_config.yaml
index 205d5fcd..93ec29cb 100644
--- a/tutorials/peft-curation-with-sdg/config/sem_dedup_config.yaml
+++ b/tutorials/peft-curation-with-sdg/config/sem_dedup_config.yaml
@@ -6,7 +6,6 @@ num_files: 16
 embeddings_save_loc: "embeddings"
 embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
 embedding_batch_size: 128
-embedding_max_mem_gb: 20
 
 # Clustering configuration
 clustering_save_loc: "clustering_results"