Commit a218b09
remove embedding_max_mem_gb
Signed-off-by: Sarah Yurick <[email protected]>
sarahyurick committed Oct 30, 2024
1 parent 579ce27 commit a218b09
Showing 7 changed files with 8 additions and 21 deletions.
1 change: 0 additions & 1 deletion config/sem_dedup_config.yaml
@@ -6,7 +6,6 @@ num_files: 16
 embeddings_save_loc: "embeddings"
 embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
 embedding_batch_size: 128
-embedding_max_mem_gb: 25
 
 # Clustering configuration
 clustering_save_loc: "clustering_results"
5 changes: 1 addition & 4 deletions docs/user-guide/semdedup.rst
@@ -45,7 +45,6 @@ Semantic deduplication in NeMo Curator can be configured using a YAML file. Here
 embeddings_save_loc: "embeddings"
 embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
 embedding_batch_size: 128
-embedding_max_mem_gb: 25
 
 # Clustering configuration
 clustering_save_loc: "clustering_results"
@@ -96,7 +95,7 @@ The module supports various types of models, including:
 When changing the model, ensure that:
 
 1. The model is compatible with the data type you're working with (primarily text for this module).
-2. You adjust the ``embedding_batch_size`` and ``embedding_max_mem_gb`` parameters as needed, as different models may have different memory requirements.
+2. You adjust the ``embedding_batch_size`` parameter as needed, as different models may have different memory requirements.
 3. The chosen model is appropriate for the language or domain of your dataset.
 
 By selecting an appropriate embedding model, you can optimize the semantic deduplication process for your specific use case and potentially improve the quality of the deduplicated dataset.
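
To make that batch-size guidance concrete, here is a minimal sketch of swapping in a larger embedding model. The model name and batch size are illustrative, not from this commit; the constructor matches the post-change EmbeddingCreator signature shown in this diff:

    from nemo_curator.modules.semantic_dedup import EmbeddingCreator

    # Larger models typically need a smaller batch size to fit in GPU memory.
    # "intfloat/e5-large-v2" is a hypothetical substitute for the default
    # "sentence-transformers/all-MiniLM-L6-v2".
    embedding_creator = EmbeddingCreator(
        embedding_model_name_or_path="intfloat/e5-large-v2",
        embedding_batch_size=32,  # reduced from 128 to offset the larger model
        embedding_output_dir="path/to/output/embeddings",
        input_column="text",
    )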
@@ -169,7 +168,6 @@ Use Individual Components
 # Step 1: Embedding Creation
 embedding_creator = EmbeddingCreator(
     embedding_model_name_or_path="path/to/pretrained/model",
-    embedding_max_mem_gb=32,
     embedding_batch_size=128,
     embedding_output_dir="path/to/output/embeddings",
     input_column="text",
@@ -252,7 +250,6 @@ Parameters
 Key parameters in the configuration file include:
 
 - ``embedding_model_name_or_path``: Path or identifier for the pre-trained model used for embedding generation.
-- ``embedding_max_mem_gb``: Maximum memory usage for the embedding process.
 - ``embedding_batch_size``: Number of samples to process in each embedding batch.
 - ``n_clusters``: Number of clusters for k-means clustering.
 - ``eps_to_extract``: Deduplication threshold. Higher values result in more aggressive deduplication.
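
As a sketch of how these parameters are consumed after this commit (assuming SemDedupConfig inherits a from_yaml constructor from BaseConfig, which this page does not show):

    from nemo_curator.modules.config import SemDedupConfig

    # A leftover embedding_max_mem_gb key in the YAML would now raise a
    # TypeError, assuming from_yaml forwards keys as dataclass keyword arguments.
    config = SemDedupConfig.from_yaml("config/sem_dedup_config.yaml")
    print(config.embedding_model_name_or_path, config.embedding_batch_size)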
2 changes: 0 additions & 2 deletions nemo_curator/modules/config.py
@@ -123,7 +123,6 @@ class SemDedupConfig(BaseConfig):
         embeddings_save_loc (str): Location to save embeddings.
         embedding_model_name_or_path (str): Model name or path for embeddings.
         embedding_batch_size (int): Initial batch size for processing embeddings.
-        embedding_max_mem_gb (int): Maximum memory in GB for embeddings.
         clustering_save_loc (str): Location to save clustering results.
         n_clusters (int): Number of clusters.
         seed (int): Seed for clustering.
@@ -144,7 +143,6 @@ class SemDedupConfig(BaseConfig):
     embeddings_save_loc: str = "embeddings"
     embedding_model_name_or_path: str = "sentence-transformers/all-MiniLM-L6-v2"
     embedding_batch_size: int = 128
-    embedding_max_mem_gb: int = 25
 
     # Clustering config
     clustering_save_loc: str = "clustering_results"
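
A hedged instantiation sketch against the updated dataclass (the cache_dir value is illustrative; the other values are the defaults shown above):

    from nemo_curator.modules.config import SemDedupConfig

    # Passing embedding_max_mem_gb=... here now raises a TypeError,
    # since the field was removed from the dataclass.
    config = SemDedupConfig(
        cache_dir="./semdedup_cache",
        embedding_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
        embedding_batch_size=128,
    )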
9 changes: 1 addition & 8 deletions nemo_curator/modules/semantic_dedup.py
@@ -53,7 +53,6 @@
 @dataclass
 class EmbeddingConfig:
     model_name_or_path: str
-    max_mem_gb: int
     max_seq_length: int = None
 
     def __post_init__(self):
@@ -99,9 +98,7 @@ def _mean_pooling(self, model_output, attention_mask):
 class EmbeddingCrossFitModel(HFModel):
     def __init__(self, config: EmbeddingConfig):
         self.config = config
-        super().__init__(
-            self.config.model_name_or_path, max_mem_gb=self.config.max_mem_gb
-        )
+        super().__init__(self.config.model_name_or_path)
 
     def load_model(self, device="cuda"):
         model = EmbeddingPytorchModel(self.config)
@@ -123,7 +120,6 @@ class EmbeddingCreator:
     def __init__(
         self,
         embedding_model_name_or_path: str,
-        embedding_max_mem_gb: str,
         embedding_batch_size: int,
         embedding_output_dir: str,
         input_column: str = "text",
@@ -138,7 +134,6 @@ def __init__(
 
         Args:
             embedding_model_name_or_path (str): The path or identifier for the model used to generate embeddings.
-            embedding_max_mem_gb (str): Maximum memory usage for the embedding process.
             embedding_batch_size (int): Number of samples to process in each batch.
             embedding_output_dir (str): Directory path where embeddings will be saved.
             input_column (str): Column name from the data to be used for embedding generation, defaults to "text".
@@ -161,7 +156,6 @@ def __init__(
 
         self.embeddings_config = EmbeddingConfig(
             model_name_or_path=embedding_model_name_or_path,
-            max_mem_gb=embedding_max_mem_gb,
         )
         self.batch_size = embedding_batch_size
         self.logger = self._setup_logger(logger)
@@ -595,7 +589,6 @@ def __init__(
         cache_dir = config.cache_dir
         self.embedding_creator = EmbeddingCreator(
             embedding_model_name_or_path=config.embedding_model_name_or_path,
-            embedding_max_mem_gb=config.embedding_max_mem_gb,
             embedding_batch_size=config.embedding_batch_size,
             input_column=input_column,
             embedding_output_dir=os.path.join(cache_dir, config.embeddings_save_loc),
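
For downstream callers the migration is a one-line deletion, since the underlying HFModel call now receives only the model path. A before/after sketch based on the signatures in this diff:

    # Before this commit:
    creator = EmbeddingCreator(
        embedding_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
        embedding_max_mem_gb=25,  # removed in this commit
        embedding_batch_size=128,
        embedding_output_dir="./embeddings",
    )

    # After this commit:
    creator = EmbeddingCreator(
        embedding_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
        embedding_batch_size=128,
        embedding_output_dir="./embeddings",
    )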
4 changes: 1 addition & 3 deletions nemo_curator/scripts/semdedup/compute_embeddings.py
@@ -75,7 +75,6 @@ def main(args):
     # ddf = ddf.repartition(partition_size="64MB")
     embedding_creator = EmbeddingCreator(
         embedding_model_name_or_path=semdedup_config.embedding_model_name_or_path,
-        embedding_max_mem_gb=semdedup_config.embedding_max_mem_gb,
         embedding_batch_size=semdedup_config.embedding_batch_size,
         embedding_output_dir=os.path.join(
             semdedup_config.cache_dir, semdedup_config.embeddings_save_loc
@@ -110,8 +109,7 @@ def attach_args():
            " input_column for specifying the input column for embeddings,"
            " embeddings_save_loc for the location to save embeddings,"
            " embedding_model_name_or_path for the model name or path for embeddings,"
-           " embedding_batch_size for the batch size for processing embeddings,"
-           " embedding_max_mem_gb for the maximum memory in GB for embeddings"
+           " embedding_batch_size for the batch size for processing embeddings"
        ),
        add_input_args=True,
    )
7 changes: 5 additions & 2 deletions nemo_curator/utils/file_utils.py
@@ -134,8 +134,11 @@ def _update_filetype(file_set, old_file_type, new_file_type):
         new_file_type = "." + new_file_type
 
     updated_file_set = {
-        f"{os.path.splitext(file)[0]}{new_file_type}"
-        if file.endswith(old_file_type) else file
+        (
+            f"{os.path.splitext(file)[0]}{new_file_type}"
+            if file.endswith(old_file_type)
+            else file
+        )
         for file in file_set
     }
     return updated_file_set
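
This reformat does not change behavior; a quick usage sketch of the helper (it is private, so the import is shown for illustration only):

    from nemo_curator.utils.file_utils import _update_filetype

    files = {"a.jsonl", "b.parquet", "notes.txt"}
    print(_update_filetype(files, "jsonl", "parquet"))
    # {'a.parquet', 'b.parquet', 'notes.txt'}  (set ordering may vary)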
@@ -6,7 +6,6 @@ num_files: 16
 embeddings_save_loc: "embeddings"
 embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
 embedding_batch_size: 128
-embedding_max_mem_gb: 20
 
 # Clustering configuration
 clustering_save_loc: "clustering_results"
