Skip to content

Commit

Permalink
Fix DAPT tutorial (#503)
Browse files Browse the repository at this point in the history
* quick fix

Signed-off-by: Sarah Yurick <[email protected]>

* fix typehint

Signed-off-by: Sarah Yurick <[email protected]>

* add write_embeddings_to_disk to config file

Signed-off-by: Sarah Yurick <[email protected]>

* minor fixes

Signed-off-by: Sarah Yurick <[email protected]>

* fix readme

Signed-off-by: Sarah Yurick <[email protected]>

---------

Signed-off-by: Sarah Yurick <[email protected]>
  • Loading branch information
sarahyurick authored Jan 31, 2025
1 parent fe41ac1 commit 75234a9
Show file tree
Hide file tree
Showing 10 changed files with 16 additions and 21 deletions.
1 change: 1 addition & 0 deletions config/sem_dedup_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ num_files: 16
embeddings_save_loc: "embeddings"
embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
embedding_batch_size: 128
write_embeddings_to_disk: true

# Clustering configuration
clustering_save_loc: "clustering_results"
Expand Down
1 change: 1 addition & 0 deletions docs/user-guide/semdedup.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ Semantic deduplication in NeMo Curator can be configured using a YAML file. Here
embeddings_save_loc: "embeddings"
embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
embedding_batch_size: 128
write_embeddings_to_disk: true
# Clustering configuration
clustering_save_loc: "clustering_results"
Expand Down
4 changes: 4 additions & 0 deletions nemo_curator/modules/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,9 @@ class SemDedupConfig(BaseConfig):
embeddings_save_loc (str): Location to save embeddings.
embedding_model_name_or_path (str): Model name or path for embeddings.
embedding_batch_size (int): Initial batch size for processing embeddings.
write_embeddings_to_disk (bool): If True, saves the embeddings to disk. Defaults to True.
We recommend setting this to False when you have a delayed pipeline.
Setting it to False can lead to more memory overhead.
clustering_save_loc (str): Location to save clustering results.
n_clusters (int): Number of clusters.
seed (int): Seed for clustering.
Expand All @@ -165,6 +168,7 @@ class SemDedupConfig(BaseConfig):
embeddings_save_loc: str = "embeddings"
embedding_model_name_or_path: str = "sentence-transformers/all-MiniLM-L6-v2"
embedding_batch_size: int = 128
write_embeddings_to_disk: bool = True

# Clustering config
clustering_save_loc: str = "clustering_results"
Expand Down
1 change: 1 addition & 0 deletions nemo_curator/modules/semantic_dedup/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset:
)
)
else:
embedding_ddf = self.create_embeddings(dataset.df, self.input_column)
ddf = DocumentDataset(embedding_ddf)

self.logger.info(
Expand Down
1 change: 1 addition & 0 deletions nemo_curator/modules/semantic_dedup/semdedup.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def __init__(
embedding_batch_size=config.embedding_batch_size,
input_column=input_column,
embedding_output_dir=os.path.join(cache_dir, config.embeddings_save_loc),
write_embeddings_to_disk=config.write_embeddings_to_disk,
logger=logger,
profile_dir=self.config.profile_dir,
)
Expand Down
1 change: 1 addition & 0 deletions nemo_curator/scripts/semdedup/compute_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def main(args):
semdedup_config.cache_dir, semdedup_config.embeddings_save_loc
),
input_column=args.input_text_field,
write_embeddings_to_disk=semdedup_config.write_embeddings_to_disk,
logger=logger,
write_to_filename=True,
)
Expand Down
6 changes: 3 additions & 3 deletions tutorials/dapt-curation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ The tutorial follows the steps below:<br>
After installing the NeMo Curator package, install the dependencies and run:

```bash
pip install -r code/requirements.txt
cd code
python main.py
pip install -r requirements.txt
python main.py --device "gpu"
```

This will download chip-design related datasets and begin the data curation pipeline.
This will download chip-design related datasets and begin the data curation pipeline. Please use `--device "gpu"` to enable semantic and fuzzy deduplication, which require a GPU.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ num_files: 16
embeddings_save_loc: "embeddings"
embedding_model_name_or_path: "sentence-transformers/all-MiniLM-L6-v2"
embedding_batch_size: 128
write_embeddings_to_disk: false

# Clustering configuration
clustering_save_loc: "clustering_results"
Expand Down
7 changes: 2 additions & 5 deletions tutorials/dapt-curation/code/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,8 @@
)

import nemo_curator as nc
from nemo_curator import ExactDuplicates, Modify, ScoreFilter, Sequential
from nemo_curator import ScoreFilter, Sequential
from nemo_curator.datasets import DocumentDataset
from nemo_curator.filters import RepeatingTopNGramsFilter, WordCountFilter
from nemo_curator.modifiers.pii_modifier import PiiModifier
from nemo_curator.modifiers.unicode_reformatter import UnicodeReformatter
from nemo_curator.utils.distributed_utils import get_client
from nemo_curator.utils.file_utils import (
get_all_files_paths_under,
Expand Down Expand Up @@ -191,7 +188,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
duplicates = semantic_dedupe(
dataset=gpu_dataset_text,
sem_dedupe_config_yaml_path=sem_dedupe_config_yaml_path,
cache=CACHE_DIR,
cache_dir=CACHE_DIR,
)
unique_ids = duplicates.df.to_backend("pandas").compute()["id"]
semantic_dataset_text = DocumentDataset(
Expand Down
14 changes: 1 addition & 13 deletions tutorials/dapt-curation/code/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import re

import dask.dataframe as dd
import pandas as pd
import yaml

from nemo_curator import (
ExactDuplicates,
Expand All @@ -33,7 +27,6 @@
from nemo_curator.datasets import DocumentDataset
from nemo_curator.filters import (
DocumentFilter,
RepeatedLinesFilter,
RepeatedParagraphsFilter,
RepeatingTopNGramsFilter,
UrlsFilter,
Expand All @@ -46,12 +39,7 @@
from nemo_curator.modifiers import DocumentModifier
from nemo_curator.modifiers.pii_modifier import PiiModifier
from nemo_curator.modifiers.unicode_reformatter import UnicodeReformatter
from nemo_curator.pii.constants import DEFAULT_LANGUAGE, DEFAULT_MAX_DOC_SIZE
from nemo_curator.utils.distributed_utils import get_client
from nemo_curator.utils.file_utils import (
expand_outdir_and_mkdir,
get_all_files_paths_under,
)
from nemo_curator.utils.file_utils import expand_outdir_and_mkdir


class QuotationUnifier(DocumentModifier):
Expand Down

0 comments on commit 75234a9

Please sign in to comment.