-
Notifications
You must be signed in to change notification settings - Fork 112
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fuzzy Dedup: Make skipping the False positive check the default #386
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -67,47 +67,67 @@ class FuzzyDuplicatesConfig(BaseConfig): | |||||
|
||||||
# Minhash + LSH Config | ||||||
seed: int = 42 | ||||||
char_ngrams: int = 5 | ||||||
char_ngrams: int = 24 | ||||||
num_buckets: int = 20 | ||||||
hashes_per_bucket: int = 13 | ||||||
use_64_bit_hash: bool = False | ||||||
buckets_per_shuffle: int = 1 | ||||||
|
||||||
false_positive_check: bool = True | ||||||
false_positive_check: bool = False | ||||||
# Only required for fp check | ||||||
num_anchors: int = 2 | ||||||
jaccard_threshold: float = 0.8 | ||||||
bucket_mapping_blocksize: int = 256 | ||||||
parts_per_worker: int = 1 | ||||||
bucket_parts_per_worker: int = 8 | ||||||
num_anchors: Optional[int] = None | ||||||
jaccard_threshold: Optional[float] = None | ||||||
bucket_mapping_blocksize: Optional[int] = None | ||||||
parts_per_worker: Optional[int] = None | ||||||
bucket_parts_per_worker: Optional[int] = None | ||||||
|
||||||
def __post_init__(self): | ||||||
self.num_hashes = self.num_buckets * self.hashes_per_bucket | ||||||
if self.cache_dir is None: | ||||||
raise ValueError( | ||||||
"Finding fuzzy duplicates requires a cache directory accessible via all workers to store intermediates" | ||||||
) | ||||||
fp_defaults = { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
"num_anchors": 2, | ||||||
"jaccard_threshold": 0.8, | ||||||
"bucket_mapping_blocksize": 256, | ||||||
"parts_per_worker": 1, | ||||||
"bucket_parts_per_worker": 8, | ||||||
} | ||||||
if self.false_positive_check: | ||||||
warnings.warn( | ||||||
"Identifying false positives during the Minhash deduplication is computationally expensive." | ||||||
" For improved performance consider setting this to False" | ||||||
) | ||||||
if not self.false_positive_check and self.char_ngrams < 20: | ||||||
warnings.warn( | ||||||
"Using a small char_ngrams value might lead to a large number (~5%) of false positives during deduplication." | ||||||
" Using a value of at least 20 for char_ngrams is recommended." | ||||||
) | ||||||
if self.num_anchors <= 0: | ||||||
raise ValueError("Number of anchors must be greater than 0") | ||||||
if self.num_anchors > 2: | ||||||
warnings.warn( | ||||||
"Using a higher number of anchor docs might lead to higher memory footprint and might impact performance", | ||||||
category=UserWarning, | ||||||
for arg, default in fp_defaults.items(): | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
if getattr(self, arg) is None: | ||||||
setattr(self, arg, default) | ||||||
if self.num_anchors <= 0: | ||||||
raise ValueError("Number of anchors must be greater than 0") | ||||||
if self.num_anchors > 2: | ||||||
warnings.warn( | ||||||
"Using a higher number of anchor docs might lead to higher memory footprint and might impact performance", | ||||||
category=UserWarning, | ||||||
) | ||||||
if not 0 <= self.jaccard_threshold <= 1: | ||||||
raise ValueError("Jaccard Threshold must be between [0,1]") | ||||||
else: | ||||||
if self.char_ngrams < 20: | ||||||
warnings.warn( | ||||||
"Using a small char_ngrams value might lead to a large number (~5%) of false positives during deduplication." | ||||||
" Using a value of at least 20 for char_ngrams is recommended." | ||||||
) | ||||||
unused_fp_args = [ | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
arg for arg in fp_defaults.keys() if getattr(self, arg) is not None | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
] | ||||||
if unused_fp_args: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
warnings.warn( | ||||||
f"False positive check is disabled. Unused arguments {unused_fp_args} will be ignored", | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
category=UserWarning, | ||||||
) | ||||||
|
||||||
if self.cache_dir is None: | ||||||
raise ValueError( | ||||||
"Finding fuzzy duplicates requires a cache directory accessible via all workers to store intermediates" | ||||||
) | ||||||
if not 0 <= self.jaccard_threshold <= 1: | ||||||
raise ValueError("Jaccard Threshold must be between [0,1]") | ||||||
if self.buckets_per_shuffle <= 0: | ||||||
raise ValueError("Buckets per shuffle must be greater than 0") | ||||||
if not 1 <= self.buckets_per_shuffle <= self.num_buckets: | ||||||
raise ValueError("Buckets per shuffle must be between [1, num_buckets]") | ||||||
|
||||||
|
||||||
@dataclass | ||||||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,99 +1,2 @@ | ||||||
## Fuzzy Deduplication Steps | ||||||
This directory consists of scripts that can be invoked directly via the command line for finding fuzzy duplicates from a group of JSONL files consisting of text & unique IDs that are specifically formatted using the `add_id` script included as a part of NeMo-Curator. | ||||||
|
||||||
> [!IMPORTANT] | ||||||
> The scripts are helper utilities that wrap the fuzzy_dedup API for handling multiple jsonl directories and the id format generated by [add_id](../add_id.py). For most cases we recommend working with the fuzzy_deduplication API directly. | ||||||
|
||||||
### Usage | ||||||
1. Compute Minhashes | ||||||
- Input: Data Directories | ||||||
- Output: minhashes.parquet for each data dir. | ||||||
- Example call: | ||||||
```bash | ||||||
# same as `python compute_minhashes.py` | ||||||
gpu_compute_minhashes \ | ||||||
--input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \ | ||||||
--output-minhash-dir /path/to/output_minhashes \ | ||||||
--input-json-text-field text_column_name \ | ||||||
--input-json-id-field id_column_name \ | ||||||
--minhash-length number_of_hashes \ | ||||||
--char-ngram char_ngram_size \ | ||||||
--hash-bytes 4(or 8 byte hashes) \ | ||||||
--seed 42 \ | ||||||
--log-dir ./ | ||||||
# --scheduler-file /path/to/file.json | ||||||
``` | ||||||
2. Buckets (Minhash Buckets) | ||||||
- Input: Minhash directories | ||||||
- Output: Buckets.parquet | ||||||
- Example call: | ||||||
```bash | ||||||
# same as `python minhash_lsh.py` | ||||||
minhash_buckets \ | ||||||
--input-data-dirs /path/to/output_minhashes/dir1 /path/to/output_minhashes/dir2 \ | ||||||
--output-bucket-dir /path/to/dedup_output \ | ||||||
--input-minhash-field _minhash_signature \ | ||||||
--input-json-id-field id_column_name \ | ||||||
--minhash-length number_of_hashes \ | ||||||
--num-bands num_bands \ | ||||||
--buckets-per-shuffle 1 `#Value b/w [1-num_bands]. Higher is better but might lead to oom` \ | ||||||
--log-dir ./ | ||||||
# --scheduler-file /path/to/file.json | ||||||
``` | ||||||
3. Jaccard Map Buckets | ||||||
- Input: Buckets.parquet + Data Dir | ||||||
- Output: anchor_docs_with_bk.parquet | ||||||
- Example call: | ||||||
```bash | ||||||
# same as `python map_buckets.py` | ||||||
jaccard_map_buckets \ | ||||||
--input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \ | ||||||
--input-bucket-dir /path/to/dedup_output/_buckets.parquet \ | ||||||
--output-dir /path/to/dedup_output \ | ||||||
--input-json-text-field text_column_name \ | ||||||
--input-json-id-field id_column_name \ | ||||||
# --scheduler-file /path/to/file.json | ||||||
``` | ||||||
4. Jaccard Shuffle | ||||||
- Input: anchor_docs_with_bk.parquet + Data Dir | ||||||
- Output: shuffled_docs.parquet | ||||||
- Example call: | ||||||
```bash | ||||||
# same as `python jaccard_shuffle.py` | ||||||
jaccard_shuffle \ | ||||||
--input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \ | ||||||
--input-bucket-mapping-dir /path/to/dedup_output/anchor_docs_with_bk.parquet \ | ||||||
--output-dir /path/to/dedup_output \ | ||||||
--input-json-text-field text_column_name \ | ||||||
--input-json-id-field id_column_name \ | ||||||
# --scheduler-file /path/to/file.json | ||||||
``` | ||||||
5. Jaccard compute | ||||||
- Input: Shuffled docs.parquet | ||||||
- Output: jaccard_similarity_results.parquet | ||||||
- Example call: | ||||||
```bash | ||||||
# same as `python jaccard_compute.py` | ||||||
jaccard_compute \ | ||||||
--shuffled-docs-path /path/to/dedup_output/shuffled_docs.parquet \ | ||||||
--output-dir /path/to/dedup_output \ | ||||||
--ngram-size char_ngram_size_for_similarity \ | ||||||
# --scheduler-file /path/to/file.json | ||||||
``` | ||||||
6. Connected Components | ||||||
- Input: jaccard_similarity_results.parquet | ||||||
- Output: connected_components.parquet | ||||||
- Example call: | ||||||
```bash | ||||||
# same as `python connected_components.py` | ||||||
gpu_connected_component \ | ||||||
--jaccard-pairs_path /path/to/dedup_output/jaccard_similarity_results.parquet \ | ||||||
--output-dir /path/to/dedup_output \ | ||||||
--cache-dir /path/to/cc_cache \ | ||||||
--jaccard-threshold 0.8 | ||||||
# --scheduler-file /path/to/file.json | ||||||
``` | ||||||
|
||||||
> [!TIP] | ||||||
> When using these scripts in a multi-node environment (like Slurm, K8s, etc.) it is recommended to start up a Dask cluster prior to execution and connect to the existing cluster via the `--scheduler-address` or `--scheduler-file` flag. | ||||||
> Use the `--help` flag to view all possible CLI options for the scripts and details on what they do. | ||||||
> The up-to-date documentation on running the Fuzzy Deduplication scripts can be found in the [NeMo User Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/gpudeduplication.html#id4). It is recommended to use the Python API directly rather than the CLI scripts for most cases. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit, but I prefer spelling out
false_positive
each time. Going to be making a similar comment about "fpp" on #285 but I think it can get pretty difficult to modify our codebase otherwise.