From fe41ac1d3b7fb68853d0fa653829611b6690d204 Mon Sep 17 00:00:00 2001
From: Ayush Dattagupta <ayushdg95@gmail.com>
Date: Tue, 28 Jan 2025 11:39:48 -0800
Subject: [PATCH] Update fuzzy deduplication to skip false positive checks as
 the default (#498)

* Update no-fp check defaults

Signed-off-by: Ayush Dattagupta <ayushdg95@gmail.com>

* Remove outdated CLI docs in favor of user guide docs

Signed-off-by: Ayush Dattagupta <ayushdg95@gmail.com>

* Add/update tests

Signed-off-by: Ayush Dattagupta <ayushdg95@gmail.com>

* Apply suggestions from code review

Co-authored-by: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com>
Signed-off-by: Ayush Dattagupta <ayushdg95@gmail.com>

---------

Signed-off-by: Ayush Dattagupta <ayushdg95@gmail.com>
Co-authored-by: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com>
---
 examples/fuzzy_deduplication.py               |  8 +-
 nemo_curator/modules/config.py                | 76 +++++++++------
 nemo_curator/modules/fuzzy_dedup/minhash.py   |  2 +-
 .../scripts/fuzzy_deduplication/README.md     | 96 +------------------
 .../fuzzy_deduplication/compute_minhashes.py  |  2 +-
 tests/test_fuzzy_dedup.py                     | 37 +++++--
 6 files changed, 84 insertions(+), 137 deletions(-)

diff --git a/examples/fuzzy_deduplication.py b/examples/fuzzy_deduplication.py
index b7da2470..51344ccb 100644
--- a/examples/fuzzy_deduplication.py
+++ b/examples/fuzzy_deduplication.py
@@ -69,14 +69,12 @@ def main(args):
             id_field=dataset_id_field,
             text_field=dataset_text_field,
             seed=42,
-            char_ngrams=5,
+            char_ngrams=24,
             num_buckets=20,
             hashes_per_bucket=13,
             use_64_bit_hash=False,
-            buckets_per_shuffle=5,
-            false_positive_check=True,
-            num_anchors=2,
-            jaccard_threshold=0.8,
+            buckets_per_shuffle=5,  # set to a smaller value if encountering OOMs during LSH
+            false_positive_check=False,
         )
         fuzzy_dup = FuzzyDuplicates(logger=log_dir, config=fuzzy_dedup_config)
         duplicates = fuzzy_dup(dataset=input_dataset)
diff --git a/nemo_curator/modules/config.py b/nemo_curator/modules/config.py
index 551f261e..b43eb8fd 100644
--- a/nemo_curator/modules/config.py
+++ b/nemo_curator/modules/config.py
@@ -67,47 +67,69 @@ class FuzzyDuplicatesConfig(BaseConfig):
 
     # Minhash + LSH Config
     seed: int = 42
-    char_ngrams: int = 5
+    char_ngrams: int = 24
     num_buckets: int = 20
     hashes_per_bucket: int = 13
     use_64_bit_hash: bool = False
     buckets_per_shuffle: int = 1
 
-    false_positive_check: bool = True
-    # Only required for fp check
-    num_anchors: int = 2
-    jaccard_threshold: float = 0.8
-    bucket_mapping_blocksize: int = 256
-    parts_per_worker: int = 1
-    bucket_parts_per_worker: int = 8
+    false_positive_check: bool = False
+    # Only required for false positive check
+    num_anchors: Optional[int] = None
+    jaccard_threshold: Optional[float] = None
+    bucket_mapping_blocksize: Optional[int] = None
+    parts_per_worker: Optional[int] = None
+    bucket_parts_per_worker: Optional[int] = None
 
     def __post_init__(self):
         self.num_hashes = self.num_buckets * self.hashes_per_bucket
-        if self.cache_dir is None:
-            raise ValueError(
-                "Finding fuzzy duplicates requires a cache directory accessible via all workers to store intermediates"
-            )
+        false_positive_defaults = {
+            "num_anchors": 2,
+            "jaccard_threshold": 0.8,
+            "bucket_mapping_blocksize": 256,
+            "parts_per_worker": 1,
+            "bucket_parts_per_worker": 8,
+        }
         if self.false_positive_check:
             warnings.warn(
                 "Identifying false positives during the Minhash deduplication is computationally expensive."
                 " For improved performance consider setting this to False"
             )
-        if not self.false_positive_check and self.char_ngrams < 20:
-            warnings.warn(
-                "Using a small char_ngrams value might lead to a large number (~5%) of false positives during deduplication."
-                " Using a value of at least 20 for char_ngrams is recommended."
-            )
-        if self.num_anchors <= 0:
-            raise ValueError("Number of anchors must be greater than 0")
-        if self.num_anchors > 2:
-            warnings.warn(
-                "Using a higher number of anchor docs might lead to higher memory footprint and might impact performance",
-                category=UserWarning,
+            for arg, default in false_positive_defaults.items():
+                if getattr(self, arg) is None:
+                    setattr(self, arg, default)
+            if self.num_anchors <= 0:
+                raise ValueError("Number of anchors must be greater than 0")
+            if self.num_anchors > 2:
+                warnings.warn(
+                    "Using a higher number of anchor docs might lead to higher memory footprint and might impact performance",
+                    category=UserWarning,
+                )
+            if not 0 <= self.jaccard_threshold <= 1:
+                raise ValueError("Jaccard Threshold must be between [0,1]")
+        else:
+            if self.char_ngrams < 20:
+                warnings.warn(
+                    "Using a small char_ngrams value might lead to a large number (~5%) of false positives during deduplication."
+                    " Using a value of at least 20 for char_ngrams is recommended."
+                )
+            unused_false_positive_args = [
+                arg
+                for arg in false_positive_defaults.keys()
+                if getattr(self, arg) is not None
+            ]
+            if unused_false_positive_args:
+                warnings.warn(
+                    f"False positive check is disabled. Unused arguments {unused_false_positive_args} will be ignored",
+                    category=UserWarning,
+                )
+
+        if self.cache_dir is None:
+            raise ValueError(
+                "Finding fuzzy duplicates requires a cache directory accessible via all workers to store intermediates"
             )
-        if not 0 <= self.jaccard_threshold <= 1:
-            raise ValueError("Jaccard Threshold must be between [0,1]")
-        if self.buckets_per_shuffle <= 0:
-            raise ValueError("Buckets per shuffle must be greater than 0")
+        if not 1 <= self.buckets_per_shuffle <= self.num_buckets:
+            raise ValueError("Buckets per shuffle must be between [1, num_buckets]")
 
 
 @dataclass
diff --git a/nemo_curator/modules/fuzzy_dedup/minhash.py b/nemo_curator/modules/fuzzy_dedup/minhash.py
index b38b2268..28fa9aca 100644
--- a/nemo_curator/modules/fuzzy_dedup/minhash.py
+++ b/nemo_curator/modules/fuzzy_dedup/minhash.py
@@ -39,7 +39,7 @@ def __init__(
         self,
         seed: int = 42,
         num_hashes: int = 260,
-        char_ngrams: int = 5,
+        char_ngrams: int = 24,
         use_64bit_hash: bool = False,
         logger: Union[logging.LoggerAdapter, str] = "./",
         id_field: str = "id",
diff --git a/nemo_curator/scripts/fuzzy_deduplication/README.md b/nemo_curator/scripts/fuzzy_deduplication/README.md
index f5a43f40..63dcdb5c 100644
--- a/nemo_curator/scripts/fuzzy_deduplication/README.md
+++ b/nemo_curator/scripts/fuzzy_deduplication/README.md
@@ -2,98 +2,4 @@
 This directory consists of scripts that can be invoked directly via the command line for finding fuzzy duplicates from a group of Jsonl files consisting of text & unique ID's that are specifically formatted using the `add_id` script included as a part of NeMo-Curator.
 
 > [!IMPORTANT]
-> The scripts are helper utilities that wrap the fuzzy_dedup API for handling multiple jsonl directories and the id format generated by [add_id](../add_id.py). For most cases we recommend working with the fuzzy_deduplication API directly.
-
-### Usage
-1. Compute Minhashes
-    - Input: Data Directories
-    -  Output: minhashes.parquet for each data dir.
-    - Example call:
-    ```bash
-        # same as `python compute_minhashes.py`
-        gpu_compute_minhashes \
-          --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \
-          --output-minhash-dir /path/to/output_minhashes \
-          --input-json-text-field text_column_name \
-          --input-json-id-field id_column_name \
-          --minhash-length number_of_hashes \
-          --char-ngram char_ngram_size \
-          --hash-bytes 4(or 8 byte hashes) \
-          --seed 42 \
-          --log-dir ./
-          # --scheduler-file /path/to/file.json
-    ```
-2. Buckets (Minhash Buckets)
-    - Input: Minhash directories
-    - Output: Buckets.parquet
-    - Example call:
-    ```bash
-        # same as `python minhash_lsh.py`
-        minhash_buckets \
-          --input-data-dirs /path/to/output_minhashes/dir1 /path/to/output_minhashes/dir2 \
-          --output-bucket-dir /path/to/dedup_output \
-          --input-minhash-field _minhash_signature \
-          --input-json-id-field id_column_name \
-          --minhash-length number_of_hashes \
-          --num-bands num_bands \
-          --buckets-per-shuffle 1 `#Value b/w [1-num_bands]. Higher is better but might lead to oom` \
-          --log-dir ./
-          # --scheduler-file /path/to/file.json
-    ```
-3. Jaccard Map Buckets
-    - Input: Buckets.parquet + Data Dir
-    - Output: anchor_docs_with_bk.parquet
-    - Example call:
-    ```bash
-        # same as `python map_buckets.py`
-        jaccard_map_buckets \
-          --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \
-          --input-bucket-dir /path/to/dedup_output/_buckets.parquet \
-          --output-dir /path/to/dedup_output \
-          --input-json-text-field text_column_name \
-          --input-json-id-field id_column_name \
-          # --scheduler-file /path/to/file.json
-    ```
-4. Jaccard Shuffle
-    - Input: anchor_docs_with_bk.parquet + Data Dir
-    - Output: shuffled_docs.parquet
-    - Example call:
-    ```bash
-        # same as `python jaccard_shuffle.py`
-        jaccard_shuffle \
-          --input-data-dirs /path/to/jsonl/dir1 /path/to/jsonl/dir2 \
-          --input-bucket-mapping-dir /path/to/dedup_output/anchor_docs_with_bk.parquet \
-          --output-dir /path/to/dedup_output \
-          --input-json-text-field text_column_name \
-          --input-json-id-field id_column_name \
-          # --scheduler-file /path/to/file.json
-    ```
-5. Jaccard compute
-    - Input: Shuffled docs.parquet
-    - Output: jaccard_similarity_results.parquet
-    - Example call:
-    ```bash
-        # same as `python jaccard_compute.py`
-        jaccard_compute \
-          --shuffled-docs-path /path/to/dedup_output/shuffled_docs.parquet \
-          --output-dir /path/to/dedup_output \
-          --ngram-size char_ngram_size_for_similarity \
-          # --scheduler-file /path/to/file.json
-    ```
-6. Connected Components
-    - Input: jaccard_similarity_results.parquet
-    - Output: connected_components.parquet
-    - Example call:
-    ```bash
-        # same as `python connected_components.py`
-        gpu_connected_component \
-          --jaccard-pairs_path /path/to/dedup_output/jaccard_similarity_results.parquet \
-          --output-dir /path/to/dedup_output \
-          --cache-dir /path/to/cc_cache \
-          --jaccard-threshold 0.8
-          # --scheduler-file /path/to/file.json
-    ```
-
-> [!TIP]
-> When using these scripts in a multi-node environment (like Slurm, K8's etc.) it is recommended to start up a Dask cluster prior to execution and connect to the existing cluster via the `--scheduler-address` or `--scheduler-file` flag.
-> Use the `--help` flag to view all possible CLI options for the scripts and details on what they do.
+> The up-to-date documentation on running the fuzzy deduplication scripts can be found in the [NeMo Curator User Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/datacuration/gpudeduplication.html#id4). For most cases, we recommend using the Python API directly rather than the CLI scripts.
diff --git a/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py b/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py
index aa4e1f63..2a5b9d3b 100644
--- a/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py
+++ b/nemo_curator/scripts/fuzzy_deduplication/compute_minhashes.py
@@ -129,7 +129,7 @@ def attach_args():
     parser.add_argument(
         "--char-ngram",
         type=int,
-        default=5,
+        default=24,
         help="The number of consecutive characters to include in a sliding "
         "window when creating the document shingles for computing "
         "minhash signatures.",
diff --git a/tests/test_fuzzy_dedup.py b/tests/test_fuzzy_dedup.py
index 8b845394..e62ab91b 100644
--- a/tests/test_fuzzy_dedup.py
+++ b/tests/test_fuzzy_dedup.py
@@ -341,7 +341,7 @@ def test_fuzzy_dedup(
             num_buckets=num_buckets,
             hashes_per_bucket=1,
             use_64_bit_hash=use_64_bit_hash,
-            buckets_per_shuffle=5,
+            buckets_per_shuffle=3,
             false_positive_check=True,
             num_anchors=2,
             jaccard_threshold=jaccard_threshold,
@@ -375,6 +375,7 @@ def test_different_fields(self, fuzzy_dedup_data, tmpdir):
             false_positive_check=True,
             num_anchors=2,
             jaccard_threshold=0.39,
+            char_ngrams=5,
         )
         fuzzy_duplicates = FuzzyDuplicates(config=config)
         result = fuzzy_duplicates(fuzzy_dedup_data)
@@ -487,7 +488,7 @@ def test_no_fp_check(
             num_buckets=num_buckets,
             hashes_per_bucket=1,
             use_64_bit_hash=use_64_bit_hash,
-            buckets_per_shuffle=5,
+            buckets_per_shuffle=3,
             false_positive_check=False,
             num_anchors=2,
             jaccard_threshold=0.39,
@@ -575,11 +576,25 @@ def test_fuzzy_dedup_no_duplicates(
 class TestFuzzyDuplicatesConfig:
     def test_bad_inputs(self, tmpdir):
         with pytest.raises(ValueError):
-            FuzzyDuplicatesConfig(cache_dir=tmpdir, num_anchors=0)
+            FuzzyDuplicatesConfig(
+                cache_dir=tmpdir, num_anchors=0, false_positive_check=True
+            )
+            FuzzyDuplicatesConfig(
+                cache_dir=tmpdir, jaccard_threshold=1.2, false_positive_check=True
+            )
+            FuzzyDuplicatesConfig(cache_dir=tmpdir, buckets_per_shuffle=0)
+            FuzzyDuplicatesConfig(
+                cache_dir=tmpdir, buckets_per_shuffle=2, num_buckets=1
+            )
+            FuzzyDuplicatesConfig(
+                cache_dir=None, num_anchors=0, false_positive_check=True
+            )
         with pytest.warns(
             UserWarning, match="Using a higher number of anchor docs might"
         ):
-            FuzzyDuplicatesConfig(cache_dir=tmpdir, num_anchors=3)
+            FuzzyDuplicatesConfig(
+                cache_dir=tmpdir, num_anchors=3, false_positive_check=True
+            )
         with pytest.warns(
             UserWarning, match="Using a small char_ngrams value might lead"
         ):
@@ -591,10 +606,16 @@ def test_bad_inputs(self, tmpdir):
             match="Identifying false positives during the Minhash deduplication is computationally expensive",
         ):
             FuzzyDuplicatesConfig(cache_dir=tmpdir, false_positive_check=True)
-        with pytest.raises(ValueError):
-            FuzzyDuplicatesConfig(cache_dir=tmpdir, jaccard_threshold=1.2)
-        with pytest.raises(ValueError):
-            FuzzyDuplicatesConfig(cache_dir=tmpdir, buckets_per_shuffle=0)
+        with pytest.warns(
+            UserWarning,
+            match="False positive check is disabled. Unused arguments",
+        ):
+            FuzzyDuplicatesConfig(
+                cache_dir=tmpdir,
+                false_positive_check=False,
+                num_anchors=2,
+                jaccard_threshold=0.8,
+            )
 
     def test_from_yaml(self, tmpdir):
         yaml_params = {