Merge pull request NVIDIA#229 from sarahyurick/adlr_id

Clean up ID logic
sarahyurick · Sep 9, 2024 · 658d397 · 658d397
2 parents b1562bf + 59720ab
commit 658d397
Show file tree

Hide file tree

Showing 7 changed files with 10 additions and 9 deletions.
diff --git a/docs/user-guide/gpudeduplication.rst b/docs/user-guide/gpudeduplication.rst
@@ -41,10 +41,11 @@ This can be accomplished using the :code:`add_id` module within the NeMo Curator
 .. code-block:: bash
 
          add_id \
+           --id-field-name="my_id" \
            --input-data-dir=<Path to directory containing jsonl files> \
            --log-dir=./log/add_id
 
-By default, this will create a new field named :code:`adlr_id` within each json document which will have the form "doc_prefix-000001".
+This will create a new field named :code:`my_id` within each JSON document, whose value will have the form "doc_prefix-000001".
 If the dataset already has a unique ID, this step can be skipped.
 
 **Note**: Fuzzy deduplication only works with numeric IDs or the specific ID format generated by the :code:`add_id` script. If the

diff --git a/docs/user-guide/kubernetescurator.rst b/docs/user-guide/kubernetescurator.rst
@@ -243,7 +243,7 @@ At this point you can tail the logs and look for ``Finished!`` in ``/nemo-worksp
 
     # Writing to: /nemo-workspace/curator/script.log /nemo-workspace/curator/script.log.24_03_27-15-52-31
     # Computing hashes for /nemo-workspace/my_dataset
-    #                  adlr_id                           _hashes
+    #                       id                           _hashes
     # 0  cc-2023-14-0397113620  91b77eae49c10a65d485ac8ca18d6c43
     # 1  cc-2023-14-0397113621  a266f0794cc8ffbd431823e6930e4f80
     # 2  cc-2023-14-0397113622  baee533e2eddae764de2cd6faaa1286c

diff --git a/nemo_curator/scripts/add_id.py b/nemo_curator/scripts/add_id.py
@@ -88,9 +88,9 @@ def attach_args(
     parser.add_argument(
         "--id-field-name",
         type=str,
-        default="adlr_id",
+        required=True,
         help="The name of the field that will contain the id value. "
-        "Default is 'adlr_id'",
+        "This is a required argument",
     )
     parser.add_argument(
         "--id-prefix",

diff --git a/nemo_curator/scripts/filter_documents.py b/nemo_curator/scripts/filter_documents.py
@@ -207,7 +207,7 @@ def attach_args(
     parser.add_argument(
         "--id-field",
         type=str,
-        default="adlr_id",
+        required=True,
         help="The name of the field within each object of the dataset "
         "file that assigns a unqiue ID to each document. "
         "If this is specified and found within the object, a list of all "

diff --git a/nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py b/nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py
@@ -59,7 +59,7 @@ def main(args):
         id_fields=["dataset_id", "doc_id"],
         text_field=args.input_json_text_field,
         profile_dir=args.profile_path,
-        int_to_str_id="adlr_id",
+        int_to_str_id=args.input_json_id_field,
     )
     shuffle.shuffle_docs_on_buckets(
         documents_df=text_ddf,

diff --git a/nemo_curator/utils/script_utils.py b/nemo_curator/utils/script_utils.py
@@ -518,11 +518,11 @@ def parse_gpu_dedup_args(description: str) -> argparse.ArgumentParser:
         argumentHelper.parser.add_argument(
             "--input-json-id-field",
             type=str,
-            default="adlr_id",
+            required=True,
             help="The name of the field within each json object of the jsonl "
             "file that assigns a unqiue ID to each document. "
             "Can be created by running the script "
-            "'./prospector/add_id.py' which adds the field 'adlr_id' "
+            "'../scripts/add_id.py' which adds the field "
             "to the documents in a distributed fashion",
         )
         argumentHelper.parser.add_argument(

diff --git a/tests/test_download.py b/tests/test_download.py
@@ -124,7 +124,7 @@ def test_incorrect_snapshot_order_news(self):
             start_snapshot = "2021-10"
             urls = get_common_crawl_urls(start_snapshot, end_snapshot, news=True)
 
-    @pytest.mark.flaky(reruns=4)
+    @pytest.mark.flaky(reruns=5)
     def test_uneven_common_crawl_range(self):
         start_snapshot = "2021-03"
         end_snapshot = "2021-11"