From dfd6b63dc71ace15858fe9a753389313fd6f34ef Mon Sep 17 00:00:00 2001
From: Sarah Yurick <sarahyurick@gmail.com>
Date: Fri, 6 Sep 2024 13:43:34 -0700
Subject: [PATCH 1/2] Remove hard-coded adlr_id default and require explicit ID field arguments

Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
---
 docs/user-guide/gpudeduplication.rst                        | 3 ++-
 docs/user-guide/kubernetescurator.rst                       | 2 +-
 nemo_curator/scripts/add_id.py                              | 4 ++--
 nemo_curator/scripts/filter_documents.py                    | 2 +-
 nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py | 2 +-
 nemo_curator/utils/script_utils.py                          | 4 ++--
 tests/test_download.py                                      | 2 +-
 7 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/docs/user-guide/gpudeduplication.rst b/docs/user-guide/gpudeduplication.rst
index bd7024f7..407c2319 100644
--- a/docs/user-guide/gpudeduplication.rst
+++ b/docs/user-guide/gpudeduplication.rst
@@ -41,10 +41,11 @@ This can be accomplished using the :code:`add_id` module within the NeMo Curator
 .. code-block:: bash
 
          add_id \
+           --id-field-name="my_id" \
            --input-data-dir=<Path to directory containing jsonl files> \
            --log-dir=./log/add_id
 
-By default, this will create a new field named :code:`adlr_id` within each json document which will have the form "doc_prefix-000001".
+This will create a new field named :code:`my_id` within each JSON document, whose value will have the form "doc_prefix-000001".
 If the dataset already has a unique ID this step can be skipped.
 
 **Note**: Fuzzy deduplication only works with numeric ID's or the specific ID format generated by the :code:`add_id` script. If the
diff --git a/docs/user-guide/kubernetescurator.rst b/docs/user-guide/kubernetescurator.rst
index cb29eef7..d695286a 100644
--- a/docs/user-guide/kubernetescurator.rst
+++ b/docs/user-guide/kubernetescurator.rst
@@ -243,7 +243,7 @@ At this point you can tail the logs and look for ``Finished!`` in ``/nemo-worksp
 
     # Writing to: /nemo-workspace/curator/script.log /nemo-workspace/curator/script.log.24_03_27-15-52-31
     # Computing hashes for /nemo-workspace/my_dataset
-    #                  adlr_id                           _hashes
+    #                       id                           _hashes
     # 0  cc-2023-14-0397113620  91b77eae49c10a65d485ac8ca18d6c43
     # 1  cc-2023-14-0397113621  a266f0794cc8ffbd431823e6930e4f80
     # 2  cc-2023-14-0397113622  baee533e2eddae764de2cd6faaa1286c
diff --git a/nemo_curator/scripts/add_id.py b/nemo_curator/scripts/add_id.py
index e8419297..fd0c5bc9 100644
--- a/nemo_curator/scripts/add_id.py
+++ b/nemo_curator/scripts/add_id.py
@@ -87,9 +87,9 @@ def attach_args(
     parser.add_argument(
         "--id-field-name",
         type=str,
-        default="adlr_id",
+        required=True,
         help="The name of the field that will contain the id value. "
-        "Default is 'adlr_id'",
+        "This is a required argument",
     )
     parser.add_argument(
         "--id-prefix",
diff --git a/nemo_curator/scripts/filter_documents.py b/nemo_curator/scripts/filter_documents.py
index 023d6d92..845d36d3 100644
--- a/nemo_curator/scripts/filter_documents.py
+++ b/nemo_curator/scripts/filter_documents.py
@@ -207,7 +207,7 @@ def attach_args(
     parser.add_argument(
         "--id-field",
         type=str,
-        default="adlr_id",
+        required=True,
         help="The name of the field within each object of the dataset "
         "file that assigns a unqiue ID to each document. "
         "If this is specified and found within the object, a list of all "
diff --git a/nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py b/nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py
index e46f36e2..871c4270 100644
--- a/nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py
+++ b/nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py
@@ -59,7 +59,7 @@ def main(args):
         id_fields=["dataset_id", "doc_id"],
         text_field=args.input_json_text_field,
         profile_dir=args.profile_path,
-        int_to_str_id="adlr_id",
+        int_to_str_id=args.input_json_id_field,
     )
     shuffle.shuffle_docs_on_buckets(
         documents_df=text_ddf,
diff --git a/nemo_curator/utils/script_utils.py b/nemo_curator/utils/script_utils.py
index 663970ed..cc503978 100644
--- a/nemo_curator/utils/script_utils.py
+++ b/nemo_curator/utils/script_utils.py
@@ -497,11 +497,11 @@ def parse_gpu_dedup_args(description: str) -> argparse.ArgumentParser:
         argumentHelper.parser.add_argument(
             "--input-json-id-field",
             type=str,
-            default="adlr_id",
+            required=True,
             help="The name of the field within each json object of the jsonl "
             "file that assigns a unqiue ID to each document. "
             "Can be created by running the script "
-            "'./prospector/add_id.py' which adds the field 'adlr_id' "
+            "'./prospector/add_id.py' which adds the field "
             "to the documents in a distributed fashion",
         )
         argumentHelper.parser.add_argument(
diff --git a/tests/test_download.py b/tests/test_download.py
index 8ad042d6..3a8897bc 100644
--- a/tests/test_download.py
+++ b/tests/test_download.py
@@ -124,7 +124,7 @@ def test_incorrect_snapshot_order_news(self):
             start_snapshot = "2021-10"
             urls = get_common_crawl_urls(start_snapshot, end_snapshot, news=True)
 
-    @pytest.mark.flaky(reruns=4)
+    @pytest.mark.flaky(reruns=5)
     def test_uneven_common_crawl_range(self):
         start_snapshot = "2021-03"
         end_snapshot = "2021-11"

From 59720ab309263dde68e1890b8d174254cb9befe7 Mon Sep 17 00:00:00 2001
From: Sarah Yurick <sarahyurick@gmail.com>
Date: Fri, 6 Sep 2024 15:37:07 -0700
Subject: [PATCH 2/2] add_id path

Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
---
 nemo_curator/utils/script_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo_curator/utils/script_utils.py b/nemo_curator/utils/script_utils.py
index cc503978..877d4c3f 100644
--- a/nemo_curator/utils/script_utils.py
+++ b/nemo_curator/utils/script_utils.py
@@ -501,7 +501,7 @@ def parse_gpu_dedup_args(description: str) -> argparse.ArgumentParser:
             help="The name of the field within each json object of the jsonl "
             "file that assigns a unqiue ID to each document. "
             "Can be created by running the script "
-            "'./prospector/add_id.py' which adds the field "
+            "'../scripts/add_id.py' which adds the field "
             "to the documents in a distributed fashion",
         )
         argumentHelper.parser.add_argument(