From dfd6b63dc71ace15858fe9a753389313fd6f34ef Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 6 Sep 2024 13:43:34 -0700 Subject: [PATCH 1/2] Remove hard-coded adlr_id default and require ID field arguments Signed-off-by: Sarah Yurick --- docs/user-guide/gpudeduplication.rst | 3 ++- docs/user-guide/kubernetescurator.rst | 2 +- nemo_curator/scripts/add_id.py | 4 ++-- nemo_curator/scripts/filter_documents.py | 2 +- nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py | 2 +- nemo_curator/utils/script_utils.py | 4 ++-- tests/test_download.py | 2 +- 7 files changed, 10 insertions(+), 9 deletions(-) diff --git a/docs/user-guide/gpudeduplication.rst b/docs/user-guide/gpudeduplication.rst index bd7024f7..407c2319 100644 --- a/docs/user-guide/gpudeduplication.rst +++ b/docs/user-guide/gpudeduplication.rst @@ -41,10 +41,11 @@ This can be accomplished using the :code:`add_id` module within the NeMo Curator .. code-block:: bash add_id \ + --id-field-name="my_id" \ --input-data-dir= \ --log-dir=./log/add_id -By default, this will create a new field named :code:`adlr_id` within each json document which will have the form "doc_prefix-000001". +This will create a new field named :code:`my_id` within each json document which will have the form "doc_prefix-000001". If the dataset already has a unique ID this step can be skipped. **Note**: Fuzzy deduplication only works with numeric ID's or the specific ID format generated by the :code:`add_id` script. 
If the diff --git a/docs/user-guide/kubernetescurator.rst b/docs/user-guide/kubernetescurator.rst index cb29eef7..d695286a 100644 --- a/docs/user-guide/kubernetescurator.rst +++ b/docs/user-guide/kubernetescurator.rst @@ -243,7 +243,7 @@ At this point you can tail the logs and look for ``Finished!`` in ``/nemo-worksp # Writing to: /nemo-workspace/curator/script.log /nemo-workspace/curator/script.log.24_03_27-15-52-31 # Computing hashes for /nemo-workspace/my_dataset - # adlr_id _hashes + # id _hashes # 0 cc-2023-14-0397113620 91b77eae49c10a65d485ac8ca18d6c43 # 1 cc-2023-14-0397113621 a266f0794cc8ffbd431823e6930e4f80 # 2 cc-2023-14-0397113622 baee533e2eddae764de2cd6faaa1286c diff --git a/nemo_curator/scripts/add_id.py b/nemo_curator/scripts/add_id.py index e8419297..fd0c5bc9 100644 --- a/nemo_curator/scripts/add_id.py +++ b/nemo_curator/scripts/add_id.py @@ -87,9 +87,9 @@ def attach_args( parser.add_argument( "--id-field-name", type=str, - default="adlr_id", + required=True, help="The name of the field that will contain the id value. " - "Default is 'adlr_id'", + "This is a required argument", ) parser.add_argument( "--id-prefix", diff --git a/nemo_curator/scripts/filter_documents.py b/nemo_curator/scripts/filter_documents.py index 023d6d92..845d36d3 100644 --- a/nemo_curator/scripts/filter_documents.py +++ b/nemo_curator/scripts/filter_documents.py @@ -207,7 +207,7 @@ def attach_args( parser.add_argument( "--id-field", type=str, - default="adlr_id", + required=True, help="The name of the field within each object of the dataset " "file that assigns a unqiue ID to each document. 
" "If this is specified and found within the object, a list of all " diff --git a/nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py b/nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py index e46f36e2..871c4270 100644 --- a/nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py +++ b/nemo_curator/scripts/fuzzy_deduplication/jaccard_shuffle.py @@ -59,7 +59,7 @@ def main(args): id_fields=["dataset_id", "doc_id"], text_field=args.input_json_text_field, profile_dir=args.profile_path, - int_to_str_id="adlr_id", + int_to_str_id=args.input_json_id_field, ) shuffle.shuffle_docs_on_buckets( documents_df=text_ddf, diff --git a/nemo_curator/utils/script_utils.py b/nemo_curator/utils/script_utils.py index 663970ed..cc503978 100644 --- a/nemo_curator/utils/script_utils.py +++ b/nemo_curator/utils/script_utils.py @@ -497,11 +497,11 @@ def parse_gpu_dedup_args(description: str) -> argparse.ArgumentParser: argumentHelper.parser.add_argument( "--input-json-id-field", type=str, - default="adlr_id", + required=True, help="The name of the field within each json object of the jsonl " "file that assigns a unqiue ID to each document. 
" "Can be created by running the script " - "'./prospector/add_id.py' which adds the field 'adlr_id' " + "'./prospector/add_id.py' which adds the field " "to the documents in a distributed fashion", ) argumentHelper.parser.add_argument( diff --git a/tests/test_download.py b/tests/test_download.py index 8ad042d6..3a8897bc 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -124,7 +124,7 @@ def test_incorrect_snapshot_order_news(self): start_snapshot = "2021-10" urls = get_common_crawl_urls(start_snapshot, end_snapshot, news=True) - @pytest.mark.flaky(reruns=4) + @pytest.mark.flaky(reruns=5) def test_uneven_common_crawl_range(self): start_snapshot = "2021-03" end_snapshot = "2021-11" From 59720ab309263dde68e1890b8d174254cb9befe7 Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Fri, 6 Sep 2024 15:37:07 -0700 Subject: [PATCH 2/2] add_id path Signed-off-by: Sarah Yurick --- nemo_curator/utils/script_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_curator/utils/script_utils.py b/nemo_curator/utils/script_utils.py index cc503978..877d4c3f 100644 --- a/nemo_curator/utils/script_utils.py +++ b/nemo_curator/utils/script_utils.py @@ -501,7 +501,7 @@ def parse_gpu_dedup_args(description: str) -> argparse.ArgumentParser: help="The name of the field within each json object of the jsonl " "file that assigns a unqiue ID to each document. " "Can be created by running the script " - "'./prospector/add_id.py' which adds the field " + "'../scripts/add_id.py' which adds the field " "to the documents in a distributed fashion", ) argumentHelper.parser.add_argument(