Skip to content

Commit

Permalink
Merge pull request NVIDIA#229 from sarahyurick/adlr_id
Browse files Browse the repository at this point in the history
Clean up ID logic
  • Loading branch information
sarahyurick authored Sep 9, 2024
2 parents b1562bf + 59720ab commit 658d397
Show file tree
Hide file tree
Showing 7 changed files with 10 additions and 9 deletions.
3 changes: 2 additions & 1 deletion docs/user-guide/gpudeduplication.rst
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,11 @@ This can be accomplished using the :code:`add_id` module within the NeMo Curator
.. code-block:: bash
add_id \
--id-field-name="my_id" \
--input-data-dir=<Path to directory containing jsonl files> \
--log-dir=./log/add_id
By default, this will create a new field named :code:`adlr_id` within each json document which will have the form "doc_prefix-000001".
This will create a new field named :code:`my_id` within each json document which will have the form "doc_prefix-000001".
If the dataset already has a unique ID, this step can be skipped.

**Note**: Fuzzy deduplication only works with numeric IDs or the specific ID format generated by the :code:`add_id` script. If the
Expand Down
2 changes: 1 addition & 1 deletion docs/user-guide/kubernetescurator.rst
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ At this point you can tail the logs and look for ``Finished!`` in ``/nemo-worksp
# Writing to: /nemo-workspace/curator/script.log /nemo-workspace/curator/script.log.24_03_27-15-52-31
# Computing hashes for /nemo-workspace/my_dataset
# adlr_id _hashes
# id _hashes
# 0 cc-2023-14-0397113620 91b77eae49c10a65d485ac8ca18d6c43
# 1 cc-2023-14-0397113621 a266f0794cc8ffbd431823e6930e4f80
# 2 cc-2023-14-0397113622 baee533e2eddae764de2cd6faaa1286c
Expand Down
4 changes: 2 additions & 2 deletions nemo_curator/scripts/add_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,9 @@ def attach_args(
parser.add_argument(
"--id-field-name",
type=str,
default="adlr_id",
required=True,
help="The name of the field that will contain the id value. "
"Default is 'adlr_id'",
"This is a required argument",
)
parser.add_argument(
"--id-prefix",
Expand Down
2 changes: 1 addition & 1 deletion nemo_curator/scripts/filter_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def attach_args(
parser.add_argument(
"--id-field",
type=str,
default="adlr_id",
required=True,
help="The name of the field within each object of the dataset "
"file that assigns a unqiue ID to each document. "
"If this is specified and found within the object, a list of all "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def main(args):
id_fields=["dataset_id", "doc_id"],
text_field=args.input_json_text_field,
profile_dir=args.profile_path,
int_to_str_id="adlr_id",
int_to_str_id=args.input_json_id_field,
)
shuffle.shuffle_docs_on_buckets(
documents_df=text_ddf,
Expand Down
4 changes: 2 additions & 2 deletions nemo_curator/utils/script_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,11 +518,11 @@ def parse_gpu_dedup_args(description: str) -> argparse.ArgumentParser:
argumentHelper.parser.add_argument(
"--input-json-id-field",
type=str,
default="adlr_id",
required=True,
help="The name of the field within each json object of the jsonl "
"file that assigns a unqiue ID to each document. "
"Can be created by running the script "
"'./prospector/add_id.py' which adds the field 'adlr_id' "
"'../scripts/add_id.py' which adds the field "
"to the documents in a distributed fashion",
)
argumentHelper.parser.add_argument(
Expand Down
2 changes: 1 addition & 1 deletion tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def test_incorrect_snapshot_order_news(self):
start_snapshot = "2021-10"
urls = get_common_crawl_urls(start_snapshot, end_snapshot, news=True)

@pytest.mark.flaky(reruns=4)
@pytest.mark.flaky(reruns=5)
def test_uneven_common_crawl_range(self):
start_snapshot = "2021-03"
end_snapshot = "2021-11"
Expand Down

0 comments on commit 658d397

Please sign in to comment.