diff --git a/docs/user-guide/gpudeduplication.rst b/docs/user-guide/gpudeduplication.rst index 990cb532..34759b5a 100644 --- a/docs/user-guide/gpudeduplication.rst +++ b/docs/user-guide/gpudeduplication.rst @@ -184,7 +184,7 @@ Python API from nemo_curator import FuzzyDuplicatesConfig config = FuzzyDuplicatesConfig( - cache_dir="/path/to/dedup_outputs", + cache_dir="/path/to/dedup_outputs", # must be cleared between runs id_field="my_id", text_field="text", seed=42, diff --git a/examples/fuzzy_deduplication.py b/examples/fuzzy_deduplication.py index 88fbb020..40f3fae2 100644 --- a/examples/fuzzy_deduplication.py +++ b/examples/fuzzy_deduplication.py @@ -31,7 +31,7 @@ def main(args): dataset_dir = "/path/to/dataset" log_dir = "./" - cache_dir = "./fuzzy_cache" + cache_dir = "./fuzzy_cache" # must be cleared between runs output_dir = "./output" dataset_id_field = "id" dataset_text_field = "text"