From 85916c5b517913b220a1093d1dbd3a6ca702ebb8 Mon Sep 17 00:00:00 2001
From: Ryan Wolf
Date: Tue, 17 Sep 2024 10:27:45 -0700
Subject: [PATCH] Fix id field

Signed-off-by: Ryan Wolf
---
 nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py b/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py
index 40c42ec9..e434f62a 100644
--- a/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py
+++ b/nemo_curator/scripts/fuzzy_deduplication/minhash_lsh.py
@@ -54,7 +54,7 @@ def main(args):
             dask_cudf.read_parquet(data_path, blocksize="2GB", aggregate_files=True)
         )
     df = dask_cudf.concat(dfs, ignore_unknown_divisions=True)
-    df = df[~df.id_field.isna()]
+    df = df[~df[id_field].isna()]
     df = df.map_partitions(
         convert_str_id_to_int,
         id_column=id_field,