diff --git a/nemo_curator/modules/semantic_dedup.py b/nemo_curator/modules/semantic_dedup.py index 6d01af4f..c8a96774 100644 --- a/nemo_curator/modules/semantic_dedup.py +++ b/nemo_curator/modules/semantic_dedup.py @@ -338,11 +338,10 @@ def __call__(self, embeddings_dataset: DocumentDataset): with performance_report_if_with_ts_suffix(self.profile_dir, "clustering-model"): embeddings_df = embeddings_df[[self.id_col, self.embedding_col]] - - embeddings_df = embeddings_df.to_backend("pandas").persist() embeddings_df = embeddings_df.repartition( partition_size=self.partition_size ) + embeddings_df = embeddings_df.to_backend("pandas").persist() embeddings_df = embeddings_df.to_backend("cudf") cupy_darr = embeddings_df.map_partitions( @@ -362,7 +361,6 @@ def __call__(self, embeddings_dataset: DocumentDataset): t0 = time.time() nearest_cents = kmeans.predict(cupy_darr) self.logger.info(f"Time taken for KMeans Predict: {time.time() - t0}") - t0 = time.time() embeddings_df["nearest_cent"] = nearest_cents.astype(np.int32) del nearest_cents diff --git a/tests/test_semdedup.py b/tests/test_semdedup.py index a2436ca5..4cc66901 100644 --- a/tests/test_semdedup.py +++ b/tests/test_semdedup.py @@ -55,7 +55,6 @@ def gpu_client(self, request): request.cls.cluster = cluster yield - @pytest.mark.skip(reason="TODO: Hangs indefinitely with RAPIDS 24.12") def test_sem_dedup( self, dedup_data,