diff --git a/tutorials/nemo-retriever-synthetic-data-generation/mine_hard_negatives.py b/tutorials/nemo-retriever-synthetic-data-generation/mine_hard_negatives.py index eed48a6d..a7d085b4 100644 --- a/tutorials/nemo-retriever-synthetic-data-generation/mine_hard_negatives.py +++ b/tutorials/nemo-retriever-synthetic-data-generation/mine_hard_negatives.py @@ -72,8 +72,8 @@ def main(): raise ValueError("Output dir exists already, use a new file name!") if args.input_dir: - input_dataset = DocumentDataset.read_json(args.input_dir) - # input_dataset = DocumentDataset.read_json(os.path.join(args.input_dir,"clustered_dataset")) + input_files = get_all_files_paths_under(args.input_dir, keep_extensions="part") + input_dataset = DocumentDataset.read_json(input_files) else: raise ValueError("provide input file path") @@ -95,10 +95,10 @@ def main(): print("Time taken = {:.2f} s".format(time.time() - st_time)) print("Saving data in jsonl format ...") mined_dataset.df.to_json( - os.path.join(args.output_dir, "mined_dataset"), lines=True, orient="records" + os.path.join(args.output_dir), lines=True, orient="records" ) if __name__ == "__main__": - dask_client = get_client(cluster_type="cpu") + dask_client = get_client(cluster_type="gpu") main() diff --git a/tutorials/nemo-retriever-synthetic-data-generation/retriever_hardnegative_miner.py b/tutorials/nemo-retriever-synthetic-data-generation/retriever_hardnegative_miner.py index b52eaee3..54215e6b 100644 --- a/tutorials/nemo-retriever-synthetic-data-generation/retriever_hardnegative_miner.py +++ b/tutorials/nemo-retriever-synthetic-data-generation/retriever_hardnegative_miner.py @@ -152,7 +152,10 @@ def _get_doc_embeddings(self, p_df: pd.DataFrame): return p_df def _groupby_question(self, pdf): - return pdf.groupby("question").agg({"documents": list}) + pdf2 = pdf.groupby("question").agg({"documents": set}) + pdf2["documents"] = pdf2["documents"].map(lambda x: list(x)) + del pdf + return pdf2 def __call__(self, dataset: DocumentDataset) -> DocumentDataset: @@ -160,7 +163,6 @@ def __call__(self, dataset: DocumentDataset) -> DocumentDataset: df = df.to_backend("pandas") df = df[["question", "documents"]] df = df.map_partitions(self._groupby_question).reset_index() - print("Number partitions in dataset = {}".format(df.npartitions)) df["neg_doc_scores"] = ""