Skip to content

Commit

Permalink
removed duplicates in pos_docs, Signed-off by [email protected]
Browse files (browse the repository at this point in the history)
Signed-off-by: Vinay Raman <[email protected]>
  • Loading branch information
vinay-raman committed Feb 11, 2025
1 parent 531b5ac commit 2082f16
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 6 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -72,8 +72,8 @@ def main():
raise ValueError("Output dir exists already, use a new file name!")

if args.input_dir:
input_dataset = DocumentDataset.read_json(args.input_dir)
# input_dataset = DocumentDataset.read_json(os.path.join(args.input_dir,"clustered_dataset"))
input_files = get_all_files_paths_under(args.input_dir, keep_extensions="part")
input_dataset = DocumentDataset.read_json(input_files)
else:
raise ValueError("provide input file path")

Expand All @@ -95,10 +95,10 @@ def main():
print("Time taken = {:.2f} s".format(time.time() - st_time))
print("Saving data in jsonl format ...")
mined_dataset.df.to_json(
os.path.join(args.output_dir, "mined_dataset"), lines=True, orient="records"
os.path.join(args.output_dir), lines=True, orient="records"
)


if __name__ == "__main__":
dask_client = get_client(cluster_type="cpu")
dask_client = get_client(cluster_type="gpu")
main()
Original file line number | Diff line number | Diff line change
Expand Up @@ -152,15 +152,17 @@ def _get_doc_embeddings(self, p_df: pd.DataFrame):
return p_df

def _groupby_question(self, pdf):
return pdf.groupby("question").agg({"documents": list})
pdf2 = pdf.groupby("question").agg({"documents": set})
pdf2["documents"] = pdf2["documents"].map(lambda x: list(x))
del pdf
return pdf2

def __call__(self, dataset: DocumentDataset) -> DocumentDataset:

df = dataset.df
df = df.to_backend("pandas")
df = df[["question", "documents"]]
df = df.map_partitions(self._groupby_question).reset_index()

print("Number partitions in dataset = {}".format(df.npartitions))

df["neg_doc_scores"] = ""
Expand Down

0 comments on commit 2082f16

Please sign in to comment.