Skip to content

Commit

Permalink
small perf improvement
Browse files (browse the repository at this point in the history)
  • Loading branch information
guipenedo committed Nov 2, 2023
1 parent abfc71c commit 6043478
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions src/datatrove/pipeline/dedup/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ def read_sigs(file: InputDataFile, reader_id: int, config: MinhashConfig, index_
for data in read_tuples_from_file(file, f"{config.hashes_per_bucket}{config.hash_format}"):
yield HashSig(sig=data, doc_id=-1, file_id=-1, reader_id=reader_id)
else:
- for *data, doc_id in read_tuples_from_file(file, f"{config.hashes_per_bucket}{config.hash_format}", "I"):
- yield HashSig(sig=tuple(data), doc_id=doc_id, file_id=reader_id, reader_id=reader_id)
+ for data in read_tuples_from_file(file, f"{config.hashes_per_bucket}{config.hash_format}", "I"):
+ yield HashSig(sig=data[:-1], doc_id=data[-1], file_id=reader_id, reader_id=reader_id)


class MinhashDedupSignature(PipelineStep):
Expand Down

0 comments on commit 6043478

Please sign in to comment.