Skip to content

Commit

Permalink
Change the text hashing function to 64-bit when running in 64-bit mode
Browse files Browse the repository at this point in the history
  • Loading branch information
guipenedo committed Nov 3, 2023
1 parent 73e447d commit 7237cd6
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 2 deletions.
5 changes: 3 additions & 2 deletions src/datatrove/pipeline/dedup/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from datatrove.data import DocumentsPipeline
from datatrove.io import BaseInputDataFolder, BaseOutputDataFolder, InputDataFile
from datatrove.pipeline.base import PipelineStep
from datatrove.pipeline.dedup.utils import read_tuples_from_file, sha1_hash32, simplify_content
from datatrove.pipeline.dedup.utils import read_tuples_from_file, sha1_hash32, sha1_hash64, simplify_content
from datatrove.pipeline.writers.disk_base import DiskWriter
from datatrove.utils.typeshelper import StatHints

Expand Down Expand Up @@ -93,6 +93,7 @@ def __init__(
self.config = config
self.num_hashes = self.config.num_buckets * self.config.hashes_per_bucket
self._parameters = None
self._hash_func = sha1_hash32 if not self.config.use_64bit_hashes else sha1_hash64

@property
def parameters(self):
Expand Down Expand Up @@ -121,7 +122,7 @@ def set_up_dl_locks(self, dl_lock, up_lock):
def get_shingles(self, text):
return np.array(
[
[sha1_hash32(" ".join(x).encode("utf-8"))]
[self._hash_func(" ".join(x).encode("utf-8"))]
for x in ngrams(word_tokenize(simplify_content(text)), self.config.n_grams)
],
dtype=np.uint64,
Expand Down
12 changes: 12 additions & 0 deletions src/datatrove/pipeline/dedup/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,15 @@ def sha1_hash32(data):
int: an integer hash value that can be encoded using 32 bits.
"""
return struct.unpack("<I", hashlib.sha1(data).digest()[:4])[0]


def sha1_hash64(data):
    """A 64-bit hash function based on SHA1.

    The first 8 bytes of the SHA1 digest of ``data`` are interpreted as an
    unsigned little-endian integer.

    Args:
        data (bytes): the data to generate 64-bit integer hash from.

    Returns:
        int: an integer hash value that can be encoded using 64 bits.
    """
    # "<Q" = little-endian unsigned 64-bit; the digest is 20 bytes, so only
    # its leading 8 bytes are used.
    return struct.unpack("<Q", hashlib.sha1(data).digest()[:8])[0]

0 comments on commit 7237cd6

Please sign in to comment.