Skip to content

Commit

Permalink
Better hashing algorithm
Browse files (browse the repository at this point in the history)
  • Loading branch information
bouweandela committed Sep 3, 2024
1 parent 44ddcda commit cf1ea3d
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 11 deletions.
23 changes: 12 additions & 11 deletions lib/iris/_concatenate.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,18 +305,19 @@ def _hash_ndarray(a: np.ndarray) -> np.ndarray:
An array of shape (1,) containing the hash value.
"""
# Fill masked arrays with the default fill_value so different values under
# the mask and the value of the fill_value attribute do not affect the hash.
if isinstance(a, np.ma.MaskedArray):
a = np.ma.masked_array(
a.filled(np.ma.default_fill_value(a.dtype)),
mask=a.mask,
shrink=False,
)
# Hash the bytes representing the array data.
hash = xxh3_64(a.data.tobytes())
# Include the array dtype as it is not preserved by `ndarray.tobytes()`.
hash.update(str(a.dtype).encode("utf-8"))
hash = xxh3_64(f"dtype={a.dtype}".encode("utf-8"))

# Hash the bytes representing the array data.
hash.update(b"data=")
if isinstance(a, np.ma.MaskedArray):
# Hash only the unmasked data
hash.update(a.compressed().tobytes())
# Hash the mask
hash.update(b"mask=")
hash.update(a.mask.tobytes())
else:
hash.update(a.tobytes())
return np.frombuffer(hash.digest(), dtype=np.int64)


Expand Down
5 changes: 5 additions & 0 deletions lib/iris/tests/unit/concatenate/test_hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@
np.ma.array([1, 3], mask=[0, 1], fill_value=10),
True,
),
(
np.ma.masked_array([1], mask=[True]),
np.array([np.ma.default_fill_value(np.dtype("int64"))]),
False,
),
(np.array(["a", "b"]), np.array(["a", "b"]), True),
(np.array(["a"]), np.array(["b"]), False),
(da.asarray(["a", "b"], chunks=1), da.asarray(["a", "b"], chunks=1), True),
Expand Down

0 comments on commit cf1ea3d

Please sign in to comment.