From cf1ea3d6e6bcc727fa5b304bf6126dc36578b5b6 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Tue, 3 Sep 2024 09:25:40 +0200 Subject: [PATCH] Better hashing algorithm --- lib/iris/_concatenate.py | 23 ++++++++++--------- .../tests/unit/concatenate/test_hashing.py | 5 ++++ 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/lib/iris/_concatenate.py b/lib/iris/_concatenate.py index 0d8fdd2ff3..90f2438742 100644 --- a/lib/iris/_concatenate.py +++ b/lib/iris/_concatenate.py @@ -305,18 +305,19 @@ def _hash_ndarray(a: np.ndarray) -> np.ndarray: An array of shape (1,) containing the hash value. """ - # Fill masked arrays with the default fill_value so different values under - # the mask and the value of the fill_value attribute do not affect the hash. - if isinstance(a, np.ma.MaskedArray): - a = np.ma.masked_array( - a.filled(np.ma.default_fill_value(a.dtype)), - mask=a.mask, - shrink=False, - ) - # Hash the bytes representing the array data. - hash = xxh3_64(a.data.tobytes()) # Include the array dtype as it is not preserved by `ndarray.tobytes()`. - hash.update(str(a.dtype).encode("utf-8")) + hash = xxh3_64(f"dtype={a.dtype}".encode("utf-8")) + + # Hash the bytes representing the array data. + hash.update(b"data=") + if isinstance(a, np.ma.MaskedArray): + # Hash only the unmasked data + hash.update(a.compressed().tobytes()) + # Hash the mask + hash.update(b"mask=") + hash.update(a.mask.tobytes()) + else: + hash.update(a.tobytes()) return np.frombuffer(hash.digest(), dtype=np.int64) diff --git a/lib/iris/tests/unit/concatenate/test_hashing.py b/lib/iris/tests/unit/concatenate/test_hashing.py index bed2a7d741..7a56be1db8 100644 --- a/lib/iris/tests/unit/concatenate/test_hashing.py +++ b/lib/iris/tests/unit/concatenate/test_hashing.py @@ -38,6 +38,11 @@ np.ma.array([1, 3], mask=[0, 1], fill_value=10), True, ), + ( + np.ma.masked_array([1], mask=[True]), + np.array([np.ma.default_fill_value(np.dtype("int64"))]), + False, + ), (np.array(["a", "b"]), np.array(["a", "b"]), True), (np.array(["a"]), np.array(["b"]), False), (da.asarray(["a", "b"], chunks=1), da.asarray(["a", "b"], chunks=1), True),