Skip to content

Commit

Permalink
enh: cythonize downsample_grid (~10x speed-up)
Browse files Browse the repository at this point in the history
  • Loading branch information
paulmueller committed Jan 3, 2024
1 parent 82caa22 commit 12c8326
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 59 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
0.56.4
- fix: integer overflow in downsample_grid
- enh: cythonize downsample_grid (~10x speed-up)
0.56.3
- fix: regression missing check for basin availability
0.56.2
Expand Down
147 changes: 90 additions & 57 deletions dclab/downsampling.py → dclab/downsampling.pyx
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
"""Content-based downsampling of ndarrays"""
import numpy as np
cimport numpy as cnp

from .cached import Cache

# We are using the numpy array API
cnp.import_array()
ctypedef cnp.uint8_t uint8
ctypedef cnp.uint32_t uint32
ctypedef cnp.int64_t int64

def downsample_rand(a, samples, remove_invalid=False, ret_idx=False):
"""Downsampling by randomly removing points
Expand Down Expand Up @@ -31,7 +37,7 @@ def downsample_rand(a, samples, remove_invalid=False, ret_idx=False):
rs = np.random.RandomState(seed=47).get_state()
np.random.set_state(rs)

samples = int(samples)
cdef uint32 samples_int = np.uint32(samples)

if remove_invalid:
# slice out nans and infs
Expand All @@ -40,10 +46,10 @@ def downsample_rand(a, samples, remove_invalid=False, ret_idx=False):
else:
pool = a

if samples and (samples < pool.shape[0]):
if samples_int and (samples_int < pool.shape[0]):
keep = np.zeros_like(pool, dtype=bool)
keep_ids = np.random.choice(np.arange(pool.size),
size=samples,
size=samples_int,
replace=False)
keep[keep_ids] = True
dsa = pool[keep]
Expand Down Expand Up @@ -96,30 +102,23 @@ def downsample_grid(a, b, samples, remove_invalid=False, ret_idx=False):
Only returned if `ret_idx` is True.
A boolean array such that `a[idx] == dsa`
"""
cdef uint32 samples_int = np.uint32(samples)

# fixed random state for this method
rs = np.random.RandomState(seed=47).get_state()

if remove_invalid:
# Remove nan and inf values straight from the beginning.
# This might result in arrays smaller than `samples`,
# but it makes sure that no inf/nan values will be plotted.
bad = np.isnan(a) | np.isinf(a) | np.isnan(b) | np.isinf(b)
ad = a[~bad]
bd = b[~bad]
else:
bad = np.zeros_like(a, dtype=bool)
ad = a
bd = b

keep = np.ones_like(a, dtype=bool)
keep[bad] = False
bad = np.isnan(a) | np.isinf(a) | np.isnan(b) | np.isinf(b)
good = ~bad

samples = int(samples)
# We are generally not interested in bad data.
keep[bad] = False
bd = b[good]
ad = a[good]

if samples and samples < ad.size:
# The events to keep
keepd = np.zeros_like(ad, dtype=bool)
cdef int64 diff

if samples_int and samples_int < ad.size:
# 1. Produce evenly distributed samples
# Choosing grid-size:
# - large numbers tend to show actual structures of the sample,
Expand All @@ -130,68 +129,102 @@ def downsample_grid(a, b, samples, remove_invalid=False, ret_idx=False):
# 300 is about the size of the plot in marker sizes and yields
# good results.
grid_size = 300
xpx = norm(ad, ad, bd) * grid_size
ypx = norm(bd, bd, ad) * grid_size
# The events on the grid to process
toproc = np.ones((grid_size, grid_size), dtype=bool)

for ii in range(xpx.size):
xi = xpx[ii]
yi = ypx[ii]
# filter for overlapping events
# Note that `valid` is used here to promote only valid
# events in this step. However, in step 2, invalid events
# could be added back. To avoid this scenario, the
# parameter `remove_invalid` should be set to True.
if valid(xi, yi) and toproc[int(xi-1), int(yi-1)]:
toproc[int(xi-1), int(yi-1)] = False
# include event
keepd[ii] = True
toproc = np.ones((grid_size, grid_size), dtype=np.uint8)

x_discrete = np.array(norm(ad) * (grid_size - 1), dtype=np.uint32)
y_discrete = np.array(norm(bd) * (grid_size - 1), dtype=np.uint32)

# The events to keep
keepd = np.zeros_like(ad, dtype=np.uint8)
populate_grid(x_discrete=x_discrete,
y_discrete=y_discrete,
toproc=toproc,
keepd=keepd
)

keepdb = np.array(keepd, dtype=bool)

# 2. Make sure that we reach `samples` by adding or
# removing events.
diff = np.sum(keepd) - samples
diff = np.sum(keepdb) - samples_int
if diff > 0:
# Too many samples
rem_indices = np.where(keepd)[0]
rem_indices = np.where(keepdb)[0]
np.random.set_state(rs)
rem = np.random.choice(rem_indices,
size=diff,
replace=False)
keepd[rem] = False
keepdb[rem] = False
elif diff < 0:
# Not enough samples
add_indices = np.where(~keepd)[0]
add_indices = np.where(~keepdb)[0]
np.random.set_state(rs)
add = np.random.choice(add_indices,
size=abs(diff),
replace=False)
keepd[add] = True

assert np.sum(keepd) == samples, "sanity check"
asd = ad[keepd]
bsd = bd[keepd]
assert np.allclose(ad[keepd], asd, equal_nan=True), "sanity check"
assert np.allclose(bd[keepd], bsd, equal_nan=True), "sanity check"
keep[~bad] = keepd
else:
asd = ad
bsd = bd
keepdb[add] = True

# paulmueller 2024-01-03
# assert np.sum(keepdb) <= samples_int, "sanity check"

keep[good] = keepdb

# paulmueller 2024-01-03
# assert np.sum(keep) <= samples_int, "sanity check"

if not remove_invalid:
diff_bad = (samples_int or keep.size) - np.sum(keep)
if diff_bad > 0:
# Add a few of the invalid values so that in the end
# we have the desired array size.
add_indices_bad = np.where(bad)[0]
np.random.set_state(rs)
add_bad = np.random.choice(add_indices_bad,
size=diff_bad,
replace=False)
keep[add_bad] = True

# paulmueller 2024-01-03
# if samples_int and not remove_invalid:
# assert np.sum(keep) == samples_int, "sanity check"

asd = a[keep]
bsd = b[keep]

if ret_idx:
return asd, bsd, keep
else:
return asd, bsd


def populate_grid(x_discrete, y_discrete, keepd, toproc):
    """Mark the first event that falls onto each free grid cell

    Parameters
    ----------
    x_discrete, y_discrete: 1D uint32 arrays
        Discretized (integer) grid coordinates of each event
    keepd: 1D uint8 array, same length as `x_discrete`
        Output array; entries are set to 1 for every event that
        claims a previously-free grid cell
    toproc: 2D uint8 array
        Grid of cells still to be processed (1 = free); a cell is
        set to 0 as soon as an event has been assigned to it

    Notes
    -----
    Events are processed in order; only the first event that maps
    to a given cell is kept, all later events in that cell are
    skipped.
    """
    # Py_ssize_t is the proper C type for Python array indices
    # (a plain C `int` would overflow for arrays > 2**31 elements).
    cdef Py_ssize_t iter_size = x_discrete.size
    cdef Py_ssize_t ii
    # Type the loop temporaries so the cell lookup stays in C
    cdef uint32 xi, yi
    cdef uint32[:] x_view = x_discrete
    cdef uint32[:] y_view = y_discrete
    # Numpy uses uint8 internally to represent boolean arrays
    cdef uint8[:] keepd_view = keepd
    cdef uint8[:, :] toproc_view = toproc

    for ii in range(iter_size):
        # filter for overlapping events
        xi = x_view[ii]
        yi = y_view[ii]
        if toproc_view[xi, yi]:
            toproc_view[xi, yi] = 0
            # include event
            keepd_view[ii] = 1


def valid(a, b):
    """Element-wise check that `a` and `b` are neither inf nor nan"""
    a_finite = ~(np.isnan(a) | np.isinf(a))
    b_finite = ~(np.isnan(b) | np.isinf(b))
    return a_finite & b_finite


def norm(a, ref1, ref2):
    """Min/max-normalize `a` against the finite part of `ref1`

    Only those elements of `ref1` enter the min/max computation for
    which both `ref1` and `ref2` are neither nan nor inf.
    """
    finite = ~(np.isnan(ref1) | np.isinf(ref1)
               | np.isnan(ref2) | np.isinf(ref2))
    reference = ref1[finite]
    lower = reference.min()
    upper = reference.max()
    return (a - lower) / (upper - lower)
def norm(a):
    """Normalize `a` to [0, 1] using its own min/max values"""
    lower = np.min(a)
    span = np.max(a) - lower
    return (a - lower) / span
4 changes: 4 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@

setup(
ext_modules=[
Extension("dclab.downsampling",
sources=["dclab/downsampling.pyx"],
include_dirs=[np.get_include(), "_shared"]
),
Extension("dclab.external.skimage._shared.geometry",
sources=["dclab/external/skimage/_shared/geometry.pyx"],
include_dirs=[np.get_include()]
Expand Down
32 changes: 30 additions & 2 deletions tests/test_downsampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,13 @@ def test_grid_nan():
b = np.arange(50, 150, dtype=float)
ads, bds, idx = downsampling.downsample_grid(a=a,
b=b,
samples=50,
samples=52,
ret_idx=True,
remove_invalid=False)
assert np.sum(idx) == 52
assert np.allclose(a[idx], ads, atol=1e-14, rtol=0, equal_nan=True)
assert np.allclose(b[idx], bds, atol=1e-14, rtol=0, equal_nan=True)
assert np.sum(np.isnan(ads)) == 1, "depends on random state"
assert np.sum(np.isnan(ads)) == 2, "2 more samples requested than non-nan"

ads2, bds2 = downsampling.downsample_grid(a=a,
b=b,
Expand All @@ -66,6 +67,33 @@ def test_grid_nan():
assert ads3.size == 50, "only 50 valid values"
assert np.all(ads3 == a[idx3])
assert np.all(bds3 == b[idx3])
assert np.all(bds3 == b[:50])


def test_grid_nan_keep_all():
    # With `samples=0` and `remove_invalid=False`, downsampling is a
    # no-op: every event, including the nan-valued tail, comes back.
    xdata = np.arange(100, dtype=float)
    xdata[50:] = np.nan
    ydata = np.arange(50, 150, dtype=float)
    xds, yds, idx = downsampling.downsample_grid(a=xdata,
                                                 b=ydata,
                                                 samples=0,
                                                 ret_idx=True,
                                                 remove_invalid=False)
    assert np.allclose(xdata, xds, equal_nan=True)
    assert np.allclose(ydata, yds, equal_nan=True)


def test_grid_nan_keep_all_except_invalid():
    # With `samples=0` and `remove_invalid=True`, all finite events
    # are kept while the nan-valued tail is removed.
    xdata = np.arange(100, dtype=float)
    xdata[50:] = np.nan
    ydata = np.arange(50, 150, dtype=float)
    xds, yds, idx = downsampling.downsample_grid(a=xdata,
                                                 b=ydata,
                                                 samples=0,
                                                 ret_idx=True,
                                                 remove_invalid=True)
    assert np.allclose(xdata[:50], xds, equal_nan=True)
    assert np.allclose(ydata[:50], yds, equal_nan=True)


def test_rand_nan():
Expand Down

0 comments on commit 12c8326

Please sign in to comment.