Skip to content

Commit

Permalink
enh: cythonize downsample_grid (~10x speed-up)
Browse files Browse the repository at this point in the history
  • Loading branch information
paulmueller committed Jan 3, 2024
1 parent 82caa22 commit 12c8326
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 59 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
0.56.4
- fix: integer overflow in downsample_grid
- enh: cythonize downsample_grid (~10x speed-up)
0.56.3
- fix: regression missing check for basin availability
0.56.2
Expand Down
147 changes: 90 additions & 57 deletions dclab/downsampling.py → dclab/downsampling.pyx
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
"""Content-based downsampling of ndarrays"""
import numpy as np
cimport numpy as cnp

from .cached import Cache

# We are using the numpy array API
cnp.import_array()
ctypedef cnp.uint8_t uint8
ctypedef cnp.uint32_t uint32
ctypedef cnp.int64_t int64

def downsample_rand(a, samples, remove_invalid=False, ret_idx=False):
"""Downsampling by randomly removing points
Expand Down Expand Up @@ -31,7 +37,7 @@ def downsample_rand(a, samples, remove_invalid=False, ret_idx=False):
rs = np.random.RandomState(seed=47).get_state()
np.random.set_state(rs)

samples = int(samples)
cdef uint32 samples_int = np.uint32(samples)

if remove_invalid:
# slice out nans and infs
Expand All @@ -40,10 +46,10 @@ def downsample_rand(a, samples, remove_invalid=False, ret_idx=False):
else:
pool = a

if samples and (samples < pool.shape[0]):
if samples_int and (samples_int < pool.shape[0]):
keep = np.zeros_like(pool, dtype=bool)
keep_ids = np.random.choice(np.arange(pool.size),
size=samples,
size=samples_int,
replace=False)
keep[keep_ids] = True
dsa = pool[keep]
Expand Down Expand Up @@ -96,30 +102,23 @@ def downsample_grid(a, b, samples, remove_invalid=False, ret_idx=False):
Only returned if `ret_idx` is True.
A boolean array such that `a[idx] == dsa`
"""
cdef uint32 samples_int = np.uint32(samples)

# fixed random state for this method
rs = np.random.RandomState(seed=47).get_state()

if remove_invalid:
# Remove nan and inf values straight from the beginning.
# This might result in arrays smaller than `samples`,
# but it makes sure that no inf/nan values will be plotted.
bad = np.isnan(a) | np.isinf(a) | np.isnan(b) | np.isinf(b)
ad = a[~bad]
bd = b[~bad]
else:
bad = np.zeros_like(a, dtype=bool)
ad = a
bd = b

keep = np.ones_like(a, dtype=bool)
keep[bad] = False
bad = np.isnan(a) | np.isinf(a) | np.isnan(b) | np.isinf(b)
good = ~bad

samples = int(samples)
# We are generally not interested in bad data.
keep[bad] = False
bd = b[good]
ad = a[good]

if samples and samples < ad.size:
# The events to keep
keepd = np.zeros_like(ad, dtype=bool)
cdef int64 diff

if samples_int and samples_int < ad.size:
# 1. Produce evenly distributed samples
# Choosing grid-size:
# - large numbers tend to show actual structures of the sample,
Expand All @@ -130,68 +129,102 @@ def downsample_grid(a, b, samples, remove_invalid=False, ret_idx=False):
# 300 is about the size of the plot in marker sizes and yields
# good results.
grid_size = 300
xpx = norm(ad, ad, bd) * grid_size
ypx = norm(bd, bd, ad) * grid_size
# The events on the grid to process
toproc = np.ones((grid_size, grid_size), dtype=bool)

for ii in range(xpx.size):
xi = xpx[ii]
yi = ypx[ii]
# filter for overlapping events
# Note that `valid` is used here to promote only valid
# events in this step. However, in step 2, invalid events
# could be added back. To avoid this scenario, the
# parameter `remove_invalid` should be set to True.
if valid(xi, yi) and toproc[int(xi-1), int(yi-1)]:
toproc[int(xi-1), int(yi-1)] = False
# include event
keepd[ii] = True
toproc = np.ones((grid_size, grid_size), dtype=np.uint8)

x_discrete = np.array(norm(ad) * (grid_size - 1), dtype=np.uint32)
y_discrete = np.array(norm(bd) * (grid_size - 1), dtype=np.uint32)

# The events to keep
keepd = np.zeros_like(ad, dtype=np.uint8)
populate_grid(x_discrete=x_discrete,
y_discrete=y_discrete,
toproc=toproc,
keepd=keepd
)

keepdb = np.array(keepd, dtype=bool)

# 2. Make sure that we reach `samples` by adding or
# removing events.
diff = np.sum(keepd) - samples
diff = np.sum(keepdb) - samples_int
if diff > 0:
# Too many samples
rem_indices = np.where(keepd)[0]
rem_indices = np.where(keepdb)[0]
np.random.set_state(rs)
rem = np.random.choice(rem_indices,
size=diff,
replace=False)
keepd[rem] = False
keepdb[rem] = False
elif diff < 0:
# Not enough samples
add_indices = np.where(~keepd)[0]
add_indices = np.where(~keepdb)[0]
np.random.set_state(rs)
add = np.random.choice(add_indices,
size=abs(diff),
replace=False)
keepd[add] = True

assert np.sum(keepd) == samples, "sanity check"
asd = ad[keepd]
bsd = bd[keepd]
assert np.allclose(ad[keepd], asd, equal_nan=True), "sanity check"
assert np.allclose(bd[keepd], bsd, equal_nan=True), "sanity check"
keep[~bad] = keepd
else:
asd = ad
bsd = bd
keepdb[add] = True

# paulmueller 2024-01-03
# assert np.sum(keepdb) <= samples_int, "sanity check"

keep[good] = keepdb

# paulmueller 2024-01-03
# assert np.sum(keep) <= samples_int, "sanity check"

if not remove_invalid:
diff_bad = (samples_int or keep.size) - np.sum(keep)
if diff_bad > 0:
# Add a few of the invalid values so that in the end
# we have the desired array size.
add_indices_bad = np.where(bad)[0]
np.random.set_state(rs)
add_bad = np.random.choice(add_indices_bad,
size=diff_bad,
replace=False)
keep[add_bad] = True

# paulmueller 2024-01-03
# if samples_int and not remove_invalid:
# assert np.sum(keep) == samples_int, "sanity check"

asd = a[keep]
bsd = b[keep]

if ret_idx:
return asd, bsd, keep
else:
return asd, bsd


def populate_grid(x_discrete, y_discrete, keepd, toproc):
    """Mark the first event that falls onto each free grid cell

    Parameters
    ----------
    x_discrete, y_discrete: 1D uint32 arrays
        Discretized (integer) grid coordinates of each event
    keepd: 1D uint8 array, same length as `x_discrete`
        Output array; entries are set to 1 for every event that
        claims a previously-free grid cell
    toproc: 2D uint8 array
        Grid of cells still to be processed (1 = free); a cell is
        set to 0 as soon as an event has been assigned to it

    Notes
    -----
    Events are processed in order; only the first event that maps
    to a given cell is kept, all later events in that cell are
    skipped.
    """
    # Py_ssize_t is the proper C type for Python array indices
    # (a plain C `int` would overflow for arrays > 2**31 elements).
    cdef Py_ssize_t iter_size = x_discrete.size
    cdef Py_ssize_t ii
    # Type the loop temporaries so the cell lookup stays in C
    cdef uint32 xi, yi
    cdef uint32[:] x_view = x_discrete
    cdef uint32[:] y_view = y_discrete
    # Numpy uses uint8 internally to represent boolean arrays
    cdef uint8[:] keepd_view = keepd
    cdef uint8[:, :] toproc_view = toproc

    for ii in range(iter_size):
        # filter for overlapping events
        xi = x_view[ii]
        yi = y_view[ii]
        if toproc_view[xi, yi]:
            toproc_view[xi, yi] = 0
            # include event
            keepd_view[ii] = 1


def valid(a, b):
    """Element-wise check that `a` and `b` are neither inf nor nan"""
    a_finite = ~(np.isnan(a) | np.isinf(a))
    b_finite = ~(np.isnan(b) | np.isinf(b))
    return a_finite & b_finite


def norm(a, ref1, ref2):
    """Min/max-normalize `a` against the finite part of `ref1`

    Only those elements of `ref1` enter the min/max computation for
    which both `ref1` and `ref2` are neither nan nor inf.
    """
    finite = ~(np.isnan(ref1) | np.isinf(ref1)
               | np.isnan(ref2) | np.isinf(ref2))
    reference = ref1[finite]
    lower = reference.min()
    upper = reference.max()
    return (a - lower) / (upper - lower)
def norm(a):
    """Normalize `a` to [0, 1] using its own min/max values"""
    lower = np.min(a)
    span = np.max(a) - lower
    return (a - lower) / span
4 changes: 4 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@

setup(
ext_modules=[
Extension("dclab.downsampling",
sources=["dclab/downsampling.pyx"],
include_dirs=[np.get_include(), "_shared"]
),
Extension("dclab.external.skimage._shared.geometry",
sources=["dclab/external/skimage/_shared/geometry.pyx"],
include_dirs=[np.get_include()]
Expand Down
32 changes: 30 additions & 2 deletions tests/test_downsampling.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,13 @@ def test_grid_nan():
b = np.arange(50, 150, dtype=float)
ads, bds, idx = downsampling.downsample_grid(a=a,
b=b,
samples=50,
samples=52,
ret_idx=True,
remove_invalid=False)
assert np.sum(idx) == 52
assert np.allclose(a[idx], ads, atol=1e-14, rtol=0, equal_nan=True)
assert np.allclose(b[idx], bds, atol=1e-14, rtol=0, equal_nan=True)
assert np.sum(np.isnan(ads)) == 1, "depends on random state"
assert np.sum(np.isnan(ads)) == 2, "2 more samples requested than non-nan"

ads2, bds2 = downsampling.downsample_grid(a=a,
b=b,
Expand All @@ -66,6 +67,33 @@ def test_grid_nan():
assert ads3.size == 50, "only 50 valid values"
assert np.all(ads3 == a[idx3])
assert np.all(bds3 == b[idx3])
assert np.all(bds3 == b[:50])


def test_grid_nan_keep_all():
    # With `samples=0` and `remove_invalid=False`, downsampling is a
    # no-op: every event, including the nan-valued tail, comes back.
    xdata = np.arange(100, dtype=float)
    xdata[50:] = np.nan
    ydata = np.arange(50, 150, dtype=float)
    xds, yds, idx = downsampling.downsample_grid(a=xdata,
                                                 b=ydata,
                                                 samples=0,
                                                 ret_idx=True,
                                                 remove_invalid=False)
    assert np.allclose(xdata, xds, equal_nan=True)
    assert np.allclose(ydata, yds, equal_nan=True)


def test_grid_nan_keep_all_except_invalid():
    # With `samples=0` and `remove_invalid=True`, all finite events
    # are kept while the nan-valued tail is removed.
    xdata = np.arange(100, dtype=float)
    xdata[50:] = np.nan
    ydata = np.arange(50, 150, dtype=float)
    xds, yds, idx = downsampling.downsample_grid(a=xdata,
                                                 b=ydata,
                                                 samples=0,
                                                 ret_idx=True,
                                                 remove_invalid=True)
    assert np.allclose(xdata[:50], xds, equal_nan=True)
    assert np.allclose(ydata[:50], yds, equal_nan=True)


def test_rand_nan():
Expand Down

0 comments on commit 12c8326

Please sign in to comment.