enh: globally use chunk sizes of ~1MiB when writing HDF5 data
paulmueller committed Jan 4, 2024
1 parent fcf9d8c commit 537fb58
Showing 3 changed files with 40 additions and 16 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG
@@ -6,6 +6,9 @@
- enh: slight improvement of managing manual indices in hierarchy children
- enh: added dtype properties for contour and trace events
- enh: ensure all feature data objects have the dtype property
- enh: globally use chunk sizes of ~1MiB when writing HDF5 data
(minor speedup, since previously a chunk size of 100 events was used
for images, and scalar features were written in one big chunk)
- ref: new submodule for hierarchy format
0.56.3
- fix: regression missing check for basin availability
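For readers less familiar with HDF5 chunking, the following minimal sketch illustrates what the new CHANGELOG entry above means at the h5py level; the file name, feature path, and the 80x250 uint8 image shape are chosen only for illustration and are not part of the commit.

    import h5py
    import numpy as np

    # An 80x250 uint8 image event occupies 20000 bytes, so ~1 MiB corresponds
    # to roughly 52 events per chunk instead of the previous fixed 100 events.
    with h5py.File("example.rtdc", "w") as h5:
        h5.create_dataset(
            "events/image",
            shape=(0, 80, 250),        # grows as events are appended
            maxshape=(None, 80, 250),  # unlimited along the event axis
            dtype=np.uint8,
            chunks=(52, 80, 250),      # ~1 MiB chunks (new behavior)
        )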
12 changes: 7 additions & 5 deletions dclab/rtdc_dataset/export.py
@@ -25,7 +25,7 @@

from .. import definitions as dfn
from .._version import version
from .writer import RTDCWriter, CHUNK_SIZE
from .writer import RTDCWriter


class LimitingExportSizeWarning(UserWarning):
@@ -411,14 +411,16 @@ def yield_filtered_array_stacks(data, indices):
The dtype of the returned chunks is determined by the first
item in `data`.
"""
chunk_shape = RTDCWriter.get_best_nd_chunks(item_shape=data.shape[1:],
item_dtype=data.dtype)
chunk_size = chunk_shape[0]
# assemble filtered image stacks
data0 = data[0]
chunk_shape = tuple([CHUNK_SIZE] + list(data0.shape))
chunk = np.zeros(chunk_shape, dtype=data0.dtype)
chunk = np.zeros(chunk_shape, dtype=data.dtype)

jj = 0
for ii in indices:
chunk[jj] = data[ii]
if (jj + 1) % CHUNK_SIZE == 0:
if (jj + 1) % chunk_size == 0:
jj = 0
yield chunk
else:
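To see the generator's chunk-filling pattern in isolation, here is a toy reimplementation with made-up data; the remainder handling at the end is an assumption, since the tail of yield_filtered_array_stacks is not shown in this hunk.

    import numpy as np

    def yield_stacks(data, indices, chunk_size):
        # same pattern as above: reuse one buffer and yield it whenever it is full
        chunk = np.zeros((chunk_size,) + data.shape[1:], dtype=data.dtype)
        jj = 0
        for ii in indices:
            chunk[jj] = data[ii]
            if (jj + 1) % chunk_size == 0:
                jj = 0
                yield chunk
            else:
                jj += 1
        if jj:  # assumed: yield the partially filled remainder
            yield chunk[:jj]

    data = np.arange(20.0).reshape(10, 2)   # 10 events with 2 values each
    indices = [0, 2, 4, 6, 8]               # indices that passed the filter
    for stack in yield_stacks(data, indices, chunk_size=2):
        print(stack.shape)                  # (2, 2), (2, 2), then (1, 2)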
41 changes: 30 additions & 11 deletions dclab/rtdc_dataset/writer.py
@@ -18,9 +18,12 @@

from .feat_anc_plugin import PlugInFeature

#: Chunk size for storing HDF5 data
#: DEPRECATED (use `CHUNK_SIZE_BYTES` instead)
CHUNK_SIZE = 100

#: Chunk size in bytes for storing HDF5 datasets
CHUNK_SIZE_BYTES = 1024**2 # 1MiB


class RTDCWriter:
def __init__(self,
@@ -100,6 +103,24 @@ def __exit__(self, type, value, tb):
# This is guaranteed to run if any exception is raised.
self.close()

@staticmethod
def get_best_nd_chunks(item_shape, item_dtype=np.float64):
"""Return best chunks for HDF5 datasets
Chunking has performance implications. It’s recommended to keep the
total size of dataset chunks between 10 KiB and 1 MiB. This number
defines the maximum chunk size as well as half the maximum cache
size for each dataset.
"""
# Note that `np.prod(()) == 1`
event_size = np.prod(item_shape) * np.dtype(item_dtype).itemsize

chunk_size = CHUNK_SIZE_BYTES / event_size
# Set minimum chunk size to 10 so that we can have at least some
# compression performance.
chunk_size_int = max(10, int(np.floor(chunk_size)))
return tuple([chunk_size_int] + list(item_shape))

def close(self):
"""Close the underlying HDF5 file if a path was given during init"""
if self.owns_path:
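As a worked example, the standalone sketch below mirrors the arithmetic of the new get_best_nd_chunks helper introduced above and prints the chunk shapes it would produce; the example feature shapes are arbitrary and not taken from the commit.

    import numpy as np

    CHUNK_SIZE_BYTES = 1024**2  # 1 MiB, as in the diff above

    def best_nd_chunks(item_shape, item_dtype=np.float64):
        # bytes per event; np.prod(()) == 1 covers scalar features
        event_size = np.prod(item_shape) * np.dtype(item_dtype).itemsize
        chunk_size_int = max(10, int(np.floor(CHUNK_SIZE_BYTES / event_size)))
        return tuple([chunk_size_int] + list(item_shape))

    print(best_nd_chunks(()))                   # (131072,)        scalar float64
    print(best_nd_chunks((80, 250), np.uint8))  # (52, 80, 250)    image data
    print(best_nd_chunks((2048, 2048)))         # (10, 2048, 2048) lower bound of 10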
@@ -629,12 +650,9 @@ def write_ndarray(self, group, name, data, dtype=None):
(defaults to `data.dtype`)
"""
if name not in group:
chunks = self.get_best_nd_chunks(item_shape=data.shape[1:],
item_dtype=data.dtype)
maxshape = tuple([None] + list(data.shape)[1:])
if len(data.shape) == 1:
# no (or minimal) chunking for scalar data
chunks = max(len(data), CHUNK_SIZE)
else:
chunks = tuple([CHUNK_SIZE] + list(data.shape)[1:])
dset = group.create_dataset(
name,
shape=data.shape,
@@ -672,17 +690,18 @@ def write_ndarray(self, group, name, data, dtype=None):
mean = np.nanmean(dset)
dset.attrs["mean"] = mean
else:
chunk_size = dset.chunks[0]
# populate higher-dimensional data in chunks
# (reduces file size, memory usage, and saves time)
num_chunks = len(data) // CHUNK_SIZE
num_chunks = len(data) // chunk_size
for ii in range(num_chunks):
start = ii * CHUNK_SIZE
stop = start + CHUNK_SIZE
start = ii * chunk_size
stop = start + chunk_size
dset[offset+start:offset+stop] = data[start:stop]
# write remainder (if applicable)
num_remain = len(data) % CHUNK_SIZE
num_remain = len(data) % chunk_size
if num_remain:
start_e = num_chunks*CHUNK_SIZE
start_e = num_chunks * chunk_size
stop_e = start_e + num_remain
dset[offset+start_e:offset+stop_e] = data[start_e:stop_e]
return dset
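For reference, this self-contained sketch reproduces the chunk-aligned append pattern used above against a plain h5py dataset; the file name, feature path, and the 250000-event array are invented for the example, and offset plays the role of the already-written event count.

    import h5py
    import numpy as np

    data = np.random.default_rng(0).random(250_000)  # new scalar events (float64)
    with h5py.File("example.rtdc", "w") as h5:
        dset = h5.create_dataset("events/deform", shape=(0,), maxshape=(None,),
                                 dtype=np.float64, chunks=(131072,))
        offset = dset.shape[0]                  # events already stored (0 here)
        dset.resize(offset + len(data), axis=0)
        chunk_size = dset.chunks[0]             # 131072 float64 events ~ 1 MiB
        num_chunks = len(data) // chunk_size    # 1 full chunk
        for ii in range(num_chunks):
            start = ii * chunk_size
            stop = start + chunk_size
            dset[offset + start:offset + stop] = data[start:stop]
        num_remain = len(data) % chunk_size     # 118928 leftover events
        if num_remain:
            start_e = num_chunks * chunk_size
            dset[offset + start_e:offset + start_e + num_remain] = data[start_e:]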
