diff --git a/CHANGELOG b/CHANGELOG
index 196aea30..86b85198 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -6,6 +6,9 @@
  - enh: slight improvement of managing manual indices in hierarchy children
  - enh: added dtype properties for contour and trace events
  - enh: ensure all feature data objects have the dtype property
+ - enh: globally use chunk sizes of ~1MiB when writing HDF5 data
+   (minor speedup, since previously a chunk size of 100 events was used
+   for images, and scalar features were written in one big chunk)
  - ref: new submodule for hierarchy format
 0.56.3
  - fix: regression missing check for basin availability
diff --git a/dclab/rtdc_dataset/export.py b/dclab/rtdc_dataset/export.py
index c9e0d5b2..1decdde5 100644
--- a/dclab/rtdc_dataset/export.py
+++ b/dclab/rtdc_dataset/export.py
@@ -25,7 +25,7 @@
 from .. import definitions as dfn
 from .._version import version
 
-from .writer import RTDCWriter, CHUNK_SIZE
+from .writer import RTDCWriter
 
 
 class LimitingExportSizeWarning(UserWarning):
@@ -411,14 +411,16 @@
     The dtype of the returned chunks is determined by the first
     item in `data`.
     """
+    chunk_shape = RTDCWriter.get_best_nd_chunks(item_shape=data.shape[1:],
+                                                item_dtype=data.dtype)
+    chunk_size = chunk_shape[0]
     # assemble filtered image stacks
-    data0 = data[0]
-    chunk_shape = tuple([CHUNK_SIZE] + list(data0.shape))
-    chunk = np.zeros(chunk_shape, dtype=data0.dtype)
+    chunk = np.zeros(chunk_shape, dtype=data.dtype)
+
     jj = 0
     for ii in indices:
         chunk[jj] = data[ii]
-        if (jj + 1) % CHUNK_SIZE == 0:
+        if (jj + 1) % chunk_size == 0:
             jj = 0
             yield chunk
         else:
diff --git a/dclab/rtdc_dataset/writer.py b/dclab/rtdc_dataset/writer.py
index e0e2ee3f..b81853f9 100644
--- a/dclab/rtdc_dataset/writer.py
+++ b/dclab/rtdc_dataset/writer.py
@@ -18,9 +18,12 @@
 from .feat_anc_plugin import PlugInFeature
 
 
-#: Chunk size for storing HDF5 data
+#: DEPRECATED (use `CHUNK_SIZE_BYTES` instead)
 CHUNK_SIZE = 100
 
+#: Chunk size in bytes for storing HDF5 datasets
+CHUNK_SIZE_BYTES = 1024**2  # 1MiB
+
 
 class RTDCWriter:
     def __init__(self,
@@ -100,6 +103,24 @@ def __exit__(self, type, value, tb):
         # This is guaranteed to run if any exception is raised.
         self.close()
 
+    @staticmethod
+    def get_best_nd_chunks(item_shape, item_dtype=np.float64):
+        """Return best chunks for HDF5 datasets
+
+        Chunking has performance implications. It's recommended to keep the
+        total size of dataset chunks between 10 KiB and 1 MiB. This number
+        defines the maximum chunk size as well as half the maximum cache
+        size for each dataset.
+        """
+        # Note that `np.prod(()) == 1`
+        event_size = np.prod(item_shape) * np.dtype(item_dtype).itemsize
+
+        chunk_size = CHUNK_SIZE_BYTES / event_size
+        # Set the minimum chunk size to 10 so that we can have at least some
+        # compression performance.
+        chunk_size_int = max(10, int(np.floor(chunk_size)))
+        return tuple([chunk_size_int] + list(item_shape))
+
     def close(self):
         """Close the underlying HDF5 file if a path was given during init"""
         if self.owns_path:
@@ -629,12 +650,9 @@
             (defaults to `data.dtype`)
         """
         if name not in group:
+            chunks = self.get_best_nd_chunks(item_shape=data.shape[1:],
+                                             item_dtype=data.dtype)
             maxshape = tuple([None] + list(data.shape)[1:])
-            if len(data.shape) == 1:
-                # no (or minimal) chunking for scalar data
-                chunks = max(len(data), CHUNK_SIZE)
-            else:
-                chunks = tuple([CHUNK_SIZE] + list(data.shape)[1:])
             dset = group.create_dataset(
                 name,
                 shape=data.shape,
@@ -672,17 +690,18 @@
                 mean = np.nanmean(dset)
                 dset.attrs["mean"] = mean
         else:
+            chunk_size = dset.chunks[0]
             # populate higher-dimensional data in chunks
             # (reduces file size, memory usage, and saves time)
-            num_chunks = len(data) // CHUNK_SIZE
+            num_chunks = len(data) // chunk_size
             for ii in range(num_chunks):
-                start = ii * CHUNK_SIZE
-                stop = start + CHUNK_SIZE
+                start = ii * chunk_size
+                stop = start + chunk_size
                 dset[offset+start:offset+stop] = data[start:stop]
             # write remainder (if applicable)
-            num_remain = len(data) % CHUNK_SIZE
+            num_remain = len(data) % chunk_size
             if num_remain:
-                start_e = num_chunks*CHUNK_SIZE
+                start_e = num_chunks * chunk_size
                 stop_e = start_e + num_remain
                 dset[offset+start_e:offset+stop_e] = data[start_e:stop_e]
         return dset
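For reference, the chunk shapes produced by the new RTDCWriter.get_best_nd_chunks
can be previewed with the short standalone sketch below. It only mirrors the logic
added to writer.py in this patch; the example feature shapes (scalar, image-like,
trace-like) are illustrative and not taken from the patch itself.

    # Standalone sketch (not part of the patch): mirrors the chunk-shape
    # logic of RTDCWriter.get_best_nd_chunks for a few illustrative features.
    import numpy as np

    CHUNK_SIZE_BYTES = 1024**2  # 1MiB, as defined in dclab/rtdc_dataset/writer.py

    def best_nd_chunks(item_shape, item_dtype=np.float64):
        # bytes per event; note that np.prod(()) == 1 for scalar features
        event_size = np.prod(item_shape) * np.dtype(item_dtype).itemsize
        # at least 10 events per chunk so compression has something to work with
        n_events = max(10, int(np.floor(CHUNK_SIZE_BYTES / event_size)))
        return tuple([n_events] + list(item_shape))

    print(best_nd_chunks(()))                    # scalar float64 -> (131072,)
    print(best_nd_chunks((80, 250), np.uint8))   # image-like     -> (52, 80, 250)
    print(best_nd_chunks((100,), np.float64))    # trace-like     -> (1310, 100)

Each resulting chunk stays at or just below 1MiB, which is what the CHANGELOG
entry above refers to.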