enh: globally use chunk sizes of ~1MiB when writing HDF5 data
paulmueller committed Jan 4, 2024
1 parent fcf9d8c commit 537fb58
Showing 3 changed files with 40 additions and 16 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG
@@ -6,6 +6,9 @@
- enh: slight improvement of managing manual indices in hierarchy children
- enh: added dtype properties for contour and trace events
- enh: ensure all feature data objects have the dtype property
- enh: globally use chunk sizes of ~1MiB when writing HDF5 data
(minor speedup, since previously a chunk size of 100 events was used
for images, and scalar features were written in one big chunk)
- ref: new submodule for hierarchy format
0.56.3
- fix: regression missing check for basin availability
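For readers less familiar with HDF5 chunking, the following minimal sketch illustrates what the new CHANGELOG entry above means at the h5py level; the file name, feature path, and the 80x250 uint8 image shape are chosen only for illustration and are not part of the commit.

    import h5py
    import numpy as np

    # An 80x250 uint8 image event occupies 20000 bytes, so ~1 MiB corresponds
    # to roughly 52 events per chunk instead of the previous fixed 100 events.
    with h5py.File("example.rtdc", "w") as h5:
        h5.create_dataset(
            "events/image",
            shape=(0, 80, 250),        # grows as events are appended
            maxshape=(None, 80, 250),  # unlimited along the event axis
            dtype=np.uint8,
            chunks=(52, 80, 250),      # ~1 MiB chunks (new behavior)
        )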
12 changes: 7 additions & 5 deletions dclab/rtdc_dataset/export.py
@@ -25,7 +25,7 @@

from .. import definitions as dfn
from .._version import version
from .writer import RTDCWriter, CHUNK_SIZE
from .writer import RTDCWriter


class LimitingExportSizeWarning(UserWarning):
@@ -411,14 +411,16 @@ def yield_filtered_array_stacks(data, indices):
The dtype of the returned chunks is determined by the first
item in `data`.
"""
chunk_shape = RTDCWriter.get_best_nd_chunks(item_shape=data.shape[1:],
item_dtype=data.dtype)
chunk_size = chunk_shape[0]
# assemble filtered image stacks
data0 = data[0]
chunk_shape = tuple([CHUNK_SIZE] + list(data0.shape))
chunk = np.zeros(chunk_shape, dtype=data0.dtype)
chunk = np.zeros(chunk_shape, dtype=data.dtype)

jj = 0
for ii in indices:
chunk[jj] = data[ii]
if (jj + 1) % CHUNK_SIZE == 0:
if (jj + 1) % chunk_size == 0:
jj = 0
yield chunk
else:
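To see the generator's chunk-filling pattern in isolation, here is a toy reimplementation with made-up data; the remainder handling at the end is an assumption, since the tail of yield_filtered_array_stacks is not shown in this hunk.

    import numpy as np

    def yield_stacks(data, indices, chunk_size):
        # same pattern as above: reuse one buffer and yield it whenever it is full
        chunk = np.zeros((chunk_size,) + data.shape[1:], dtype=data.dtype)
        jj = 0
        for ii in indices:
            chunk[jj] = data[ii]
            if (jj + 1) % chunk_size == 0:
                jj = 0
                yield chunk
            else:
                jj += 1
        if jj:  # assumed: yield the partially filled remainder
            yield chunk[:jj]

    data = np.arange(20.0).reshape(10, 2)   # 10 events with 2 values each
    indices = [0, 2, 4, 6, 8]               # indices that passed the filter
    for stack in yield_stacks(data, indices, chunk_size=2):
        print(stack.shape)                  # (2, 2), (2, 2), then (1, 2)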
41 changes: 30 additions & 11 deletions dclab/rtdc_dataset/writer.py
@@ -18,9 +18,12 @@

from .feat_anc_plugin import PlugInFeature

#: Chunk size for storing HDF5 data
#: DEPRECATED (use `CHUNK_SIZE_BYTES` instead)
CHUNK_SIZE = 100

#: Chunk size in bytes for storing HDF5 datasets
CHUNK_SIZE_BYTES = 1024**2 # 1MiB


class RTDCWriter:
def __init__(self,
@@ -100,6 +103,24 @@ def __exit__(self, type, value, tb):
# This is guaranteed to run if any exception is raised.
self.close()

@staticmethod
def get_best_nd_chunks(item_shape, item_dtype=np.float64):
"""Return best chunks for HDF5 datasets
Chunking has performance implications. It’s recommended to keep the
total size of dataset chunks between 10 KiB and 1 MiB. This number
defines the maximum chunk size as well as half the maximum cache
size for each dataset.
"""
# Note that `np.prod(()) == 1`
event_size = np.prod(item_shape) * np.dtype(item_dtype).itemsize

chunk_size = CHUNK_SIZE_BYTES / event_size
# Set minimum chunk size to 10 so that we can have at least some
# compression performance.
chunk_size_int = max(10, int(np.floor(chunk_size)))
return tuple([chunk_size_int] + list(item_shape))

def close(self):
"""Close the underlying HDF5 file if a path was given during init"""
if self.owns_path:
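As a worked example, the standalone sketch below mirrors the arithmetic of the new get_best_nd_chunks helper introduced above and prints the chunk shapes it would produce; the example feature shapes are arbitrary and not taken from the commit.

    import numpy as np

    CHUNK_SIZE_BYTES = 1024**2  # 1 MiB, as in the diff above

    def best_nd_chunks(item_shape, item_dtype=np.float64):
        # bytes per event; np.prod(()) == 1 covers scalar features
        event_size = np.prod(item_shape) * np.dtype(item_dtype).itemsize
        chunk_size_int = max(10, int(np.floor(CHUNK_SIZE_BYTES / event_size)))
        return tuple([chunk_size_int] + list(item_shape))

    print(best_nd_chunks(()))                   # (131072,)        scalar float64
    print(best_nd_chunks((80, 250), np.uint8))  # (52, 80, 250)    image data
    print(best_nd_chunks((2048, 2048)))         # (10, 2048, 2048) lower bound of 10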
@@ -629,12 +650,9 @@ def write_ndarray(self, group, name, data, dtype=None):
(defaults to `data.dtype`)
"""
if name not in group:
chunks = self.get_best_nd_chunks(item_shape=data.shape[1:],
item_dtype=data.dtype)
maxshape = tuple([None] + list(data.shape)[1:])
if len(data.shape) == 1:
# no (or minimal) chunking for scalar data
chunks = max(len(data), CHUNK_SIZE)
else:
chunks = tuple([CHUNK_SIZE] + list(data.shape)[1:])
dset = group.create_dataset(
name,
shape=data.shape,
@@ -672,17 +690,18 @@ def write_ndarray(self, group, name, data, dtype=None):
mean = np.nanmean(dset)
dset.attrs["mean"] = mean
else:
chunk_size = dset.chunks[0]
# populate higher-dimensional data in chunks
# (reduces file size, memory usage, and saves time)
num_chunks = len(data) // CHUNK_SIZE
num_chunks = len(data) // chunk_size
for ii in range(num_chunks):
start = ii * CHUNK_SIZE
stop = start + CHUNK_SIZE
start = ii * chunk_size
stop = start + chunk_size
dset[offset+start:offset+stop] = data[start:stop]
# write remainder (if applicable)
num_remain = len(data) % CHUNK_SIZE
num_remain = len(data) % chunk_size
if num_remain:
start_e = num_chunks*CHUNK_SIZE
start_e = num_chunks * chunk_size
stop_e = start_e + num_remain
dset[offset+start_e:offset+stop_e] = data[start_e:stop_e]
return dset
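For reference, this self-contained sketch reproduces the chunk-aligned append pattern used above against a plain h5py dataset; the file name, feature path, and the 250000-event array are invented for the example, and offset plays the role of the already-written event count.

    import h5py
    import numpy as np

    data = np.random.default_rng(0).random(250_000)  # new scalar events (float64)
    with h5py.File("example.rtdc", "w") as h5:
        dset = h5.create_dataset("events/deform", shape=(0,), maxshape=(None,),
                                 dtype=np.float64, chunks=(131072,))
        offset = dset.shape[0]                  # events already stored (0 here)
        dset.resize(offset + len(data), axis=0)
        chunk_size = dset.chunks[0]             # 131072 float64 events ~ 1 MiB
        num_chunks = len(data) // chunk_size    # 1 full chunk
        for ii in range(num_chunks):
            start = ii * chunk_size
            stop = start + chunk_size
            dset[offset + start:offset + stop] = data[start:stop]
        num_remain = len(data) % chunk_size     # 118928 leftover events
        if num_remain:
            start_e = num_chunks * chunk_size
            dset[offset + start_e:offset + start_e + num_remain] = data[start_e:]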
