Merge branch 'main' into facemap to update videodatainterface
alessandratrapani committed Apr 17, 2024
2 parents 8ac9c47 + 3273e51 commit ff03d31
Showing 16 changed files with 382 additions and 89 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/dev-testing.yml
@@ -51,8 +51,10 @@ jobs:
run: pip install --no-cache-dir git+https://github.com/dandi/dandi-cli@master
- name: Dev gallery - PyNWB
run: pip install --no-cache-dir git+https://github.com/NeurodataWithoutBorders/pynwb@dev
- name: Dev gallery - ProbeInterface
run: pip install --no-cache-dir git+https://github.com/spikeinterface/probeinterface@main
- name: Dev gallery - SpikeInterface
run: pip install --no-cache-dir git+https://github.com/spikeinterface/spikeinterface@main
run: pip install --no-cache-dir "spikeinterface[test_core] @ git+https://github.com/spikeinterface/spikeinterface@main"
- name: Dev gallery - NEO
run: pip install --no-cache-dir git+https://github.com/NeuralEnsemble/python-neo@master
- name: Dev gallery - HDMF
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -1,12 +1,12 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
rev: v4.6.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/psf/black
rev: 24.3.0
rev: 24.4.0
hooks:
- id: black
exclude: ^docs/
10 changes: 8 additions & 2 deletions CHANGELOG.md
@@ -2,13 +2,19 @@

### Deprecations
* Removed `stream_id` as an argument from `IntanRecordingInterface` [PR #794](https://github.com/catalystneuro/neuroconv/pull/794)

### Bug fixes
* Fixed writing waveforms directly to file [PR #799](https://github.com/catalystneuro/neuroconv/pull/799)
* Avoid in-place modification of the metadata in the `VideoInterface` and in the neo tools. [PR #814](https://github.com/catalystneuro/neuroconv/pull/814)
* Replaced `waveform_extractor.is_extension` with `waveform_extractor.has_extension` [PR #799](https://github.com/catalystneuro/neuroconv/pull/799)

### Features
* Added `backend` control to the `make_or_load_nwbfile` helper method in `neuroconv.tools.nwb_helpers`. [PR #800](https://github.com/catalystneuro/neuroconv/pull/800)
* Added `get_default_backend_configuration` method to all `DataInterface` classes. Also added HDF5 `backend` control to all standalone `.run_conversion(...)` methods for those interfaces. [PR #801](https://github.com/catalystneuro/neuroconv/pull/801)
* Released the first official Docker images for the package on the GitHub Container Repository (GHCR). [PR #383](https://github.com/catalystneuro/neuroconv/pull/383)

### Testing
* Add general test for metadata in-place modification by interfaces. [PR #815](https://github.com/catalystneuro/neuroconv/pull/815)
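A minimal sketch of what such an in-place-modification check could look like, assuming generic `interface` and `nwbfile` fixtures (this is not the exact test added in PR #815):

```python
# Hedged sketch of a metadata in-place-modification check; the fixture names are assumptions.
from copy import deepcopy


def test_interface_does_not_mutate_metadata(interface, nwbfile):
    metadata = interface.get_metadata()
    metadata_before_add = deepcopy(metadata)

    interface.add_to_nwbfile(nwbfile=nwbfile, metadata=metadata)

    # The caller's metadata dictionary should be left untouched by the conversion.
    assert metadata == metadata_before_add
```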



2 changes: 1 addition & 1 deletion README.md
@@ -1,5 +1,5 @@
[![PyPI version](https://badge.fury.io/py/neuroconv.svg)](https://badge.fury.io/py/neuroconv.svg)
![Full Tests](https://github.com/catalystneuro/neuroconv/actions/workflows/testing.yml/badge.svg)
![Daily Tests](https://github.com/catalystneuro/neuroconv/actions/workflows/dailies.yml/badge.svg)
![Auto-release](https://github.com/catalystneuro/neuroconv/actions/workflows/auto-publish.yml/badge.svg)
[![codecov](https://codecov.io/github/catalystneuro/neuroconv/coverage.svg?branch=main)](https://codecov.io/github/catalystneuro/neuroconv?branch=main)
[![documentation](https://readthedocs.org/projects/neuroconv/badge/?version=main)](https://neuroconv.readthedocs.io/en/main/)
90 changes: 86 additions & 4 deletions src/neuroconv/basedatainterface.py
@@ -3,12 +3,19 @@
import warnings
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Optional, Tuple, Union
from typing import Literal, Optional, Tuple, Union

from jsonschema.validators import validate
from pynwb import NWBFile

from .tools.nwb_helpers import make_nwbfile_from_metadata, make_or_load_nwbfile
from .tools.nwb_helpers import (
HDF5BackendConfiguration,
ZarrBackendConfiguration,
configure_backend,
get_default_backend_configuration,
make_nwbfile_from_metadata,
make_or_load_nwbfile,
)
from .utils import (
NWBMetaDataEncoder,
get_schema_from_method_signature,
@@ -60,13 +67,43 @@ def validate_metadata(self, metadata: dict) -> None:
decoded_metadata = json.loads(serialized_metadata)
validate(instance=decoded_metadata, schema=self.get_metadata_schema())

def create_nwbfile(self, metadata=None, **conversion_options) -> NWBFile:
def create_nwbfile(self, metadata: Optional[dict] = None, **conversion_options) -> NWBFile:
"""
Create and return an in-memory pynwb.NWBFile object with this interface's data added to it.
Parameters
----------
metadata : dict, optional
Metadata dictionary with information used to create the NWBFile.
**conversion_options
Additional keyword arguments to pass to the `.add_to_nwbfile` method.
Returns
-------
nwbfile : pynwb.NWBFile
The in-memory object with this interface's data added to it.
"""
if metadata is None:
metadata = self.get_metadata()

nwbfile = make_nwbfile_from_metadata(metadata)
self.add_to_nwbfile(nwbfile, metadata=metadata, **conversion_options)
return nwbfile

@abstractmethod
def add_to_nwbfile(self, nwbfile: NWBFile, **conversion_options) -> None:
"""
Define a protocol for mapping the data from this interface to NWB neurodata objects.
These neurodata objects should also be added to the in-memory pynwb.NWBFile object in this step.
Parameters
----------
nwbfile : pynwb.NWBFile
The in-memory object to add the data to.
**conversion_options
Additional keyword arguments used by this interface when adding its data to the NWBFile.
"""
raise NotImplementedError

def run_conversion(
@@ -75,6 +112,11 @@ def run_conversion(
nwbfile: Optional[NWBFile] = None,
metadata: Optional[dict] = None,
overwrite: bool = False,
# TODO: when all H5DataIO prewraps are gone, introduce Zarr safely
# backend: Union[Literal["hdf5", "zarr"]],
# backend_configuration: Optional[Union[HDF5BackendConfiguration, ZarrBackendConfiguration]] = None,
backend: Literal["hdf5"] = "hdf5",
backend_configuration: Optional[HDF5BackendConfiguration] = None,
**conversion_options,
):
"""
@@ -92,13 +134,22 @@
overwrite : bool, default: False
Whether to overwrite the NWBFile if one exists at the nwbfile_path.
The default is False (append mode).
backend : "hdf5", default: "hdf5"
The type of backend to use when writing the file; only the HDF5 backend is currently supported here.
backend_configuration : HDF5BackendConfiguration, optional
The configuration model to use when configuring the datasets for this backend.
To customize, call the `.get_default_backend_configuration(...)` method, modify the returned
BackendConfiguration object, and pass that instead.
Otherwise, all datasets will use default configuration settings.
"""
if nwbfile_path is None:
warnings.warn( # TODO: remove on or after 6/21/2024
"Using DataInterface.run_conversion without specifying nwbfile_path is deprecated. To create an "
"NWBFile object in memory, use DataInterface.create_nwbfile. To append to an existing NWBFile object,"
" use DataInterface.add_to_nwbfile."
)
if backend_configuration is not None and nwbfile is None:
raise ValueError("When specifying a custom `backend_configuration`, you must also provide an `nwbfile`.")

if metadata is None:
metadata = self.get_metadata()
@@ -108,6 +159,37 @@
nwbfile=nwbfile,
metadata=metadata,
overwrite=overwrite,
backend=backend,
verbose=getattr(self, "verbose", False),
) as nwbfile_out:
if backend_configuration is None:
# No configuration was provided, so add the data here and derive a default configuration from the result;
# otherwise the provided nwbfile is assumed to already contain this interface's data
self.add_to_nwbfile(nwbfile_out, metadata=metadata, **conversion_options)

backend_configuration = self.get_default_backend_configuration(nwbfile=nwbfile_out, backend=backend)

configure_backend(nwbfile=nwbfile_out, backend_configuration=backend_configuration)

@staticmethod
def get_default_backend_configuration(
nwbfile: NWBFile,
# TODO: when all H5DataIO prewraps are gone, introduce Zarr safely
# backend: Union[Literal["hdf5", "zarr"]],
backend: Literal["hdf5"] = "hdf5",
) -> Union[HDF5BackendConfiguration, ZarrBackendConfiguration]:
"""
Fill and return a default backend configuration to serve as a starting point for further customization.
Parameters
----------
nwbfile : pynwb.NWBFile
The in-memory object with this interface's data already added to it.
backend : "hdf5" or "zarr", default: "hdf5"
The type of backend to use when creating the file.
Returns
-------
backend_configuration : HDF5BackendConfiguration or ZarrBackendConfiguration
The default configuration for the specified backend type.
"""
return get_default_backend_configuration(nwbfile=nwbfile, backend=backend)
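The customization workflow described in the `run_conversion` docstring above could look roughly like the following sketch, assuming `interface` is any instantiated `DataInterface`; the output path and the gzip tweak are placeholders:

```python
# Rough sketch of the backend-configuration workflow; `interface` and the paths are assumptions.
metadata = interface.get_metadata()
nwbfile = interface.create_nwbfile(metadata=metadata)

# Derive the default configuration from the populated in-memory NWBFile, then customize it.
backend_configuration = interface.get_default_backend_configuration(nwbfile=nwbfile, backend="hdf5")
for dataset_configuration in backend_configuration.dataset_configurations.values():
    dataset_configuration.compression_method = "gzip"  # illustrative tweak

# Passing a custom configuration requires passing the same NWBFile back in (see the check above).
interface.run_conversion(
    nwbfile_path="converted_session.nwb",
    nwbfile=nwbfile,
    metadata=metadata,
    overwrite=True,
    backend="hdf5",
    backend_configuration=backend_configuration,
)
```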
@@ -1,3 +1,4 @@
from copy import deepcopy
from pathlib import Path
from typing import List, Literal, Optional
from warnings import warn
@@ -303,9 +304,10 @@ def add_to_nwbfile(

file_paths = self.source_data["file_paths"]

videos_metadata = metadata.get("Behavior", dict()).get("Videos", None)
# Be sure to copy metadata at this step to avoid mutating in-place
videos_metadata = deepcopy(metadata).get("Behavior", dict()).get("Videos", None)
if videos_metadata is None:
videos_metadata = self.get_metadata()["Behavior"]["Videos"]
videos_metadata = deepcopy(self.get_metadata()["Behavior"]["Videos"])

assert len(videos_metadata) == self._number_of_files, (
"Incomplete metadata "
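With the deep copy in place, callers can pass their own `Behavior`/`Videos` metadata without it being altered; a hedged sketch (the file path and description are placeholders, and any timing alignment a real conversion needs is omitted):

```python
# Hedged sketch of supplying per-file video metadata; the video file is hypothetical.
from neuroconv.datainterfaces import VideoInterface

interface = VideoInterface(file_paths=["session_cam1.mp4"])
metadata = interface.get_metadata()

# The assertion above requires exactly one metadata entry per video file.
metadata["Behavior"]["Videos"][0].update(description="Overhead behavior camera.")

# The interface now works on a deep copy internally, so `metadata` is not mutated.
nwbfile = interface.create_nwbfile(metadata=metadata)
```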
@@ -1,3 +1,4 @@
from copy import deepcopy
from typing import List, Literal, Optional, Union

import numpy as np
@@ -307,16 +308,17 @@ def add_to_nwbfile(
"""
from ...tools.spikeinterface import add_sorting

metadata_copy = deepcopy(metadata)
if write_ecephys_metadata:
self.add_channel_metadata_to_nwb(nwbfile=nwbfile, metadata=metadata)
self.add_channel_metadata_to_nwb(nwbfile=nwbfile, metadata=metadata_copy)

if stub_test:
sorting_extractor = self.subset_sorting()
else:
sorting_extractor = self.sorting_extractor

property_descriptions = dict()
for metadata_column in metadata["Ecephys"].get("UnitProperties", []):
for metadata_column in metadata_copy["Ecephys"].get("UnitProperties", []):
property_descriptions.update({metadata_column["name"]: metadata_column["description"]})
for unit_id in sorting_extractor.get_unit_ids():
# Special condition for wrapping electrode group pointers to actual object ids rather than string names
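The `UnitProperties` metadata consumed above can be supplied as in this hedged sketch, assuming `interface` is an instantiated sorting interface and `nwbfile` an in-memory `pynwb.NWBFile`; the property name is illustrative:

```python
# Hedged sketch of UnitProperties metadata; `interface`, `nwbfile`, and the property are assumptions.
metadata = interface.get_metadata()
metadata.setdefault("Ecephys", dict())["UnitProperties"] = [
    dict(name="quality", description="Manual curation label assigned to each unit."),
]

# Each listed property becomes a described column on the NWB units table.
interface.add_to_nwbfile(nwbfile=nwbfile, metadata=metadata)
```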
25 changes: 13 additions & 12 deletions src/neuroconv/tools/neo/neo.py
@@ -159,17 +159,16 @@ def add_icephys_electrode(neo_reader, nwbfile, metadata: dict = None):
...
]
"""
metadata_copy = deepcopy(metadata) if metadata is not None else dict()

assert isinstance(nwbfile, pynwb.NWBFile), "'nwbfile' should be of type pynwb.NWBFile"

if len(nwbfile.devices) == 0:
warnings.warn("When adding Icephys Electrode, no Devices were found on nwbfile. Creating a Device now...")
add_device_from_metadata(nwbfile=nwbfile, modality="Icephys", metadata=metadata)

if metadata is None:
metadata = dict()
add_device_from_metadata(nwbfile=nwbfile, modality="Icephys", metadata=metadata_copy)

if "Icephys" not in metadata:
metadata["Icephys"] = dict()
if "Icephys" not in metadata_copy:
metadata_copy["Icephys"] = dict()

defaults = [
dict(
@@ -180,15 +179,15 @@
for elec_id in range(get_number_of_electrodes(neo_reader))
]

if "Electrodes" not in metadata["Icephys"] or len(metadata["Icephys"]["Electrodes"]) == 0:
metadata["Icephys"]["Electrodes"] = deepcopy(defaults)
if "Electrodes" not in metadata_copy["Icephys"] or len(metadata_copy["Icephys"]["Electrodes"]) == 0:
metadata_copy["Icephys"]["Electrodes"] = defaults

assert all(
[isinstance(x, dict) for x in metadata["Icephys"]["Electrodes"]]
[isinstance(x, dict) for x in metadata_copy["Icephys"]["Electrodes"]]
), "Expected metadata['Icephys']['Electrodes'] to be a list of dictionaries!"

# Create Icephys electrode from metadata
for elec in metadata["Icephys"]["Electrodes"]:
for elec in metadata_copy["Icephys"]["Electrodes"]:
if elec.get("name", defaults[0]["name"]) not in nwbfile.icephys_electrodes:
device_name = elec.pop("device_name", None) or elec.pop("device", defaults[0]["device_name"])
# elec.pop("device_name", 0)
@@ -286,8 +285,10 @@ def add_icephys_recordings(
else:
sessions_offset = 0

relative_session_start_time = metadata["Icephys"]["Sessions"][sessions_offset]["relative_session_start_time"]
session_stimulus_type = metadata["Icephys"]["Sessions"][sessions_offset]["stimulus_type"]
relative_session_start_time = deepcopy(
metadata["Icephys"]["Sessions"][sessions_offset]["relative_session_start_time"]
)
session_stimulus_type = deepcopy(metadata["Icephys"]["Sessions"][sessions_offset]["stimulus_type"])

# Sequential icephys recordings
simultaneous_recordings = list()
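A hedged sketch of the `Icephys` metadata these helpers read; only the keys visible in the diff above are assumed, and all values are placeholders:

```python
# Illustrative Icephys metadata matching the keys read above; every value is a placeholder.
metadata = dict(
    Icephys=dict(
        Electrodes=[
            dict(
                name="electrode-0",
                description="Patch-clamp pipette electrode.",
                device_name="DeviceIcephys",
            ),
        ],
        Sessions=[
            dict(relative_session_start_time=0.0, stimulus_type="square pulse"),
        ],
    ),
)
# Passed as the `metadata` argument of `add_icephys_electrode` / `add_icephys_recordings`,
# which now avoid modifying the caller's dictionary in place.
```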
4 changes: 3 additions & 1 deletion src/neuroconv/tools/nwb_helpers/_backend_configuration.py
@@ -12,7 +12,9 @@ def get_default_backend_configuration(
nwbfile: NWBFile, backend: Literal["hdf5", "zarr"]
) -> Union[HDF5BackendConfiguration, ZarrBackendConfiguration]:
"""Fill a default backend configuration to serve as a starting point for further customization."""
from ..nwb_helpers import BACKEND_CONFIGURATIONS
from ..nwb_helpers import (
BACKEND_CONFIGURATIONS, # Locally scoped to avoid circular import
)

BackendConfigurationClass = BACKEND_CONFIGURATIONS[backend]

@@ -260,9 +260,10 @@ def from_neurodata_object(cls, neurodata_object: Container, dataset_name: Litera
)
compression_method = "gzip"
elif dtype == np.dtype("object"): # Unclear what default chunking/compression should be for compound objects
chunk_shape = None
buffer_shape = None
compression_method = None
raise NotImplementedError(
f"Unable to create a `DatasetIOConfiguration` for the dataset at '{location_in_file}'"
f"for neurodata object '{neurodata_object}' of type '{type(neurodata_object)}'!"
)

return cls(
object_id=neurodata_object.object_id,
30 changes: 21 additions & 9 deletions src/neuroconv/tools/nwb_helpers/_configure_backend.py
@@ -12,9 +12,19 @@
def configure_backend(
nwbfile: NWBFile, backend_configuration: Union[HDF5BackendConfiguration, ZarrBackendConfiguration]
) -> None:
"""Configure all datasets specified in the `backend_configuration` with their appropriate DataIO and options."""
"""
Configure all datasets specified in the `backend_configuration` with their appropriate DataIO and options.
Parameters
----------
nwbfile : pynwb.NWBFile
The in-memory pynwb.NWBFile object to configure.
backend_configuration : HDF5BackendConfiguration or ZarrBackendConfiguration
The configuration model specifying the DataIO class and options to apply to each dataset for the chosen backend.
"""
nwbfile_objects = nwbfile.objects

# Set all DataIO based on the configuration
data_io_class = backend_configuration.data_io_class
for dataset_configuration in backend_configuration.dataset_configurations.values():
object_id = dataset_configuration.object_id
@@ -23,21 +33,23 @@ def configure_backend(

# TODO: update buffer shape in iterator, if present

nwbfile_object = nwbfile_objects[object_id]
is_dataset_linked = isinstance(nwbfile_object.fields.get(dataset_name), TimeSeries)
neurodata_object = nwbfile_objects[object_id]
is_dataset_linked = isinstance(neurodata_object.fields.get(dataset_name), TimeSeries)

# Table columns
if isinstance(nwbfile_object, Data):
nwbfile_object.set_data_io(data_io_class=data_io_class, data_io_kwargs=data_io_kwargs)
if isinstance(neurodata_object, Data):
neurodata_object.set_data_io(data_io_class=data_io_class, data_io_kwargs=data_io_kwargs)
# TimeSeries data or timestamps
elif isinstance(nwbfile_object, TimeSeries) and not is_dataset_linked:
nwbfile_object.set_data_io(
elif isinstance(neurodata_object, TimeSeries) and not is_dataset_linked:
neurodata_object.set_data_io(
dataset_name=dataset_name, data_io_class=data_io_class, data_io_kwargs=data_io_kwargs
)
# Skip the setting of a DataIO when target dataset is a link (assume it will be found in parent)
elif isinstance(nwbfile_object, TimeSeries) and is_dataset_linked:
elif isinstance(neurodata_object, TimeSeries) and is_dataset_linked:
continue
# Strictly speaking, it would be odd if a backend_configuration led to this, but might as well be safe
else:
raise NotImplementedError(
f"Unsupported object type {type(nwbfile_object)} for backend configuration of {nwbfile_object.name}!"
f"Unsupported object type {type(neurodata_object)} for backend configuration "
f"of {neurodata_object.name}!"
)
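Outside of a `DataInterface`, the same helpers compose directly; a self-contained sketch in which the example `NWBFile` contents are assumptions rather than anything from this commit:

```python
# Self-contained sketch of the standalone helper workflow; the NWBFile contents are illustrative.
from datetime import datetime

import numpy as np
from pynwb import NWBFile, TimeSeries

from neuroconv.tools.nwb_helpers import configure_backend, get_default_backend_configuration

nwbfile = NWBFile(
    session_description="demo session",
    identifier="demo-identifier",
    session_start_time=datetime.now().astimezone(),
)
nwbfile.add_acquisition(
    TimeSeries(name="raw_signal", data=np.random.rand(1_000, 4), unit="a.u.", rate=30_000.0)
)

# Derive per-dataset defaults, inspect or tweak them, then apply the DataIO wrapping.
backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend="hdf5")
print(backend_configuration.dataset_configurations)  # chunking/compression defaults per dataset

configure_backend(nwbfile=nwbfile, backend_configuration=backend_configuration)
```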