From 1c7895f9e0376aaf3aea3bd4a68ca99a79788c70 Mon Sep 17 00:00:00 2001 From: Ben Dichter Date: Mon, 10 Jul 2023 02:32:09 -0400 Subject: [PATCH] add can_read method to HDMFIO and HDF5IO (#875) Co-authored-by: Ryan Ly --- CHANGELOG.md | 4 ++-- src/hdmf/backends/hdf5/h5tools.py | 29 ++++++++++++++++++++--------- src/hdmf/backends/io.py | 7 +++++++ tests/unit/test_io_hdf5_h5tools.py | 14 ++++++++++++++ 4 files changed, 43 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dbbf3fb8e..fbb687ce9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,10 @@ # HDMF Changelog -## HMDF 3.7.0 (Upcoming) +## HDMF 3.7.0 (Upcoming) ### New features and minor improvements - Updated `ExternalResources` to have EntityKeyTable with updated tests/documentation and minor bug fix to ObjectKeyTable. @mavaylon1 [#872](https://github.com/hdmf-dev/hdmf/pull/872) +- Added abstract static method `HDMFIO.can_read()` and concrete static method `HDF5IO.can_read()`. @bendichter [#875](https://github.com/hdmf-dev/hdmf/pull/875) - Added warning for `DynamicTableRegion` links that are not added to the same parent as the original container object. @mavaylon1 [#891](https://github.com/hdmf-dev/hdmf/pull/891) - Added the `TermSet` class along with integrated validation methods for any child of `AbstractContainer`, e.g., `VectorData`, `Data`, `DynamicTable`. @mavaylon1 [#880](https://github.com/hdmf-dev/hdmf/pull/880) - Allow for `datetime.date` to be used instead of `datetime.datetime`. @bendichter [#874](https://github.com/hdmf-dev/hdmf/pull/874) @@ -11,7 +12,6 @@ - Dropped Python 3.7 support. @rly [#897](https://github.com/hdmf-dev/hdmf/pull/897) ### Documentation and tutorial enhancements: - - Added tutorial for the new `TermSet` class @mavaylon1 [#880](https://github.com/hdmf-dev/hdmf/pull/880) ## Bug fixes diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index 547d096d9..b331559bf 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -36,6 +36,17 @@ class HDF5IO(HDMFIO): __ns_spec_path = 'namespace' # path to the namespace dataset within a namespace group + @staticmethod + def can_read(path): + """Determines whether a given path is readable by the HDF5IO class""" + if not os.path.isfile(path): + return False + try: + with h5py.File(path, "r"): + return True + except IOError: + return False + @docval({'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None}, {'name': 'mode', 'type': str, 'doc': ('the mode to open the HDF5 file with, one of ("w", "r", "r+", "a", "w-", "x"). ' @@ -82,8 +93,8 @@ def __init__(self, **kwargs): self.__file = file_obj super().__init__(manager, source=path, external_resources_path=external_resources_path) # NOTE: source is not set if path is None and file_obj is passed - self.__built = dict() # keep track of each builder for each dataset/group/link for each file - self.__read = dict() # keep track of which files have been read. Key is the filename value is the builder + self.__built = dict() # keep track of each builder for each dataset/group/link for each file + self.__read = dict() # keep track of which files have been read. Key is the filename value is the builder self.__ref_queue = deque() # a queue of the references that need to be added self.__dci_queue = HDF5IODataChunkIteratorQueue() # a queue of DataChunkIterators that need to be exhausted ObjectMapper.no_convert(Dataset) @@ -603,7 +614,7 @@ def __read_group(self, h5obj, name=None, ignore=set()): builder = self.__read_dataset(target_obj, builder_name) else: builder = self.__read_group(target_obj, builder_name, ignore=ignore) - self.__set_built(sub_h5obj.file.filename, target_obj.id, builder) + self.__set_built(sub_h5obj.file.filename, target_obj.id, builder) link_builder = LinkBuilder(builder=builder, name=k, source=os.path.abspath(h5obj.file.filename)) link_builder.location = h5obj.name self.__set_written(link_builder) @@ -648,7 +659,7 @@ def __read_dataset(self, h5obj, name=None): name = str(os.path.basename(h5obj.name)) kwargs['source'] = os.path.abspath(h5obj.file.filename) ndims = len(h5obj.shape) - if ndims == 0: # read scalar + if ndims == 0: # read scalar scalar = h5obj[()] if isinstance(scalar, bytes): scalar = scalar.decode('UTF-8') @@ -678,7 +689,7 @@ def __read_dataset(self, h5obj, name=None): elif isinstance(elem1, Reference): d = BuilderH5ReferenceDataset(h5obj, self) kwargs['dtype'] = d.dtype - elif h5obj.dtype.kind == 'V': # table / compound data type + elif h5obj.dtype.kind == 'V': # table / compound data type cpd_dt = h5obj.dtype ref_cols = [check_dtype(ref=cpd_dt[i]) or check_dtype(vlen=cpd_dt[i]) for i in range(len(cpd_dt))] d = BuilderH5TableDataset(h5obj, self, ref_cols) @@ -708,7 +719,7 @@ def __compound_dtype_to_list(cls, h5obj_dtype, dset_dtype): def __read_attrs(self, h5obj): ret = dict() for k, v in h5obj.attrs.items(): - if k == SPEC_LOC_ATTR: # ignore cached spec + if k == SPEC_LOC_ATTR: # ignore cached spec continue if isinstance(v, RegionReference): raise ValueError("cannot read region reference attributes yet") @@ -925,14 +936,14 @@ def set_attributes(self, **kwargs): self.logger.debug("Setting %s '%s' attribute '%s' to %s" % (obj.__class__.__name__, obj.name, key, value.__class__.__name__)) obj.attrs[key] = value - elif isinstance(value, (Container, Builder, ReferenceBuilder)): # a reference + elif isinstance(value, (Container, Builder, ReferenceBuilder)): # a reference self.__queue_ref(self._make_attr_ref_filler(obj, key, value)) else: self.logger.debug("Setting %s '%s' attribute '%s' to %s" % (obj.__class__.__name__, obj.name, key, value.__class__.__name__)) if isinstance(value, np.ndarray) and value.dtype.kind == 'U': value = np.array(value, dtype=H5_TEXT) - obj.attrs[key] = value # a regular scalar + obj.attrs[key] = value # a regular scalar except Exception as e: msg = "unable to write attribute '%s' on object '%s'" % (key, obj.name) raise RuntimeError(msg) from e @@ -1079,7 +1090,7 @@ def write_dataset(self, **kwargs): # noqa: C901 name = builder.name data = builder.data dataio = None - options = dict() # dict with additional + options = dict() # dict with additional if isinstance(data, H5DataIO): options['io_settings'] = data.io_settings dataio = data diff --git a/src/hdmf/backends/io.py b/src/hdmf/backends/io.py index 4bf4f8ccf..5b1662cca 100644 --- a/src/hdmf/backends/io.py +++ b/src/hdmf/backends/io.py @@ -10,6 +10,13 @@ class HDMFIO(metaclass=ABCMeta): + + @staticmethod + @abstractmethod + def can_read(path): + """Determines whether a given path is readable by this HDMFIO class""" + pass + @docval({'name': 'manager', 'type': BuildManager, 'doc': 'the BuildManager to use for I/O', 'default': None}, {"name": "source", "type": (str, Path), diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py index 0585d2b99..d2ebbbc34 100644 --- a/tests/unit/test_io_hdf5_h5tools.py +++ b/tests/unit/test_io_hdf5_h5tools.py @@ -3228,6 +3228,10 @@ def test_non_manager_container(self): class OtherIO(HDMFIO): + @staticmethod + def can_read(path): + pass + def read_builder(self): pass @@ -3257,6 +3261,10 @@ def test_non_HDF5_src_link_data_true(self): class OtherIO(HDMFIO): + @staticmethod + def can_read(path): + pass + def __init__(self, manager): super().__init__(manager=manager) @@ -3570,3 +3578,9 @@ def test_dataio_shape_then_data(self): dataio = H5DataIO(shape=(10, 10), dtype=int) with self.assertRaisesRegex(ValueError, "Setting data when dtype and shape are not None is not supported"): dataio.data = list() + + +def test_hdf5io_can_read(): + assert not HDF5IO.can_read("not_a_file") + assert HDF5IO.can_read("tests/unit/back_compat_tests/1.0.5.h5") + assert not HDF5IO.can_read(__file__) # this file is not an HDF5 file