diff --git a/.circleci/config.yml b/.circleci/config.yml index 811bcb18a..7eac6cf0b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -20,6 +20,8 @@ references: ci-steps: &ci-steps steps: - checkout + - run: git submodule sync + - run: git submodule update --init - run: <<: *initialize-venv - run: @@ -65,6 +67,8 @@ references: conda-steps: &conda-steps steps: - checkout + - run: git submodule sync + - run: git submodule update --init - run: name: Configure conda command: | @@ -89,6 +93,8 @@ references: gallery-steps: &gallery-steps steps: - checkout + - run: git submodule sync + - run: git submodule update --init - restore_cache: keys: - ophys-data-cache diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..6d4875fc1 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "src/hdmf/common/hdmf-common-schema"] + path = src/hdmf/common/hdmf-common-schema + url = https://github.com/hdmf-dev/hdmf-common-schema.git diff --git a/Makefile b/Makefile index 14bcd8418..5c4d20cc1 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ flake: $(FLAKE) tests/ checkpdb: - find {src,tests} -name "[a-z]*.py" -exec grep -Hn -e pdb -e print -e breakpoint {} \; + find {src,tests} -name "[a-z]*.py" -exec grep -Hn -e pdb -e print\( -e breakpoint {} \; devtest: $(PYTHON) test.py diff --git a/requirements.txt b/requirements.txt index d31d1c642..7a552eb3f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ chardet==3.0.4 h5py==2.9.0 numpy==1.17.0 +scipy==1.3.1 pandas==0.25.0 python-dateutil==2.8.0 ruamel.yaml==0.16.0 diff --git a/setup.py b/setup.py index 564965043..15acc3fcb 100755 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ pkgs = find_packages('src', exclude=['data']) print('found these packages:', pkgs) -schema_dir = 'data' +schema_dir = 'common/hdmf-common-schema/common' setup_args = { 'name': 'hdmf', @@ -35,6 +35,7 @@ ], 'packages': pkgs, 'package_dir': {'': 'src'}, + 'package_data': {'hdmf': ["%s/*.yaml" % schema_dir, "%s/*.json" % schema_dir]}, 'classifiers': [ "Programming Language :: Python", "Programming Language :: Python :: 3.5", diff --git a/src/hdmf/__init__.py b/src/hdmf/__init__.py index d5c42bd4e..4dbdb7a71 100644 --- a/src/hdmf/__init__.py +++ b/src/hdmf/__init__.py @@ -1,8 +1,8 @@ +from . import query # noqa: F401 from .container import Container, Data, DataRegion from .utils import docval, getargs from .data_utils import ListSlicer from .backends.hdf5.h5_utils import H5RegionSlicer, H5Dataset -from . 
import query @docval({'name': 'dataset', 'type': None, 'doc': 'the HDF5 dataset to slice'}, diff --git a/src/hdmf/backends/hdf5/h5_utils.py b/src/hdmf/backends/hdf5/h5_utils.py index 71f66540c..574e7a0bb 100644 --- a/src/hdmf/backends/hdf5/h5_utils.py +++ b/src/hdmf/backends/hdf5/h5_utils.py @@ -160,6 +160,7 @@ def __init__(self, **kwargs): self.__group = getargs('group', kwargs) super_kwargs = {'source': "%s:%s" % (os.path.abspath(self.__group.file.name), self.__group.name)} call_docval_func(super(H5SpecReader, self).__init__, super_kwargs) + self.__cache = None def __read(self, path): s = self.__group[path][()] @@ -177,8 +178,9 @@ def read_spec(self, spec_path): return self.__read(spec_path) def read_namespace(self, ns_path): - ret = self.__read(ns_path) - ret = ret['namespaces'] + if self.__cache is None: + self.__cache = self.__read(ns_path) + ret = self.__cache['namespaces'] return ret diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index 64f93e398..30e462371 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -104,15 +104,59 @@ def load_namespaces(cls, namespace_catalog, path, namespaces=None): if namespaces is None: namespaces = list(spec_group.keys()) + readers = dict() + deps = dict() for ns in namespaces: ns_group = spec_group[ns] latest_version = list(ns_group.keys())[-1] ns_group = ns_group[latest_version] reader = H5SpecReader(ns_group) + readers[ns] = reader + for spec_ns in reader.read_namespace('namespace'): + deps[ns] = list() + for s in spec_ns['schema']: + dep = s.get('namespace') + if dep is not None: + deps[ns].append(dep) + + order = cls._order_deps(deps) + for ns in order: + reader = readers[ns] d.update(namespace_catalog.load_namespaces('namespace', reader=reader)) return d + @classmethod + def _order_deps(cls, deps): + """ + Order namespaces according to dependency for loading into a NamespaceCatalog + + Args: + deps (dict): a dictionary that maps a namespace name to a list of name of + the namespaces on which the the namespace is directly dependent + Example: {'a': ['b', 'c'], 'b': ['d'], c: ['d'], 'd': []} + Expected output: ['d', 'b', 'c', 'a'] + """ + order = list() + keys = list(deps.keys()) + deps = dict(deps) + for k in keys: + if k in deps: + cls.__order_deps_aux(order, deps, k) + return order + + @classmethod + def __order_deps_aux(cls, order, deps, key): + """ + A recursive helper function for _order_deps + """ + if key not in deps: + return + subdeps = deps.pop(key) + for subk in subdeps: + cls.__order_deps_aux(order, deps, subk) + order.append(key) + @classmethod def __convert_namespace(cls, ns_catalog, namespace): ns = ns_catalog.get_namespace(namespace) diff --git a/src/hdmf/build/map.py b/src/hdmf/build/map.py index f8ce8d790..9b5561277 100644 --- a/src/hdmf/build/map.py +++ b/src/hdmf/build/map.py @@ -3,12 +3,12 @@ import numpy as np import warnings from collections import OrderedDict -from copy import copy +from copy import copy, deepcopy from datetime import datetime from six import with_metaclass, raise_from, text_type, binary_type, integer_types from ..utils import docval, getargs, ExtenderMeta, get_docval, fmt_docval_args, call_docval_func -from ..container import Container, Data, DataRegion +from ..container import AbstractContainer, Container, Data, DataRegion from ..spec import Spec, AttributeSpec, DatasetSpec, GroupSpec, LinkSpec, NAME_WILDCARD, NamespaceCatalog, RefSpec,\ SpecReader from ..data_utils import DataIO, AbstractDataChunkIterator @@ -90,7 +90,7 @@ def 
__repr__(self): class BuildManager(object): """ - A class for managing builds of Containers + A class for managing builds of AbstractContainers """ def __init__(self, type_map): @@ -106,14 +106,15 @@ def namespace_catalog(self): def type_map(self): return self.__type_map - @docval({"name": "object", "type": (BaseBuilder, Container), "doc": "the container or builder to get a proxy for"}, + @docval({"name": "object", "type": (BaseBuilder, AbstractContainer), + "doc": "the container or builder to get a proxy for"}, {"name": "source", "type": str, "doc": "the source of container being built i.e. file path", 'default': None}) def get_proxy(self, **kwargs): obj = getargs('object', kwargs) if isinstance(obj, BaseBuilder): return self.__get_proxy_builder(obj) - elif isinstance(obj, Container): + elif isinstance(obj, AbstractContainer): return self.__get_proxy_container(obj) def __get_proxy_builder(self, builder): @@ -141,13 +142,13 @@ def __get_proxy_container(self, container): loc = "/".join(reversed(stack)) return Proxy(self, container.container_source, loc, ns, dt) - @docval({"name": "container", "type": Container, "doc": "the container to convert to a Builder"}, + @docval({"name": "container", "type": AbstractContainer, "doc": "the container to convert to a Builder"}, {"name": "source", "type": str, "doc": "the source of container being built i.e. file path", 'default': None}, {"name": "spec_ext", "type": BaseStorageSpec, "doc": "a spec that further refines the base specificatoin", 'default': None}) def build(self, **kwargs): - """ Build the GroupBuilder for the given Container""" + """ Build the GroupBuilder for the given AbstractContainer""" container = getargs('container', kwargs) container_id = self.__conthash__(container) result = self.__builders.get(container_id) @@ -170,11 +171,11 @@ def build(self, **kwargs): result = self.__type_map.build(container, self, builder=result, source=source, spec_ext=spec_ext) return result - @docval({"name": "container", "type": Container, "doc": "the Container to save as prebuilt"}, + @docval({"name": "container", "type": AbstractContainer, "doc": "the AbstractContainer to save as prebuilt"}, {'name': 'builder', 'type': (DatasetBuilder, GroupBuilder), 'doc': 'the Builder representation of the given container'}) def prebuilt(self, **kwargs): - ''' Save the Builder for a given Container for future use ''' + ''' Save the Builder for a given AbstractContainer for future use ''' container, builder = getargs('container', 'builder', kwargs) container_id = self.__conthash__(container) self.__builders[container_id] = builder @@ -188,9 +189,9 @@ def __bldrhash__(self, obj): return id(obj) @docval({'name': 'builder', 'type': (DatasetBuilder, GroupBuilder), - 'doc': 'the builder to construct the Container from'}) + 'doc': 'the builder to construct the AbstractContainer from'}) def construct(self, **kwargs): - """ Construct the Container represented by the given builder """ + """ Construct the AbstractContainer represented by the given builder """ builder = getargs('builder', kwargs) if isinstance(builder, LinkBuilder): builder = builder.target @@ -240,7 +241,7 @@ def get_cls(self, **kwargs): builder = getargs('builder', kwargs) return self.__type_map.get_cls(builder) - @docval({"name": "container", "type": Container, "doc": "the container to convert to a Builder"}, + @docval({"name": "container", "type": AbstractContainer, "doc": "the container to convert to a Builder"}, returns='The name a Builder should be given when building this container', rtype=str) def 
get_builder_name(self, **kwargs): ''' Get the name a Builder should be given ''' @@ -285,9 +286,9 @@ def _constructor_arg(**kwargs): '''Decorator to override the default mapping scheme for a given constructor argument. Decorate ObjectMapper methods with this function when extending ObjectMapper to override the default - scheme for mapping between Container and Builder objects. The decorated method should accept as its + scheme for mapping between AbstractContainer and Builder objects. The decorated method should accept as its first argument the Builder object that is being mapped. The method should return the value to be passed - to the target Container class constructor argument given by *name*. + to the target AbstractContainer class constructor argument given by *name*. ''' name = getargs('name', kwargs) @@ -306,8 +307,8 @@ def _object_attr(**kwargs): '''Decorator to override the default mapping scheme for a given object attribute. Decorate ObjectMapper methods with this function when extending ObjectMapper to override the default - scheme for mapping between Container and Builder objects. The decorated method should accept as its - first argument the Container object that is being mapped. The method should return the child Builder + scheme for mapping between AbstractContainer and Builder objects. The decorated method should accept as its + first argument the AbstractContainer object that is being mapped. The method should return the child Builder object (or scalar if the object attribute corresponds to an AttributeSpec) that represents the attribute given by *name*. ''' @@ -344,7 +345,7 @@ def _ascii(s): class ObjectMapper(with_metaclass(ExtenderMeta, object)): - '''A class for mapping between Spec objects and Container attributes + '''A class for mapping between Spec objects and AbstractContainer attributes ''' @@ -505,9 +506,9 @@ def constructor_arg(**kwargs): '''Decorator to override the default mapping scheme for a given constructor argument. Decorate ObjectMapper methods with this function when extending ObjectMapper to override the default - scheme for mapping between Container and Builder objects. The decorated method should accept as its + scheme for mapping between AbstractContainer and Builder objects. The decorated method should accept as its first argument the Builder object that is being mapped. The method should return the value to be passed - to the target Container class constructor argument given by *name*. + to the target AbstractContainer class constructor argument given by *name*. ''' name = getargs('name', kwargs) return _constructor_arg(name) @@ -521,8 +522,8 @@ def object_attr(**kwargs): '''Decorator to override the default mapping scheme for a given object attribute. Decorate ObjectMapper methods with this function when extending ObjectMapper to override the default - scheme for mapping between Container and Builder objects. The decorated method should accept as its - first argument the Container object that is being mapped. The method should return the child Builder + scheme for mapping between AbstractContainer and Builder objects. The decorated method should accept as its + first argument the AbstractContainer object that is being mapped. The method should return the child Builder object (or scalar if the object attribute corresponds to an AttributeSpec) that represents the attribute given by *name*. 
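# --- illustrative sketch (not part of this patch) ---------------------------
# How the constructor_arg/object_attr decorators described above are used when
# extending ObjectMapper. 'SubjectsTableMap' is a hypothetical mapper class;
# the decorator API and the (self, builder/container, manager) signatures
# follow the docstrings above and the DynamicTableMap added later in this diff.
from hdmf.build import ObjectMapper

class SubjectsTableMap(ObjectMapper):

    @ObjectMapper.constructor_arg('name')
    def carg_name(self, builder, manager):
        # receives the Builder being mapped; the return value is passed to the
        # mapped class's 'name' constructor argument
        return builder.name

    @ObjectMapper.object_attr('colnames')
    def obj_colnames(self, container, manager):
        # receives the container being mapped; the return value is stored for
        # the spec field named 'colnames'
        return tuple(container.colnames)
# ---------------------------------------------------------------------------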
''' @@ -564,7 +565,7 @@ def __gather_procedures(cls, name, bases, classdict): @docval({'name': 'spec', 'type': (DatasetSpec, GroupSpec), 'doc': 'The specification for mapping objects to builders'}) def __init__(self, **kwargs): - """ Create a map from Container attributes to specifications """ + """ Create a map from AbstractContainer attributes to specifications """ spec = getargs('spec', kwargs) self.__spec = spec self.__data_type_key = spec.type_key() @@ -713,7 +714,7 @@ def get_attribute(self, **kwargs): return val @docval({"name": "spec", "type": Spec, "doc": "the spec to get the attribute value for"}, - {"name": "container", "type": Container, "doc": "the container to get the attribute value from"}, + {"name": "container", "type": AbstractContainer, "doc": "the container to get the attribute value from"}, {"name": "manager", "type": BuildManager, "doc": "the BuildManager used for managing this build"}, returns='the value of the attribute') def get_attr_value(self, **kwargs): @@ -771,16 +772,16 @@ def get_const_arg(self, **kwargs): spec = getargs('spec', kwargs) return self.__spec2carg.get(spec, None) - @docval({"name": "container", "type": Container, "doc": "the container to convert to a Builder"}, + @docval({"name": "container", "type": AbstractContainer, "doc": "the container to convert to a Builder"}, {"name": "manager", "type": BuildManager, "doc": "the BuildManager to use for managing this build"}, {"name": "parent", "type": Builder, "doc": "the parent of the resulting Builder", 'default': None}, {"name": "source", "type": str, "doc": "the source of container being built i.e. file path", 'default': None}, {"name": "builder", "type": GroupBuilder, "doc": "the Builder to build on", 'default': None}, {"name": "spec_ext", "type": BaseStorageSpec, "doc": "a spec extension", 'default': None}, - returns="the Builder representing the given Container", rtype=Builder) + returns="the Builder representing the given AbstractContainer", rtype=Builder) def build(self, **kwargs): - ''' Convert a Container to a Builder representation ''' + ''' Convert a AbstractContainer to a Builder representation ''' container, manager, parent, source = getargs('container', 'manager', 'parent', 'source', kwargs) spec_ext = getargs('spec_ext', kwargs) builder = getargs('builder', kwargs) @@ -860,13 +861,14 @@ def __check_dset_spec(self, orig, ext): def __is_reftype(self, data): tmp = data - while hasattr(tmp, '__len__') and not isinstance(tmp, (Container, text_type, binary_type)): + while hasattr(tmp, '__len__') and not isinstance(tmp, (AbstractContainer, text_type, binary_type)): tmptmp = None for t in tmp: # In case of a numeric array stop the iteration at the first element to avoid long-running loop if isinstance(t, (integer_types, float, complex, bool)): break - if hasattr(t, '__len__') and not isinstance(t, (Container, text_type, binary_type)) and len(t) > 0: + if hasattr(t, '__len__') and len(t) > 0 and \ + not isinstance(t, (AbstractContainer, text_type, binary_type)): tmptmp = tmp[0] break if tmptmp is not None: @@ -876,7 +878,7 @@ def __is_reftype(self, data): tmp = None else: tmp = tmp[0] - if isinstance(tmp, Container): + if isinstance(tmp, AbstractContainer): return True else: return False @@ -926,7 +928,8 @@ def __add_attributes(self, builder, attributes, container, build_manager, source msg = "object of data_type %s not found on %s '%s'" % \ (spec.dtype.target_type, type(container).__name__, container.name) else: - msg = "invalid type for reference '%s' (%s) - must be Container" % (spec.name, 
type(attr_value)) + msg = "invalid type for reference '%s' (%s) - "\ + "must be AbstractContainer" % (spec.name, type(attr_value)) raise ValueError(msg) target_builder = build_manager.build(attr_value, source=source) attr_value = ReferenceBuilder(target_builder) @@ -1020,7 +1023,7 @@ def __add_groups(self, builder, groups, container, build_manager, source): self.__add_containers(builder, spec, attr_value, build_manager, source, container) def __add_containers(self, builder, spec, value, build_manager, source, parent_container): - if isinstance(value, Container): + if isinstance(value, AbstractContainer): if value.parent is None: msg = "'%s' (%s) for '%s' (%s)"\ % (value.name, getattr(value, self.spec.type_key()), @@ -1032,7 +1035,7 @@ def __add_containers(self, builder, spec, value, build_manager, source, parent_c else: rendered_obj = build_manager.build(value, source=source) # use spec to determine what kind of HDF5 - # object this Container corresponds to + # object this AbstractContainer corresponds to if isinstance(spec, LinkSpec) or value.parent is not parent_container: name = spec.name builder.set_link(LinkBuilder(rendered_obj, name, builder)) @@ -1052,7 +1055,7 @@ def __add_containers(self, builder, spec, value, build_manager, source, parent_c rendered_obj = build_manager.build(value, source=source) builder.set_link(LinkBuilder(rendered_obj, name=spec.name, parent=builder)) else: - raise ValueError("Found unmodified Container with no source - '%s' with parent '%s'" % + raise ValueError("Found unmodified AbstractContainer with no source - '%s' with parent '%s'" % (value.name, parent_container.name)) else: if any(isinstance(value, t) for t in (list, tuple)): @@ -1060,9 +1063,9 @@ def __add_containers(self, builder, spec, value, build_manager, source, parent_c elif isinstance(value, dict): values = value.values() else: - msg = ("received %s, expected Container - 'value' " - "must be an Container a list/tuple/dict of " - "Containers if 'spec' is a GroupSpec") + msg = ("received %s, expected AbstractContainer - 'value' " + "must be an AbstractContainer a list/tuple/dict of " + "AbstractContainers if 'spec' is a GroupSpec") raise ValueError(msg % value.__class__.__name__) for container in values: if container: @@ -1160,18 +1163,24 @@ def __flatten(self, sub_builder, subspec, manager): return tmp @docval({'name': 'builder', 'type': (DatasetBuilder, GroupBuilder), - 'doc': 'the builder to construct the Container from'}, + 'doc': 'the builder to construct the AbstractContainer from'}, {'name': 'manager', 'type': BuildManager, 'doc': 'the BuildManager for this build'}, - {'name': 'parent', 'type': (Proxy, Container), - 'doc': 'the parent Container/Proxy for the Container being built', 'default': None}) + {'name': 'parent', 'type': (Proxy, AbstractContainer), + 'doc': 'the parent AbstractContainer/Proxy for the AbstractContainer being built', 'default': None}) def construct(self, **kwargs): - ''' Construct an Container from the given Builder ''' + ''' Construct an AbstractContainer from the given Builder ''' builder, manager, parent = getargs('builder', 'manager', 'parent', kwargs) cls = manager.get_cls(builder) # gather all subspecs subspecs = self.__get_subspec_values(builder, self.spec, manager) # get the constructor argument that each specification corresponds to const_args = dict() + # For Data container classes, we need to populate the data constructor argument since + # there is no sub-specification that maps to that argument under the default logic + if issubclass(cls, Data): + if not 
isinstance(builder, DatasetBuilder): + raise ValueError('Can only construct a Data object from a DatasetBuilder - got %s' % type(builder)) + const_args['data'] = builder.data for subspec, value in subspecs.items(): const_arg = self.get_const_arg(subspec) if const_arg is not None: @@ -1201,9 +1210,10 @@ def construct(self, **kwargs): raise_from(Exception(msg), ex) return obj - @docval({'name': 'container', 'type': Container, 'doc': 'the Container to get the Builder name for'}) + @docval({'name': 'container', 'type': AbstractContainer, + 'doc': 'the AbstractContainer to get the Builder name for'}) def get_builder_name(self, **kwargs): - '''Get the name of a Builder that represents a Container''' + '''Get the name of a Builder that represents a AbstractContainer''' container = getargs('container', kwargs) if self.__spec.name not in (NAME_WILDCARD, None): ret = self.__spec.name @@ -1242,7 +1252,7 @@ def data_type(self): class TypeMap(object): - ''' A class to maintain the map between ObjectMappers and Container classes + ''' A class to maintain the map between ObjectMappers and AbstractContainer classes ''' @docval({'name': 'namespaces', 'type': NamespaceCatalog, 'doc': 'the NamespaceCatalog to use', 'default': None}, @@ -1284,13 +1294,13 @@ def copy_mappers(self, type_map): if container_cls in type_map.__mapper_cls: self.register_map(container_cls, type_map.__mapper_cls[container_cls]) - def merge(self, type_map): + def merge(self, type_map, ns_catalog=False): + if ns_catalog: + self.namespace_catalog.merge(type_map.namespace_catalog) for namespace in type_map.__container_types: for data_type in type_map.__container_types[namespace]: - container_cls = type_map.__container_types[namespace][data_type] self.register_container_type(namespace, data_type, container_cls) - for container_cls in type_map.__mapper_cls: self.register_map(container_cls, type_map.__mapper_cls[container_cls]) @@ -1353,7 +1363,7 @@ def __get_type(self, spec): else: return 'array_data', 'data' if isinstance(spec, LinkSpec): - return Container + return AbstractContainer if spec.data_type_def is not None: return self.__get_container_type(spec.data_type_def) if spec.data_type_inc is not None: @@ -1392,7 +1402,7 @@ def __get_cls_dict(self, base, addl_fields, name=None, default_name=None): docval_args = list() new_args = list() fields = list() - for arg in get_docval(base.__init__): + for arg in deepcopy(get_docval(base.__init__)): existing_args.add(arg['name']) if arg['name'] in addl_fields: continue @@ -1436,7 +1446,7 @@ def __init__(self, **kwargs): return {'__init__': __init__, base._fieldsname: tuple(fields)} @docval({"name": "namespace", "type": str, "doc": "the namespace containing the data_type"}, - {"name": "data_type", "type": str, "doc": "the data type to create a Container class for"}, + {"name": "data_type", "type": str, "doc": "the data type to create a AbstractContainer class for"}, returns='the class for the given namespace and data_type', rtype=type) def get_container_cls(self, **kwargs): '''Get the container class from data type specification @@ -1495,6 +1505,22 @@ def __get_container_cls(self, namespace, data_type): self.register_container_type(namespace, data_type, ret) return ret + @docval({'name': 'obj', 'type': (GroupBuilder, DatasetBuilder, LinkBuilder, + GroupSpec, DatasetSpec), + 'doc': 'the object to get the type key for'}) + def __type_key(self, obj): + """ + A wrapper function to simplify the process of getting a type_key for an object. 
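# --- illustrative sketch (not part of this patch) ---------------------------
# The type-key helper above and the updated get_builder_dt below read the
# data_type from the appropriate attribute for both group and dataset
# builders. The builder contents here are hypothetical; 'data_type' and
# 'namespace' are the attribute keys hdmf writes for typed builders, and
# get_type_map comes from the hdmf.common package added later in this patch.
from hdmf.build import DatasetBuilder
from hdmf.common import get_type_map

type_map = get_type_map()
builder = DatasetBuilder('my_data', data=[1, 2, 3],
                         attributes={'namespace': 'hdmf-common',
                                     'data_type': 'VectorData'})
print(type_map.get_builder_dt(builder))  # -> 'VectorData'
print(type_map.get_builder_ns(builder))  # -> 'hdmf-common'
# ---------------------------------------------------------------------------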
+ + The type_key is used to get the data_type from a Builder's attributes. + """ + if isinstance(obj, LinkBuilder): + obj = obj.builder + if isinstance(obj, (GroupBuilder, GroupSpec)): + return self.__ns_catalog.group_spec_cls.type_key() + else: + return self.__ns_catalog.dataset_spec_cls.type_key() + @docval({'name': 'builder', 'type': (DatasetBuilder, GroupBuilder, LinkBuilder), 'doc': 'the builder to get the data_type for'}) def get_builder_dt(self, **kwargs): @@ -1502,7 +1528,13 @@ def get_builder_dt(self, **kwargs): Get the data_type of a builder ''' builder = getargs('builder', kwargs) - ret = builder.attributes.get(self.__ns_catalog.group_spec_cls.type_key()) + ret = None + if isinstance(builder, LinkBuilder): + builder = builder.builder + if isinstance(builder, GroupBuilder): + ret = builder.attributes.get(self.__ns_catalog.group_spec_cls.type_key()) + else: + ret = builder.attributes.get(self.__ns_catalog.dataset_spec_cls.type_key()) if isinstance(ret, bytes): ret = ret.decode('UTF-8') return ret @@ -1520,7 +1552,7 @@ def get_builder_ns(self, **kwargs): return ret @docval({'name': 'builder', 'type': Builder, - 'doc': 'the Builder object to get the corresponding Container class for'}) + 'doc': 'the Builder object to get the corresponding AbstractContainer class for'}) def get_cls(self, **kwargs): ''' Get the class object for the given Builder ''' builder = getargs('builder', kwargs) @@ -1585,13 +1617,13 @@ def get_container_classes(self, **kwargs): ret = filter(lambda x: self.__data_types[x][0] == namespace, ret) return list(ret) - @docval({'name': 'obj', 'type': (Container, Builder), 'doc': 'the object to get the ObjectMapper for'}, + @docval({'name': 'obj', 'type': (AbstractContainer, Builder), 'doc': 'the object to get the ObjectMapper for'}, returns='the ObjectMapper to use for mapping the given object', rtype='ObjectMapper') def get_map(self, **kwargs): """ Return the ObjectMapper object that should be used for the given container """ obj = getargs('obj', kwargs) # get the container class, and namespace/data_type - if isinstance(obj, Container): + if isinstance(obj, AbstractContainer): container_cls = obj.__class__ namespace, data_type = self.get_container_cls_dt(container_cls) if namespace is None: @@ -1628,7 +1660,7 @@ def register_container_type(self, **kwargs): setattr(container_cls, 'namespace', namespace) @docval({"name": "container_cls", "type": type, - "doc": "the Container class for which the given ObjectMapper class gets used for"}, + "doc": "the AbstractContainer class for which the given ObjectMapper class gets used for"}, {"name": "mapper_cls", "type": type, "doc": "the ObjectMapper class to use to map"}) def register_map(self, **kwargs): ''' Map a container class to an ObjectMapper class ''' @@ -1637,7 +1669,7 @@ def register_map(self, **kwargs): raise ValueError('cannot register map for type %s - no data_type found' % container_cls) self.__mapper_cls[container_cls] = mapper_cls - @docval({"name": "container", "type": Container, "doc": "the container to convert to a Builder"}, + @docval({"name": "container", "type": AbstractContainer, "doc": "the container to convert to a Builder"}, {"name": "manager", "type": BuildManager, "doc": "the BuildManager to use for managing this build", 'default': None}, {"name": "source", "type": str, @@ -1645,7 +1677,7 @@ def register_map(self, **kwargs): {"name": "builder", "type": GroupBuilder, "doc": "the Builder to build on", 'default': None}, {"name": "spec_ext", "type": BaseStorageSpec, "doc": "a spec extension", 'default': 
None}) def build(self, **kwargs): - """ Build the GroupBuilder for the given Container""" + """ Build the GroupBuilder for the given AbstractContainer""" container, manager, builder = getargs('container', 'manager', 'builder', kwargs) source, spec_ext = getargs('source', 'spec_ext', kwargs) if manager is None: @@ -1657,18 +1689,18 @@ def build(self, **kwargs): builder = attr_map.build(container, manager, builder=builder, source=source, spec_ext=spec_ext) namespace, data_type = self.get_container_ns_dt(container) builder.set_attribute('namespace', namespace) - builder.set_attribute(attr_map.spec.type_key(), data_type) + builder.set_attribute(self.__type_key(attr_map.spec), data_type) builder.set_attribute(attr_map.spec.id_key(), container.object_id) return builder @docval({'name': 'builder', 'type': (DatasetBuilder, GroupBuilder), - 'doc': 'the builder to construct the Container from'}, + 'doc': 'the builder to construct the AbstractContainer from'}, {'name': 'build_manager', 'type': BuildManager, 'doc': 'the BuildManager for constructing', 'default': None}, {'name': 'parent', 'type': (Proxy, Container), 'doc': 'the parent Container/Proxy for the Container being built', 'default': None}) def construct(self, **kwargs): - """ Construct the Container represented by the given builder """ + """ Construct the AbstractContainer represented by the given builder """ builder, build_manager, parent = getargs('builder', 'build_manager', 'parent', kwargs) if build_manager is None: build_manager = BuildManager(self) @@ -1679,7 +1711,7 @@ def construct(self, **kwargs): else: return attr_map.construct(builder, build_manager, parent) - @docval({"name": "container", "type": Container, "doc": "the container to convert to a Builder"}, + @docval({"name": "container", "type": AbstractContainer, "doc": "the container to convert to a Builder"}, returns='The name a Builder should be given when building this container', rtype=str) def get_builder_name(self, **kwargs): ''' Get the name a Builder should be given ''' diff --git a/src/hdmf/common/__init__.py b/src/hdmf/common/__init__.py new file mode 100644 index 000000000..b15ea5cb4 --- /dev/null +++ b/src/hdmf/common/__init__.py @@ -0,0 +1,191 @@ +'''This package will contain functions, classes, and objects +for reading and writing data in according to the HDMF-common specification +''' +import os.path +from copy import deepcopy + +CORE_NAMESPACE = 'hdmf-common' + +from ..spec import NamespaceCatalog # noqa: E402 +from ..utils import docval, getargs, call_docval_func # noqa: E402 +from ..backends.io import HDMFIO # noqa: E402 +from ..validate import ValidatorMap # noqa: E402 +from ..build import BuildManager, TypeMap # noqa: E402 + + +# a global type map +global __TYPE_MAP + + +# a function to register a container classes with the global map +@docval({'name': 'data_type', 'type': str, 'doc': 'the data_type to get the spec for'}, + {'name': 'namespace', 'type': str, 'doc': 'the name of the namespace', 'default': CORE_NAMESPACE}, + {"name": "container_cls", "type": type, + "doc": "the class to map to the specified data_type", 'default': None}, + is_method=False) +def register_class(**kwargs): + """Register an Container class to use for reading and writing a data_type from a specification + If container_cls is not specified, returns a decorator for registering an Container subclass + as the class for data_type in namespace. 
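# --- illustrative sketch (not part of this patch) ---------------------------
# How the register_class/register_map decorators defined here are meant to be
# used. 'MyContainer' and the 'my-extension' namespace are hypothetical and
# assume a matching extension spec has already been loaded with
# load_namespaces(); without a loaded spec, registration will fail.
from hdmf.common import register_class, register_map
from hdmf.container import Container
from hdmf.build import ObjectMapper

@register_class('MyContainer', 'my-extension')
class MyContainer(Container):
    pass

@register_map(MyContainer)
class MyContainerMap(ObjectMapper):
    pass
# ---------------------------------------------------------------------------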
+ """ + data_type, namespace, container_cls = getargs('data_type', 'namespace', 'container_cls', kwargs) + + def _dec(cls): + __TYPE_MAP.register_container_type(namespace, data_type, cls) + return cls + if container_cls is None: + return _dec + else: + _dec(container_cls) + + +# a function to register an object mapper for a container class +@docval({"name": "container_cls", "type": type, + "doc": "the Container class for which the given ObjectMapper class gets used for"}, + {"name": "mapper_cls", "type": type, "doc": "the ObjectMapper class to use to map", 'default': None}, + is_method=False) +def register_map(**kwargs): + """Register an ObjectMapper to use for a Container class type + If mapper_cls is not specified, returns a decorator for registering an ObjectMapper class + as the mapper for container_cls. If mapper_cls specified, register the class as the mapper for container_cls + """ + container_cls, mapper_cls = getargs('container_cls', 'mapper_cls', kwargs) + + def _dec(cls): + __TYPE_MAP.register_map(container_cls, cls) + return cls + if mapper_cls is None: + return _dec + else: + _dec(mapper_cls) + + +def __get_resources(): + from pkg_resources import resource_filename + from os.path import join + __core_ns_file_name = 'namespace.yaml' + + ret = dict() + ret['namespace_path'] = join(resource_filename(__name__, 'hdmf-common-schema/common'), __core_ns_file_name) + return ret + + +def _get_resources(): + # LEGACY: Needed to support legacy implementation. + return __get_resources() + + +@docval({'name': 'namespace_path', 'type': str, + 'doc': 'the path to the YAML with the namespace definition'}, + returns="the namespaces loaded from the given file", rtype=tuple, + is_method=False) +def load_namespaces(**kwargs): + ''' + Load namespaces from file + ''' + namespace_path = getargs('namespace_path', kwargs) + return __TYPE_MAP.load_namespaces(namespace_path) + + +def available_namespaces(): + return __TYPE_MAP.namespace_catalog.namespaces + + +# load the hdmf-common namespace +__resources = __get_resources() +if os.path.exists(__resources['namespace_path']): + __TYPE_MAP = TypeMap(NamespaceCatalog()) + + load_namespaces(__resources['namespace_path']) + + # import these so the TypeMap gets populated + from . import io as __io # noqa: F401,E402 + + from . import table # noqa: F401,E402 + from . import sparse # noqa: F401,E402 + +else: + raise RuntimeError("Unable to load a TypeMap - no namespace file found") + + +DynamicTable = __TYPE_MAP.get_container_cls(CORE_NAMESPACE, 'DynamicTable') +VectorData = __TYPE_MAP.get_container_cls(CORE_NAMESPACE, 'VectorData') +VectorIndex = __TYPE_MAP.get_container_cls(CORE_NAMESPACE, 'VectorIndex') +ElementIdentifiers = __TYPE_MAP.get_container_cls(CORE_NAMESPACE, 'ElementIdentifiers') +DynamicTableRegion = __TYPE_MAP.get_container_cls(CORE_NAMESPACE, 'DynamicTableRegion') +CSRMatrix = __TYPE_MAP.get_container_cls(CORE_NAMESPACE, 'CSRMatrix') + + +@docval({'name': 'extensions', 'type': (str, TypeMap, list), + 'doc': 'a path to a namespace, a TypeMap, or a list consisting paths to namespaces and TypeMaps', + 'default': None}, + returns="the namespaces loaded from the given file", rtype=tuple, + is_method=False) +def get_type_map(**kwargs): + ''' + Get a BuildManager to use for I/O using the given extensions. 
If no extensions are provided, + return a BuildManager that uses the core namespace + ''' + extensions = getargs('extensions', kwargs) + type_map = None + if extensions is None: + type_map = deepcopy(__TYPE_MAP) + else: + if isinstance(extensions, TypeMap): + type_map = extensions + else: + type_map = deepcopy(__TYPE_MAP) + if isinstance(extensions, list): + for ext in extensions: + if isinstance(ext, str): + type_map.load_namespaces(ext) + elif isinstance(ext, TypeMap): + type_map.merge(ext) + else: + msg = 'extensions must be a list of paths to namespace specs or a TypeMaps' + raise ValueError(msg) + elif isinstance(extensions, str): + type_map.load_namespaces(extensions) + elif isinstance(extensions, TypeMap): + type_map.merge(extensions) + return type_map + + +@docval({'name': 'extensions', 'type': (str, TypeMap, list), + 'doc': 'a path to a namespace, a TypeMap, or a list consisting paths to namespaces and TypeMaps', + 'default': None}, + returns="the namespaces loaded from the given file", rtype=tuple, + is_method=False) +def get_manager(**kwargs): + ''' + Get a BuildManager to use for I/O using the given extensions. If no extensions are provided, + return a BuildManager that uses the core namespace + ''' + type_map = call_docval_func(get_type_map, kwargs) + return BuildManager(type_map) + + +# a function to get the container class for a give type +@docval({'name': 'data_type', 'type': str, + 'doc': 'the data_type to get the Container class for'}, + {'name': 'namespace', 'type': str, 'doc': 'the namespace the data_type is defined in'}, + is_method=False) +def get_class(**kwargs): + """Get the class object of the Container subclass corresponding to a given neurdata_type. + """ + data_type, namespace = getargs('data_type', 'namespace', kwargs) + return __TYPE_MAP.get_container_cls(namespace, data_type) + + +@docval({'name': 'io', 'type': HDMFIO, + 'doc': 'the HDMFIO object to read from'}, + {'name': 'namespace', 'type': str, + 'doc': 'the namespace to validate against', 'default': CORE_NAMESPACE}, + returns="errors in the file", rtype=list, + is_method=False) +def validate(**kwargs): + """Validate an file against a namespace""" + io, namespace = getargs('io', 'namespace', kwargs) + builder = io.read_builder() + validator = ValidatorMap(io.manager.namespace_catalog.get_namespace(name=namespace)) + return validator.validate(builder) diff --git a/src/hdmf/common/hdmf-common-schema b/src/hdmf/common/hdmf-common-schema new file mode 160000 index 000000000..d5bb0c755 --- /dev/null +++ b/src/hdmf/common/hdmf-common-schema @@ -0,0 +1 @@ +Subproject commit d5bb0c7550231099e3cbcb9cd4ec9dd47995ca23 diff --git a/src/hdmf/common/io/__init__.py b/src/hdmf/common/io/__init__.py new file mode 100644 index 000000000..bd97aa051 --- /dev/null +++ b/src/hdmf/common/io/__init__.py @@ -0,0 +1 @@ +from . import table # noqa: F401 diff --git a/src/hdmf/common/io/table.py b/src/hdmf/common/io/table.py new file mode 100644 index 000000000..08153ff99 --- /dev/null +++ b/src/hdmf/common/io/table.py @@ -0,0 +1,46 @@ +from ...utils import docval, getargs +from ...build import ObjectMapper, BuildManager +from ...spec import Spec +from ...container import Container +from ..table import DynamicTable, VectorIndex +from .. 
import register_map + + +@register_map(DynamicTable) +class DynamicTableMap(ObjectMapper): + + def __init__(self, spec): + super(DynamicTableMap, self).__init__(spec) + vector_data_spec = spec.get_data_type('VectorData') + vector_index_spec = spec.get_data_type('VectorIndex') + self.map_spec('columns', vector_data_spec) + self.map_spec('columns', vector_index_spec) + + @ObjectMapper.object_attr('colnames') + def attr_columns(self, container, manager): + if all(len(col) == 0 for col in container.columns): + return tuple() + return container.colnames + + @docval({"name": "spec", "type": Spec, "doc": "the spec to get the attribute value for"}, + {"name": "container", "type": Container, "doc": "the container to get the attribute value from"}, + {"name": "manager", "type": BuildManager, "doc": "the BuildManager used for managing this build"}, + returns='the value of the attribute') + def get_attr_value(self, **kwargs): + ''' Get the value of the attribute corresponding to this spec from the given container ''' + spec, container, manager = getargs('spec', 'container', 'manager', kwargs) + attr_value = super(DynamicTableMap, self).get_attr_value(spec, container, manager) + if attr_value is None and spec.name in container: + if spec.data_type_inc == 'VectorData': + attr_value = container[spec.name] + if isinstance(attr_value, VectorIndex): + attr_value = attr_value.target + elif spec.data_type_inc == 'DynamicTableRegion': + attr_value = container[spec.name] + if attr_value.table is None: + msg = "empty or missing table for DynamicTableRegion '%s' in DynamicTable '%s'" %\ + (attr_value.name, container.name) + raise ValueError(msg) + elif spec.data_type_inc == 'VectorIndex': + attr_value = container[spec.name] + return attr_value diff --git a/src/hdmf/common/sparse.py b/src/hdmf/common/sparse.py new file mode 100644 index 000000000..86f0102e7 --- /dev/null +++ b/src/hdmf/common/sparse.py @@ -0,0 +1,54 @@ +import scipy.sparse as sps +import numpy as np +import h5py + +from ..container import Container +from ..utils import docval, getargs, call_docval_func + +from . import register_class + + +@register_class('CSRMatrix') +class CSRMatrix(Container): + + @docval({'name': 'data', 'type': (sps.csr_matrix, np.ndarray, h5py.Dataset), + 'doc': 'the data to use for this CSRMatrix or CSR data array.' 
+ 'If passing CSR data array, *indices*, *indptr*, and *shape* must also be provided'}, + {'name': 'indices', 'type': (np.ndarray, h5py.Dataset), 'doc': 'CSR index array', 'default': None}, + {'name': 'indptr', 'type': (np.ndarray, h5py.Dataset), 'doc': 'CSR index pointer array', 'default': None}, + {'name': 'shape', 'type': (list, tuple, np.ndarray), 'doc': 'the shape of the matrix', 'default': None}, + {'name': 'name', 'type': str, 'doc': 'the name to use for this when storing', 'default': 'csr_matrix'}) + def __init__(self, **kwargs): + call_docval_func(super().__init__, kwargs) + data = getargs('data', kwargs) + if isinstance(data, (np.ndarray, h5py.Dataset)): + if data.ndim == 2: + data = sps.csr_matrix(self.data) + elif data.ndim == 1: + indptr, indices, shape = getargs('indptr', 'indices', 'shape', kwargs) + if any(_ is None for _ in (indptr, indices, shape)): + raise ValueError("must specify indptr, indices, and shape when passing data array") + self.__check_ind(indptr, 'indptr') + self.__check_ind(indices, 'indices') + if len(shape) != 2: + raise ValueError('shape must specify two and only two dimensions') + data = sps.csr_matrix((data, indices, indptr), shape=shape) + else: + raise ValueError("cannot use ndarray of dimensionality > 2") + self.__data = data + self.__shape = data.shape + + @staticmethod + def __check_ind(ar, arg): + if not (ar.ndim == 1 or np.issubdtype(ar.dtype, int)): + raise ValueError('%s must be a 1D array of integers' % arg) + + def __getattr__(self, val): + return getattr(self.__data, val) + + @property + def shape(self): + return self.__shape + + def to_spmat(self): + return self.__data diff --git a/src/hdmf/common/table.py b/src/hdmf/common/table.py new file mode 100644 index 000000000..326242e8f --- /dev/null +++ b/src/hdmf/common/table.py @@ -0,0 +1,623 @@ +from h5py import Dataset +import numpy as np +import pandas as pd + +from ..utils import docval, getargs, ExtenderMeta, call_docval_func, popargs, pystr +from ..container import Container, Data + +from . 
import register_class + + +@register_class('Index') +class Index(Data): + + __fields__ = ("target",) + + @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'}, + {'name': 'data', 'type': ('array_data', 'data'), + 'doc': 'a dataset where the first dimension is a concatenation of multiple vectors'}, + {'name': 'target', 'type': Data, + 'doc': 'the target dataset that this index applies to'}) + def __init__(self, **kwargs): + call_docval_func(super(Index, self).__init__, kwargs) + + +@register_class('VectorData') +class VectorData(Data): + + __fields__ = ("description",) + + @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'}, + {'name': 'description', 'type': str, 'doc': 'a description for this column'}, + {'name': 'data', 'type': ('array_data', 'data'), + 'doc': 'a dataset where the first dimension is a concatenation of multiple vectors', 'default': list()}) + def __init__(self, **kwargs): + call_docval_func(super(VectorData, self).__init__, kwargs) + self.description = getargs('description', kwargs) + + @docval({'name': 'val', 'type': None, 'doc': 'the value to add to this column'}) + def add_row(self, **kwargs): + val = getargs('val', kwargs) + self.data.append(val) + + +@register_class('VectorIndex') +class VectorIndex(Index): + + @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorIndex'}, + {'name': 'data', 'type': ('array_data', 'data'), + 'doc': 'a 1D dataset containing indexes that apply to VectorData object'}, + {'name': 'target', 'type': VectorData, + 'doc': 'the target dataset that this index applies to'}) + def __init__(self, **kwargs): + call_docval_func(super(VectorIndex, self).__init__, kwargs) + self.target = getargs('target', kwargs) + + def add_vector(self, arg): + self.target.extend(arg) + self.data.append(len(self.target)) + + def add_row(self, arg): + self.add_vector(arg) + + def __getitem_helper(self, arg): + start = 0 if arg == 0 else self.data[arg-1] + end = self.data[arg] + return self.target[start:end] + + def __getitem__(self, arg): + if isinstance(arg, slice): + indices = list(range(*arg.indices(len(self.data)))) + ret = list() + for i in indices: + ret.append(self.__getitem_helper(i)) + return ret + else: + return self.__getitem_helper(arg) + + +@register_class('ElementIdentifiers') +class ElementIdentifiers(Data): + + @docval({'name': 'name', 'type': str, 'doc': 'the name of this ElementIdentifiers'}, + {'name': 'data', 'type': ('array_data', 'data'), 'doc': 'a 1D dataset containing identifiers', + 'default': list()}) + def __init__(self, **kwargs): + call_docval_func(super(ElementIdentifiers, self).__init__, kwargs) + + +@register_class('DynamicTable') +class DynamicTable(Container): + r""" + A column-based table. Columns are defined by the argument *columns*. This argument + must be a list/tuple of :class:`~hdmf.common.table.VectorData` and :class:`~hdmf.common.table.VectorIndex` objects + or a list/tuple of dicts containing the keys ``name`` and ``description`` that provide the name and description + of each column in the table. Additionally, the keys ``index`` and ``table`` for specifying additional structure to + the table columns. Setting the key ``index`` to ``True`` can be used to indicate that the + :class:`~hdmf.common.table.VectorData` column will store a ragged array (i.e. will be accompanied with a + :class:`~hdmf.common.table.VectorIndex`). Setting the key ``table`` to ``True`` can be used to indicate that the + column will store regions to another DynamicTable. 
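# --- illustrative sketch (not part of this patch) ---------------------------
# Constructing a DynamicTable with the column-dict form described above. The
# table name and values are made up; a ragged column ('index': True) is shown
# in the add_column sketch further below.
from hdmf.common import DynamicTable

table = DynamicTable(
    name='subjects', description='demo table',
    columns=[{'name': 'age', 'description': 'age in years'},
             {'name': 'weight', 'description': 'weight in kg'}])
table.add_row(age=21, weight=70.5)
table.add_row(age=35, weight=80.0)
print(table['age'][:])  # -> [21, 35]
# ---------------------------------------------------------------------------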
+ + Columns in DynamicTable subclasses can be statically defined by specifying the class attribute *\_\_columns\_\_*, + rather than specifying them at runtime at the instance level. This is useful for defining a table structure + that will get reused. The requirements for *\_\_columns\_\_* are the same as the requirements described above + for specifying table columns with the *columns* argument to the DynamicTable constructor. + """ + + __fields__ = ( + {'name': 'id', 'child': True}, + {'name': 'columns', 'child': True}, + 'colnames', + 'description' + ) + + __columns__ = tuple() + + @ExtenderMeta.pre_init + def __gather_columns(cls, name, bases, classdict): + ''' + This classmethod will be called during class declaration in the metaclass to automatically + include all columns declared in subclasses + ''' + if not isinstance(cls.__columns__, tuple): + msg = "'__columns__' must be of type tuple, found %s" % type(cls.__columns__) + raise TypeError(msg) + + if len(bases) and 'DynamicTable' in globals() and issubclass(bases[-1], Container) \ + and bases[-1].__columns__ is not cls.__columns__: + new_columns = list(cls.__columns__) + new_columns[0:0] = bases[-1].__columns__ + cls.__columns__ = tuple(new_columns) + + @docval({'name': 'name', 'type': str, 'doc': 'the name of this table'}, # noqa: C901 + {'name': 'description', 'type': str, 'doc': 'a description of what is in this table'}, + {'name': 'id', 'type': ('array_data', ElementIdentifiers), 'doc': 'the identifiers for this table', + 'default': None}, + {'name': 'columns', 'type': (tuple, list), 'doc': 'the columns in this table', 'default': None}, + {'name': 'colnames', 'type': 'array_data', 'doc': 'the names of the columns in this table', + 'default': None}) + def __init__(self, **kwargs): + id, columns, desc, colnames = popargs('id', 'columns', 'description', 'colnames', kwargs) + call_docval_func(super(DynamicTable, self).__init__, kwargs) + self.description = desc + + # All tables must have ElementIdentifiers (i.e. 
a primary key column) + # Here, we figure out what to do for that + if id is not None: + if not isinstance(id, ElementIdentifiers): + id = ElementIdentifiers('id', data=id) + else: + id = ElementIdentifiers('id') + + if columns is not None: + if len(columns) > 0: + # If columns have been passed in, check them over + # and process accordingly + if isinstance(columns[0], dict): + columns = self.__build_columns(columns) + elif not all(isinstance(c, (VectorData, VectorIndex)) for c in columns): + raise ValueError("'columns' must be a list of VectorData, DynamicTableRegion or VectorIndex") + colset = {c.name: c for c in columns} + for c in columns: + if isinstance(c, VectorIndex): + colset.pop(c.target.name) + lens = [len(c) for c in colset.values()] + if not all(i == lens[0] for i in lens): + raise ValueError("columns must be the same length") + if lens[0] != len(id): + if len(id) > 0: + raise ValueError("must provide same number of ids as length of columns") + else: + id.data.extend(range(lens[0])) + else: + # if the user has not passed in columns, make a place to put them, + # as they will presumably be adding new columns + columns = list() + + self.id = id + + if colnames is None: + if columns is None: + # make placeholder for columns if nothing was given + self.colnames = list() + self.columns = list() + else: + # Figure out column names if columns were given + tmp = list() + for col in columns: + if isinstance(col, VectorIndex): + continue + tmp.append(col.name) + self.colnames = tuple(tmp) + self.columns = columns + else: + # Calculate the order of column names + if columns is None: + raise ValueError("Must supply 'columns' if specifying 'colnames'") + else: + # order the columns according to the column names + self.colnames = tuple(pystr(c) for c in colnames) + col_dict = {col.name: col for col in columns} + order = dict() + indexed = dict() + for col in columns: + if isinstance(col, VectorIndex): + indexed[col.target.name] = True + else: + if col.name in indexed: + continue + indexed[col.name] = False + i = 0 + for name in self.colnames: + col = col_dict[name] + order[col.name] = i + if indexed[col.name]: + i = i + 1 + i = i + 1 + tmp = [None] * i + for col in columns: + if indexed.get(col.name, False): + continue + if isinstance(col, VectorData): + pos = order[col.name] + tmp[pos] = col + elif isinstance(col, VectorIndex): + pos = order[col.target.name] + tmp[pos] = col + tmp[pos+1] = col.target + self.columns = list(tmp) + + # to make generating DataFrames and Series easier + col_dict = dict() + self.__indices = dict() + for col in self.columns: + if isinstance(col, VectorData): + existing = col_dict.get(col.name) + # if we added this column using its index, ignore this column + if existing is not None: + if isinstance(existing, VectorIndex): + if existing.target.name == col.name: + continue + else: + raise ValueError("duplicate column does not target VectorData '%s'" % col.name) + else: + raise ValueError("duplicate column found: '%s'" % col.name) + else: + col_dict[col.name] = col + elif isinstance(col, VectorIndex): + col_dict[col.target.name] = col # use target name for reference and VectorIndex for retrieval + self.__indices[col.name] = col + + self.__df_cols = [self.id] + [col_dict[name] for name in self.colnames] + self.__colids = {name: i+1 for i, name in enumerate(self.colnames)} + for col in self.__columns__: + if col.get('required', False) and col['name'] not in self.__colids: + self.add_column(col['name'], col['description'], + index=col.get('index', False), + 
table=col.get('table', False)) + + @staticmethod + def __build_columns(columns, df=None): + """ + Build column objects according to specifications + """ + tmp = list() + for d in columns: + name = d['name'] + desc = d.get('description', 'no description') + data = None + if df is not None: + data = list(df[name].values) + if d.get('index', False): + index_data = None + if data is not None: + index_data = [len(data[0])] + for i in range(1, len(data)): + index_data.append(len(data[i]) + index_data[i-1]) + # assume data came in through a DataFrame, so we need + # to concatenate it + tmp_data = list() + for d in data: + tmp_data.extend(d) + data = tmp_data + vdata = VectorData(name, desc, data=data) + vindex = VectorIndex("%s_index" % name, index_data, target=vdata) + tmp.append(vindex) + tmp.append(vdata) + else: + if data is None: + data = list() + cls = VectorData + if d.get('table', False): + cls = DynamicTableRegion + tmp.append(cls(name, desc, data=data)) + return tmp + + def __len__(self): + return len(self.id) + + @docval({'name': 'data', 'type': dict, 'doc': 'the data to put in this row', 'default': None}, + {'name': 'id', 'type': int, 'doc': 'the ID for the row', 'default': None}, + allow_extra=True) + def add_row(self, **kwargs): + ''' + Add a row to the table. If *id* is not provided, it will auto-increment. + ''' + data, row_id = popargs('data', 'id', kwargs) + data = data if data is not None else kwargs + + extra_columns = set(list(data.keys())) - set(list(self.__colids.keys())) + missing_columns = set(list(self.__colids.keys())) - set(list(data.keys())) + + # check to see if any of the extra columns just need to be added + if extra_columns: + for col in self.__columns__: + if col['name'] in extra_columns: + if data[col['name']] is not None: + self.add_column(col['name'], col['description'], + index=col.get('index', False), + table=col.get('table', False)) + extra_columns.remove(col['name']) + + if extra_columns or missing_columns: + raise ValueError( + '\n'.join([ + 'row data keys don\'t match available columns', + 'you supplied {} extra keys: {}'.format(len(extra_columns), extra_columns), + 'and were missing {} keys: {}'.format(len(missing_columns), missing_columns) + ]) + ) + + if row_id is None: + row_id = data.pop('id', None) + if row_id is None: + row_id = len(self) + self.id.data.append(row_id) + + for colname, colnum in self.__colids.items(): + if colname not in data: + raise ValueError("column '%s' missing" % colname) + c = self.__df_cols[colnum] + if isinstance(c, VectorIndex): + c.add_vector(data[colname]) + else: + c.add_row(data[colname]) + + def __eq__(self, other): + return self.to_dataframe().equals(other.to_dataframe()) + + @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'}, + {'name': 'description', 'type': str, 'doc': 'a description for this column'}, + {'name': 'data', 'type': ('array_data', 'data'), + 'doc': 'a dataset where the first dimension is a concatenation of multiple vectors', 'default': list()}, + {'name': 'table', 'type': (bool, 'DynamicTable'), + 'doc': 'whether or not this is a table region or the table the region applies to', 'default': False}, + {'name': 'index', 'type': (bool, VectorIndex, 'array_data'), + 'doc': 'whether or not this column should be indexed', 'default': False}) + def add_column(self, **kwargs): + """ + Add a column to this table. If data is provided, it must + contain the same number of rows as the current state of the table. 
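# --- illustrative sketch (not part of this patch) ---------------------------
# Adding columns after table creation, including a ragged column backed by a
# VectorIndex (index=True), as implemented by add_column above. Names and
# values are made up.
from hdmf.common import DynamicTable

table = DynamicTable(name='trials', description='demo table')
table.add_column(name='score', description='one value per row')
table.add_column(name='tags', description='a variable-length list per row',
                 index=True)
table.add_row(score=0.5, tags=['a', 'b'])
table.add_row(score=0.7, tags=['c'])
print(table['tags'][1])  # -> ['c'], resolved through the 'tags_index' column
# ---------------------------------------------------------------------------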
+ """ + name, data = getargs('name', 'data', kwargs) + index, table = popargs('index', 'table', kwargs) + if name in self.__colids: + msg = "column '%s' already exists in DynamicTable '%s'" % (name, self.name) + raise ValueError(msg) + + ckwargs = dict(kwargs) + cls = VectorData + + # Add table if it's been specified + if table is not False: + cls = DynamicTableRegion + if isinstance(table, DynamicTable): + ckwargs['table'] = table + + col = cls(**ckwargs) + col.parent = self + columns = [col] + + # Add index if it's been specified + if index is not False: + if isinstance(index, VectorIndex): + col_index = index + elif isinstance(index, bool): # make empty VectorIndex + if len(col) > 0: + raise ValueError("cannot pass empty index with non-empty data to index") + col_index = VectorIndex(name + "_index", list(), col) + else: # make VectorIndex with supplied data + if len(col) == 0: + raise ValueError("cannot pass non-empty index with empty data to index") + col_index = VectorIndex(name + "_index", index, col) + columns.insert(0, col_index) + if not isinstance(col_index.parent, Container): + col_index.parent = self + # else, the ObjectMapper will create a link from self (parent) to col_index (child with existing parent) + col = col_index + self.__indices[col_index.name] = col_index + + if len(col) != len(self.id): + raise ValueError("column must have the same number of rows as 'id'") + self.__colids[name] = len(self.__df_cols) + self.fields['colnames'] = tuple(list(self.colnames)+[name]) + self.fields['columns'] = tuple(list(self.columns)+columns) + self.__df_cols.append(col) + + @docval({'name': 'name', 'type': str, 'doc': 'the name of the DynamicTableRegion object'}, + {'name': 'region', 'type': (slice, list, tuple), 'doc': 'the indices of the table'}, + {'name': 'description', 'type': str, 'doc': 'a brief description of what the region is'}) + def create_region(self, **kwargs): + region = getargs('region', kwargs) + if isinstance(region, slice): + if (region.start is not None and region.start < 0) or (region.stop is not None and region.stop > len(self)): + msg = 'region slice %s is out of range for this DynamicTable of length ' % (str(region), len(self)) + raise IndexError(msg) + region = list(range(*region.indices(len(self)))) + else: + for idx in region: + if idx < 0 or idx >= len(self): + raise IndexError('The index ' + str(idx) + + ' is out of range for this DynamicTable of length ' + + str(len(self))) + desc = getargs('description', kwargs) + name = getargs('name', kwargs) + return DynamicTableRegion(name, region, desc, self) + + def __getitem__(self, key): + ret = None + if isinstance(key, tuple): + # index by row and column, return specific cell + arg1 = key[0] + arg2 = key[1] + if isinstance(arg2, str): + arg2 = self.__colids[arg2] + ret = self.__df_cols[arg2][arg1] + else: + arg = key + if isinstance(arg, str): + # index by one string, return column + if arg in self.__colids: + ret = self.__df_cols[self.__colids[arg]] + elif arg in self.__indices: + return self.__indices[arg] + else: + raise KeyError(arg) + elif isinstance(arg, (int, np.int8, np.int16, np.int32, np.int64)): + # index by int, return row + ret = tuple(col[arg] for col in self.__df_cols) + elif isinstance(arg, (tuple, list)): + # index by a list of ints, return multiple rows + ret = list() + for i in arg: + ret.append(tuple(col[i] for col in self.__df_cols)) + + return ret + + def __contains__(self, val): + return val in self.__colids or val in self.__indices + + def get(self, key, default=None): + if key in self: + 
+            return self[key]
+        return default
+
+    def to_dataframe(self, exclude=set([])):
+        '''Produce a pandas DataFrame containing this table's data.
+        '''
+
+        data = {}
+        for name in self.colnames:
+            if name in exclude:
+                continue
+            col = self.__df_cols[self.__colids[name]]
+
+            if isinstance(col.data, (Dataset, np.ndarray)) and col.data.ndim > 1:
+                data[name] = [x for x in col[:]]
+            else:
+                data[name] = col[:]
+
+        return pd.DataFrame(data, index=pd.Index(name=self.id.name, data=self.id.data))
+
+    @classmethod
+    @docval(
+        {'name': 'df', 'type': pd.DataFrame, 'doc': 'source DataFrame'},
+        {'name': 'name', 'type': str, 'doc': 'the name of this table'},
+        {
+            'name': 'index_column',
+            'type': str,
+            'doc': 'if provided, this column will become the table\'s index',
+            'default': None
+        },
+        {
+            'name': 'table_description',
+            'type': str,
+            'doc': 'a description of what is in the resulting table',
+            'default': ''
+        },
+        {
+            'name': 'columns',
+            'type': (list, tuple),
+            'doc': 'a list/tuple of dictionaries specifying columns in the table',
+            'default': None
+        },
+        allow_extra=True
+    )
+    def from_dataframe(cls, **kwargs):
+        '''
+        Construct an instance of DynamicTable (or a subclass) from a pandas DataFrame.
+
+        The columns of the resulting table are defined by the columns of the
+        dataframe and the index by the dataframe's index (make sure it has a
+        name!) or by a column whose name is supplied to the index_column
+        parameter. We recommend that you supply *columns* - a list/tuple of
+        dictionaries containing the name and description of each column - to help
+        others understand the contents of your table. See
+        :py:class:`~hdmf.common.table.DynamicTable` for more details on *columns*.
+        '''
+
+        columns = kwargs.pop('columns')
+        df = kwargs.pop('df')
+        name = kwargs.pop('name')
+        index_column = kwargs.pop('index_column')
+        table_description = kwargs.pop('table_description')
+        column_descriptions = kwargs.pop('column_descriptions', dict())
+
+        supplied_columns = dict()
+        if columns:
+            supplied_columns = {x['name']: x for x in columns}
+
+        class_cols = {x['name']: x for x in cls.__columns__}
+        required_cols = set(x['name'] for x in cls.__columns__ if 'required' in x and x['required'])
+        df_cols = df.columns
+        if required_cols - set(df_cols):
+            raise ValueError('missing required cols: ' + str(required_cols - set(df_cols)))
+        if set(supplied_columns.keys()) - set(df_cols):
+            raise ValueError('cols specified but not provided: ' + str(set(supplied_columns.keys()) - set(df_cols)))
+        columns = []
+        for col_name in df_cols:
+            if col_name in class_cols:
+                columns.append(class_cols[col_name])
+            elif col_name in supplied_columns:
+                columns.append(supplied_columns[col_name])
+            else:
+                columns.append({'name': col_name,
+                                'description': column_descriptions.get(col_name, 'no description')})
+                if hasattr(df[col_name].iloc[0], '__len__') and not isinstance(df[col_name].iloc[0], str):
+                    lengths = [len(x) for x in df[col_name]]
+                    if not lengths[1:] == lengths[:-1]:
+                        columns[-1].update(index=True)
+
+        if index_column is not None:
+            ids = ElementIdentifiers(name=index_column, data=df[index_column].values.tolist())
+        else:
+            index_name = df.index.name if df.index.name is not None else 'id'
+            ids = ElementIdentifiers(name=index_name, data=df.index.values.tolist())
+
+        columns = cls.__build_columns(columns, df=df)
+
+        return cls(name=name, id=ids, columns=columns, description=table_description, **kwargs)
+
+    def copy(self):
+        """
+        Return a copy of this DynamicTable.
+        This is useful for linking.
+ """ + kwargs = dict(name=self.name, id=self.id, columns=self.columns, description=self.description, + colnames=self.colnames) + return self.__class__(**kwargs) + + +@register_class('DynamicTableRegion') +class DynamicTableRegion(VectorData): + """ + An object for easily slicing into a DynamicTable + """ + + __fields__ = ( + 'table', + 'description' + ) + + @docval({'name': 'name', 'type': str, 'doc': 'the name of this VectorData'}, + {'name': 'data', 'type': ('array_data', 'data'), + 'doc': 'a dataset where the first dimension is a concatenation of multiple vectors'}, + {'name': 'description', 'type': str, 'doc': 'a description of what this region represents'}, + {'name': 'table', 'type': DynamicTable, + 'doc': 'the DynamicTable this region applies to', 'default': None}) + def __init__(self, **kwargs): + t = popargs('table', kwargs) + call_docval_func(super(DynamicTableRegion, self).__init__, kwargs) + self.table = t + + @property + def table(self): + return self.fields.get('table') + + @table.setter + def table(self, val): + if val is None: + return + if 'table' in self.fields: + msg = "can't set attribute 'table' -- already set" + raise AttributeError(msg) + for idx in self.data: + if idx < 0 or idx >= len(val): + raise IndexError('The index ' + str(idx) + + ' is out of range for this DynamicTable of length ' + + str(len(val))) + self.fields['table'] = val + + def __getitem__(self, key): + # treat the list of indices as data that can be indexed. then pass the + # result to the table to get the data + if isinstance(key, tuple): + arg1 = key[0] + arg2 = key[1] + return self.table[self.data[arg1], arg2] + elif isinstance(key, (int, slice)): + if isinstance(key, int) and key >= len(self.data): + raise IndexError('index {} out of bounds for data of length {}'.format(key, len(self.data))) + return self.table[self.data[key]] + else: + raise ValueError("unrecognized argument: '%s'" % key) diff --git a/src/hdmf/container.py b/src/hdmf/container.py index aa5800f39..a9ab1fa96 100644 --- a/src/hdmf/container.py +++ b/src/hdmf/container.py @@ -1,19 +1,105 @@ +import numpy as np from abc import abstractmethod from uuid import uuid4 from six import with_metaclass -from .utils import docval, getargs, ExtenderMeta +from .utils import docval, get_docval, call_docval_func, getargs, ExtenderMeta from warnings import warn -class Container(with_metaclass(ExtenderMeta, object)): +class AbstractContainer(with_metaclass(ExtenderMeta, object)): + # The name of the class attribute that subclasses use to autogenerate properties + # This parameterization is supplied in case users would like to configure + # the class attribute name to something domain-specific _fieldsname = '__fields__' - # @docval({'name': 'container_source', 'type': str, 'doc': 'source of this Container', 'default': None}, - # {'name': 'object_id', 'type': str, 'doc': 'UUID4 unique identifier for this Container', 'default': None}, - # {'name': 'parent', 'type': str, 'doc': 'parent Container for this Container', 'default': None}) + _data_type_attr = 'data_type' + + # Subclasses use this class attribute to add properties to autogenerate + # Autogenerated properties will store values in self.__field_values + __fields__ = tuple() + + _pconf_allowed_keys = {'name', 'doc', 'settable'} + + # Override the _setter factor function, so directives that apply to + # Container do not get used on Data + @classmethod + def _setter(cls, field): + """ + Make a setter function for creating a :py:func:`property` + """ + name = field['name'] + + if not 
field.get('settable', True): + return None + + def setter(self, val): + if val is None: + return + if name in self.fields: + msg = "can't set attribute '%s' -- already set" % name + raise AttributeError(msg) + self.fields[name] = val + + return setter + + @classmethod + def _getter(cls, field): + """ + Make a getter function for creating a :py:func:`property` + """ + doc = field.get('doc') + name = field['name'] + + def getter(self): + return self.fields.get(name) + + setattr(getter, '__doc__', doc) + return getter + + @staticmethod + def __check_field_spec(field): + """ + A helper function for __gather_fields to make sure we are always working + with a dict specification and that the specification contains the correct keys + """ + tmp = field + if isinstance(tmp, dict): + if 'name' not in tmp: + raise ValueError("must specify 'name' if using dict in __fields__") + else: + tmp = {'name': tmp} + return tmp + + @ExtenderMeta.pre_init + def __gather_fields(cls, name, bases, classdict): + ''' + This classmethod will be called during class declaration in the metaclass to automatically + create setters and getters for fields that need to be exported + ''' + fields = getattr(cls, cls._fieldsname) + if not isinstance(fields, tuple): + msg = "'%s' must be of type tuple" % cls._fieldsname + raise TypeError(msg) + + if len(bases) and 'Container' in globals() and issubclass(bases[-1], Container) \ + and getattr(bases[-1], bases[-1]._fieldsname) is not fields: + new_fields = list(fields) + new_fields[0:0] = getattr(bases[-1], bases[-1]._fieldsname) + setattr(cls, cls._fieldsname, tuple(new_fields)) + new_fields = list() + docs = {dv['name']: dv['doc'] for dv in get_docval(cls.__init__)} + for f in getattr(cls, cls._fieldsname): + pconf = cls.__check_field_spec(f) + pname = pconf['name'] + pconf.setdefault('doc', docs.get(pname)) + if not hasattr(cls, pname): + setattr(cls, pname, property(cls._getter(pconf), cls._setter(pconf))) + new_fields.append(pname) + setattr(cls, cls._fieldsname, tuple(new_fields)) + def __new__(cls, *args, **kwargs): - inst = super(Container, cls).__new__(cls) + inst = super().__new__(cls) inst.__container_source = kwargs.pop('container_source', None) inst.__parent = None inst.__children = list() @@ -28,9 +114,33 @@ def __init__(self, **kwargs): if '/' in name: raise ValueError("name '" + name + "' cannot contain '/'") self.__name = name + self.__field_values = dict() - def __repr__(self): - return "<%s '%s' at 0x%d>" % (self.__class__.__name__, self.name, id(self)) + @property + def name(self): + ''' + The name of this Container + ''' + return self.__name + + @docval({'name': 'data_type', 'type': str, 'doc': 'the data_type to search for', 'default': None}) + def get_ancestor(self, **kwargs): + """ + Traverse parent hierarchy and return first instance of the specified data_type + """ + data_type = getargs('data_type', kwargs) + if data_type is None: + return self.parent + p = self.parent + while p is not None: + if getattr(p, p._data_type_attr) == data_type: + return p + p = p.parent + return None + + @property + def fields(self): + return self.__field_values @property def object_id(self): @@ -63,7 +173,7 @@ def add_child(self, **kwargs): # if child.parent is a Container, then the mismatch between child.parent and parent # is used to make a soft/external link from the parent to a child elsewhere # if child.parent is not a Container, it is either None or a Proxy and should be set to self - if not isinstance(child.parent, Container): + if not isinstance(child.parent, 
AbstractContainer): # actually add the child to the parent in parent setter child.parent = self else: @@ -73,13 +183,6 @@ def add_child(self, **kwargs): def type_hierarchy(cls): return cls.__mro__ - @property - def name(self): - ''' - The name of this Container - ''' - return self.__name - @property def container_source(self): ''' @@ -99,7 +202,7 @@ def parent(self): The parent Container of this Container ''' # do it this way because __parent may not exist yet (not set in constructor) - return getattr(self, '_Container__parent', None) + return getattr(self, '_AbstractContainer__parent', None) @parent.setter def parent(self, parent_container): @@ -107,7 +210,7 @@ def parent(self, parent_container): return if self.parent is not None: - if isinstance(self.parent, Container): + if isinstance(self.parent, AbstractContainer): raise ValueError(('Cannot reassign parent to Container: %s. ' 'Parent is already: %s.' % (repr(self), repr(self.parent)))) else: @@ -129,21 +232,188 @@ def parent(self, parent_container): parent_container.set_modified() -class Data(Container): +class Container(AbstractContainer): + + _pconf_allowed_keys = {'name', 'child', 'required_name', 'doc', 'settable'} + + @classmethod + def _setter(cls, field): + super_setter = AbstractContainer._setter(field) + ret = [super_setter] + if isinstance(field, dict): + for k in field.keys(): + if k not in cls._pconf_allowed_keys: + msg = "Unrecognized key '%s' in __field__ config '%s' on %s" %\ + (k, field['name'], cls.__name__) + raise ValueError(msg) + if field.get('required_name', None) is not None: + name = field['required_name'] + idx1 = len(ret) - 1 + + def container_setter(self, val): + if val is not None and val.name != name: + msg = "%s field on %s must be named '%s'" % (field['name'], self.__class__.__name__, name) + raise ValueError(msg) + ret[idx1](self, val) + + ret.append(container_setter) + if field.get('child', False): + idx2 = len(ret) - 1 + + def container_setter(self, val): + ret[idx2](self, val) + if val is not None: + if isinstance(val, (tuple, list)): + pass + elif isinstance(val, dict): + val = val.values() + else: + val = [val] + for v in val: + if not isinstance(v.parent, Container): + v.parent = self + # else, the ObjectMapper will create a link from self (parent) to v (child with existing + # parent) + + ret.append(container_setter) + return ret[-1] + + def __repr__(self): + cls = self.__class__ + template = "%s %s.%s at 0x%d" % (self.name, cls.__module__, cls.__name__, id(self)) + if len(self.fields): + template += "\nFields:\n" + for k in sorted(self.fields): # sorted to enable tests + v = self.fields[k] + # if isinstance(v, DataIO) or not hasattr(v, '__len__') or len(v) > 0: + if hasattr(v, '__len__'): + if isinstance(v, (np.ndarray, list, tuple)): + if len(v) > 0: + template += " {}: {}\n".format(k, self.__smart_str(v, 1)) + elif v: + template += " {}: {}\n".format(k, self.__smart_str(v, 1)) + else: + template += " {}: {}\n".format(k, v) + return template + + @staticmethod + def __smart_str(v, num_indent): + """ + Print compact string representation of data. + + If v is a list, try to print it using numpy. This will condense the string + representation of datasets with many elements. If that doesn't work, just print the list. 
+ + If v is a dictionary, print the name and type of each element + + If v is a set, print it sorted + + If v is a neurodata_type, print the name of type + + Otherwise, use the built-in str() + Parameters + ---------- + v + + Returns + ------- + str + + """ + + if isinstance(v, list) or isinstance(v, tuple): + if len(v) and isinstance(v[0], AbstractContainer): + return Container.__smart_str_list(v, num_indent, '(') + try: + return str(np.asarray(v)) + except ValueError: + return Container.__smart_str_list(v, num_indent, '(') + elif isinstance(v, dict): + return Container.__smart_str_dict(v, num_indent) + elif isinstance(v, set): + return Container.__smart_str_list(sorted(list(v)), num_indent, '{') + elif isinstance(v, AbstractContainer): + return "{} {}".format(getattr(v, 'name'), type(v)) + else: + return str(v) + + @staticmethod + def __smart_str_list(l, num_indent, left_br): + if left_br == '(': + right_br = ')' + if left_br == '{': + right_br = '}' + if len(l) == 0: + return left_br + ' ' + right_br + indent = num_indent * 2 * ' ' + indent_in = (num_indent + 1) * 2 * ' ' + out = left_br + for v in l[:-1]: + out += '\n' + indent_in + Container.__smart_str(v, num_indent + 1) + ',' + if l: + out += '\n' + indent_in + Container.__smart_str(l[-1], num_indent + 1) + out += '\n' + indent + right_br + return out + + @staticmethod + def __smart_str_dict(d, num_indent): + left_br = '{' + right_br = '}' + if len(d) == 0: + return left_br + ' ' + right_br + indent = num_indent * 2 * ' ' + indent_in = (num_indent + 1) * 2 * ' ' + out = left_br + keys = sorted(list(d.keys())) + for k in keys[:-1]: + out += '\n' + indent_in + Container.__smart_str(k, num_indent + 1) + ' ' + str(type(d[k])) + ',' + if keys: + out += '\n' + indent_in + Container.__smart_str(keys[-1], num_indent + 1) + ' ' + str(type(d[keys[-1]])) + out += '\n' + indent + right_br + return out + + +class Data(AbstractContainer): + + @docval({'name': 'name', 'type': str, 'doc': 'the name of this container'}, + {'name': 'data', 'type': ('array_data', 'data'), 'doc': 'the source of the data'}) + def __init__(self, **kwargs): + call_docval_func(super(Data, self).__init__, kwargs) + self.__data = getargs('data', kwargs) @property - @abstractmethod def data(self): - ''' - The data that is held by this Container - ''' - pass + return self.__data def __bool__(self): - if not hasattr(self.data, '__len__'): - raise NotImplementedError('__bool__ must be implemented when data has no __len__') return len(self.data) != 0 + def __len__(self): + return len(self.__data) + + def __getitem__(self, args): + if isinstance(self.data, (tuple, list)) and isinstance(args, (tuple, list)): + return [self.data[i] for i in args] + return self.data[args] + + def append(self, arg): + if isinstance(self.data, list): + self.data.append(arg) + elif isinstance(self.data, np.ndarray): + self.__data = np.append(self.__data, [arg]) + else: + msg = "Data cannot append to object of type '%s'" % type(self.__data) + raise ValueError(msg) + + def extend(self, arg): + if isinstance(self.data, list): + self.data.extend(arg) + elif isinstance(self.data, np.ndarray): + self.__data = np.append(self.__data, [arg]) + else: + msg = "Data cannot extend object of type '%s'" % type(self.__data) + raise ValueError(msg) + class DataRegion(Data): diff --git a/src/hdmf/data_utils.py b/src/hdmf/data_utils.py index 98445f589..f0ba4d2ca 100644 --- a/src/hdmf/data_utils.py +++ b/src/hdmf/data_utils.py @@ -568,6 +568,9 @@ def __len__(self): raise InvalidDataIOError("Cannot get length of data. 
Data is not valid.") return len(self.data) + def __bool__(self): + return self.valid and len(self) > 0 + def __getattr__(self, attr): """Delegate attribute lookup to data object""" if not self.valid: diff --git a/src/hdmf/spec/namespace.py b/src/hdmf/spec/namespace.py index 3635426cf..615f04e83 100644 --- a/src/hdmf/spec/namespace.py +++ b/src/hdmf/spec/namespace.py @@ -243,6 +243,10 @@ def __copy__(self): ret.__included_sources = copy(self.__included_sources) return ret + def merge(self, ns_catalog): + for name, namespace in ns_catalog.__namespaces.items(): + self.add_namespace(name, namespace) + @property @docval(returns='a tuple of the available namespaces', rtype=tuple) def namespaces(self): @@ -348,11 +352,11 @@ def __load_spec_file(self, reader, spec_source, catalog, dtypes=None, resolve=Tr raise ValueError("spec source '%s' already loaded" % spec_source) def __reg_spec(spec_cls, spec_dict): - dt_def = spec_dict.get(spec_cls.def_key()) + parent_cls = GroupSpec if issubclass(spec_cls, GroupSpec) else DatasetSpec + dt_def = spec_dict.get(spec_cls.def_key(), spec_dict.get(parent_cls.def_key())) if dt_def is None: - msg = 'skipping spec in %s, no %s found' % (spec_source, spec_cls.def_key()) - warn(msg) - return + msg = 'no %s or %s found in spec %s' % (spec_cls.def_key(), parent_cls.def_key(), spec_source) + raise ValueError(msg) if dtypes and dt_def not in dtypes: return if resolve: @@ -415,6 +419,10 @@ def __load_namespace(self, namespace, reader, types_key, resolve=True): for ndt in types: spec = inc_ns.get_spec(ndt) spec_file = inc_ns.catalog.get_spec_source_file(ndt) + if isinstance(spec, DatasetSpec): + spec = self.dataset_spec_cls.build_spec(spec) + else: + spec = self.group_spec_cls.build_spec(spec) catalog.register_spec(spec, spec_file) included_types[s['namespace']] = tuple(types) # construct namespace diff --git a/src/hdmf/spec/spec.py b/src/hdmf/spec/spec.py index 8d93f9a96..38170294b 100644 --- a/src/hdmf/spec/spec.py +++ b/src/hdmf/spec/spec.py @@ -81,7 +81,6 @@ def build_spec(cls, spec_dict): args = list() kwargs = dict() try: - for x in get_docval(cls.__init__): if not x['name'] in vargs: continue diff --git a/tests/unit/build_tests/test_io_map_data.py b/tests/unit/build_tests/test_io_map_data.py index 09f8191f3..87e294edd 100644 --- a/tests/unit/build_tests/test_io_map_data.py +++ b/tests/unit/build_tests/test_io_map_data.py @@ -15,14 +15,9 @@ class Baz(Data): {'name': 'baz_attr', 'type': str, 'doc': 'an attribute'}) def __init__(self, **kwargs): name, data, baz_attr = getargs('name', 'data', 'baz_attr', kwargs) - super(Baz, self).__init__(name=name) - self.__data = data + super(Baz, self).__init__(name=name, data=data) self.__baz_attr = baz_attr - @property - def data(self): - return self.__data - @property def baz_attr(self): return self.__baz_attr diff --git a/tests/unit/common/__init__.py b/tests/unit/common/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/common/base.py b/tests/unit/common/base.py new file mode 100644 index 000000000..3befad0c1 --- /dev/null +++ b/tests/unit/common/base.py @@ -0,0 +1,124 @@ +import unittest2 as unittest +import os +import numpy as np +import h5py +import numpy.testing as npt + +from hdmf.common import validate as common_validate, get_manager +from hdmf.container import Container, Data +from hdmf.backends.hdf5 import HDF5IO + + +class HDMFTestCase(unittest.TestCase): + + def assertContainerEqual(self, container1, container2): # noqa: C901 + ''' + container1 is what was read or generated + 
container2 is what is hardcoded in the TestCase + ''' + type1 = type(container1) + type2 = type(container2) + self.assertEqual(type1, type2) + for nwbfield in container1.__fields__: + with self.subTest(nwbfield=nwbfield, container_type=type1.__name__): + f1 = getattr(container1, nwbfield) + f2 = getattr(container2, nwbfield) + if isinstance(f1, h5py.Dataset): + f1 = f1[()] + if isinstance(f1, (tuple, list, np.ndarray)): + if len(f1) > 0: + if isinstance(f1[0], Container): + for sub1, sub2 in zip(f1, f2): + self.assertContainerEqual(sub1, sub2) + elif isinstance(f1[0], Data): + for sub1, sub2 in zip(f1, f2): + self.assertDataEqual(sub1, sub2) + continue + else: + self.assertEqual(len(f1), len(f2)) + if len(f1) == 0: + continue + if isinstance(f1[0], float): + for v1, v2 in zip(f1, f2): + self.assertAlmostEqual(v1, v2, places=6) + else: + self.assertTrue(np.array_equal(f1, f2)) + elif isinstance(f1, dict) and len(f1) and isinstance(next(iter(f1.values())), Container): + f1_keys = set(f1.keys()) + f2_keys = set(f2.keys()) + self.assertSetEqual(f1_keys, f2_keys) + for k in f1_keys: + with self.subTest(module_name=k): + self.assertContainerEqual(f1[k], f2[k]) + elif isinstance(f1, Container): + self.assertContainerEqual(f1, f2) + elif isinstance(f1, Data) or isinstance(f2, Data): + if isinstance(f1, Data) and isinstance(f2, Data): + self.assertDataEqual(f1, f2) + elif isinstance(f1, Data): + self.assertTrue(np.array_equal(f1.data, f2)) + elif isinstance(f2, Data): + self.assertTrue(np.array_equal(f1.data, f2)) + else: + if isinstance(f1, (float, np.float32, np.float16)): + npt.assert_almost_equal(f1, f2) + else: + self.assertEqual(f1, f2) + + def assertDataEqual(self, data1, data2): + self.assertEqual(type(data1), type(data2)) + self.assertEqual(len(data1), len(data2)) + + +class TestMapRoundTrip(HDMFTestCase): + + def setUpContainer(self): + ''' Should return the Container to build and read/write''' + raise unittest.SkipTest('Cannot run test unless setUpContainer is implemented') + + def setUp(self): + self.container = self.setUpContainer() + self.object_id = self.container.object_id + self.container_type = self.container.__class__.__name__ + self.filename = 'test_%s.h5' % self.container_type + self.writer = None + self.reader = None + + def tearDown(self): + if self.writer is not None: + self.writer.close() + if self.reader is not None: + self.reader.close() + if os.path.exists(self.filename) and os.getenv("CLEAN_HDMF", '1') not in ('0', 'false', 'FALSE', 'False'): + os.remove(self.filename) + + def roundtripContainer(self, cache_spec=False): + self.writer = HDF5IO(self.filename, manager=get_manager(), mode='w') + self.writer.write(self.container, cache_spec=cache_spec) + self.writer.close() + self.reader = HDF5IO(self.filename, manager=get_manager(), mode='r') + try: + return self.reader.read() + except Exception as e: + self.reader.close() + self.reader = None + raise e + + def test_roundtrip(self): + self.read_container = self.roundtripContainer() + # make sure we get a completely new object + self.assertIsNotNone(str(self.container)) # added as a test to make sure printing works + self.assertIsNotNone(str(self.read_container)) + self.assertNotEqual(id(self.container), id(self.read_container)) + self.assertContainerEqual(self.read_container, self.container) + self.reader.close() + self.validate() + + def validate(self): + # validate created file + if os.path.exists(self.filename): + with HDF5IO(self.filename, manager=get_manager(), mode='r') as io: + errors = common_validate(io) + if 
errors: + for err in errors: + raise Exception(err) diff --git a/tests/unit/common/test_sparse.py b/tests/unit/common/test_sparse.py new file mode 100644 index 000000000..dd4d79dcb --- /dev/null +++ b/tests/unit/common/test_sparse.py @@ -0,0 +1,23 @@ +from hdmf.common import CSRMatrix + +from . import base + +import scipy.sparse as sps +import numpy as np + + +class TestCSRMatrix(base.TestMapRoundTrip): + + def setUp(self): + self.data = np.array([1, 2, 3, 4, 5, 6]) + self.indices = np.array([0, 2, 2, 0, 1, 2]) + self.indptr = np.array([0, 2, 3, 6]) + super().setUp() + + def setUpContainer(self): + return CSRMatrix(self.data, self.indices, self.indptr, (3, 3)) + + def test_from_sparse_matrix(self): + sps_mat = sps.csr_matrix((self.data, self.indices, self.indptr), shape=(3, 3)) + csr_container = CSRMatrix(sps_mat) + self.assertContainerEqual(csr_container, self.container) diff --git a/tests/unit/common/test_table.py b/tests/unit/common/test_table.py new file mode 100644 index 000000000..27af57c22 --- /dev/null +++ b/tests/unit/common/test_table.py @@ -0,0 +1,247 @@ +import unittest2 as unittest + +from hdmf.common import DynamicTable, VectorData, ElementIdentifiers, DynamicTableRegion + +from . import base + +import pandas as pd +import numpy as np + + +class TestDynamicTable(unittest.TestCase): + + def setUp(self): + self.spec = [ + {'name': 'foo', 'description': 'foo column'}, + {'name': 'bar', 'description': 'bar column'}, + {'name': 'baz', 'description': 'baz column'}, + ] + self.data = [ + [1, 2, 3, 4, 5], + [10.0, 20.0, 30.0, 40.0, 50.0], + ['cat', 'dog', 'bird', 'fish', 'lizard'] + ] + + def with_table_columns(self): + cols = [VectorData(**d) for d in self.spec] + table = DynamicTable("with_table_columns", 'a test table', columns=cols) + return table + + def with_columns_and_data(self): + columns = [ + VectorData(name=s['name'], description=s['description'], data=d) + for s, d in zip(self.spec, self.data) + ] + return DynamicTable("with_columns_and_data", 'a test table', columns=columns) + + def with_spec(self): + table = DynamicTable("with_spec", 'a test table', columns=self.spec) + return table + + def check_empty_table(self, table): + self.assertIsInstance(table.columns[0], VectorData) + self.assertEqual(len(table.columns), 3) + self.assertEqual(table.colnames, ('foo', 'bar', 'baz')) + + def test_constructor_table_columns(self): + table = self.with_table_columns() + self.assertEqual(table.name, 'with_table_columns') + self.check_empty_table(table) + + def test_constructor_spec(self): + table = self.with_spec() + self.assertEqual(table.name, 'with_spec') + self.check_empty_table(table) + + def check_table(self, table): + self.assertEqual(len(table), 5) + self.assertEqual(table.columns[0].data, [1, 2, 3, 4, 5]) + self.assertEqual(table.columns[1].data, [10.0, 20.0, 30.0, 40.0, 50.0]) + self.assertEqual(table.columns[2].data, ['cat', 'dog', 'bird', 'fish', 'lizard']) + self.assertEqual(table.id.data, [0, 1, 2, 3, 4]) + + def test_constructor_ids_default(self): + columns = [VectorData(name=s['name'], description=s['description'], data=d) + for s, d in zip(self.spec, self.data)] + table = DynamicTable("with_spec", 'a test table', columns=columns) + self.check_table(table) + + def test_constructor_ids(self): + columns = [VectorData(name=s['name'], description=s['description'], data=d) + for s, d in zip(self.spec, self.data)] + table = DynamicTable("with_columns", 'a test table', id=[0, 1, 2, 3, 4], columns=columns) + self.check_table(table) + + def 
test_constructor_ElementIdentifier_ids(self): + columns = [VectorData(name=s['name'], description=s['description'], data=d) + for s, d in zip(self.spec, self.data)] + ids = ElementIdentifiers('ids', [0, 1, 2, 3, 4]) + table = DynamicTable("with_columns", 'a test table', id=ids, columns=columns) + self.check_table(table) + + def test_constructor_ids_bad_ids(self): + columns = [VectorData(name=s['name'], description=s['description'], data=d) + for s, d in zip(self.spec, self.data)] + msg = "must provide same number of ids as length of columns" + with self.assertRaisesRegex(ValueError, msg): + DynamicTable("with_columns", 'a test table', id=[0, 1], columns=columns) + + def add_rows(self, table): + table.add_row({'foo': 1, 'bar': 10.0, 'baz': 'cat'}) + table.add_row({'foo': 2, 'bar': 20.0, 'baz': 'dog'}) + table.add_row({'foo': 3, 'bar': 30.0, 'baz': 'bird'}) + table.add_row({'foo': 4, 'bar': 40.0, 'baz': 'fish'}) + table.add_row({'foo': 5, 'bar': 50.0, 'baz': 'lizard'}) + + def test_add_row(self): + table = self.with_spec() + self.add_rows(table) + self.check_table(table) + + def test_get_item(self): + table = self.with_spec() + self.add_rows(table) + self.check_table(table) + + def test_add_column(self): + table = self.with_spec() + table.add_column(name='qux', description='qux column') + self.assertEqual(table.colnames, ('foo', 'bar', 'baz', 'qux')) + + def test_getitem_row_num(self): + table = self.with_spec() + self.add_rows(table) + row = table[2] + self.assertEqual(row[0], 2) + self.assertEqual(row[1], 3) + self.assertEqual(row[2], 30.0) + self.assertEqual(row[3], 'bird') + + def test_getitem_column(self): + table = self.with_spec() + self.add_rows(table) + col = table['bar'] + self.assertEqual(col[0], 10.0) + self.assertEqual(col[1], 20.0) + self.assertEqual(col[2], 30.0) + self.assertEqual(col[3], 40.0) + self.assertEqual(col[4], 50.0) + + def test_getitem_list_idx(self): + table = self.with_spec() + self.add_rows(table) + row = table[[0, 2, 4]] + self.assertEqual(len(row), 3) + self.assertEqual(row[0], (0, 1, 10.0, 'cat')) + self.assertEqual(row[1], (2, 3, 30.0, 'bird')) + self.assertEqual(row[2], (4, 5, 50.0, 'lizard')) + + def test_getitem_point_idx_colname(self): + table = self.with_spec() + self.add_rows(table) + val = table[2, 'bar'] + self.assertEqual(val, 30.0) + + def test_getitem_point_idx_colidx(self): + table = self.with_spec() + self.add_rows(table) + val = table[2, 2] + self.assertEqual(val, 30.0) + + def test_pandas_roundtrip(self): + df = pd.DataFrame({ + 'a': [1, 2, 3, 4], + 'b': ['a', 'b', 'c', '4'] + }, index=pd.Index(name='an_index', data=[2, 4, 6, 8])) + + table = DynamicTable.from_dataframe(df, 'foo') + obtained = table.to_dataframe() + + assert df.equals(obtained) + + def test_to_dataframe(self): + table = self.with_columns_and_data() + expected_df = pd.DataFrame({ + 'foo': [1, 2, 3, 4, 5], + 'bar': [10.0, 20.0, 30.0, 40.0, 50.0], + 'baz': ['cat', 'dog', 'bird', 'fish', 'lizard'] + }) + obtained_df = table.to_dataframe() + assert expected_df.equals(obtained_df) + + def test_from_dataframe(self): + df = pd.DataFrame({ + 'foo': [1, 2, 3, 4, 5], + 'bar': [10.0, 20.0, 30.0, 40.0, 50.0], + 'baz': ['cat', 'dog', 'bird', 'fish', 'lizard'] + }).loc[:, ('foo', 'bar', 'baz')] + + obtained_table = DynamicTable.from_dataframe(df, 'test') + self.check_table(obtained_table) + + def test_missing_columns(self): + table = self.with_spec() + + with self.assertRaises(ValueError): + table.add_row({'bar': 60.0, 'foo': [6]}, None) + + def test_extra_columns(self): + table = 
self.with_spec() + + with self.assertRaises(ValueError): + table.add_row({'bar': 60.0, 'foo': 6, 'baz': 'oryx', 'qax': -1}, None) + + def test_indexed_dynamic_table_region(self): + table = self.with_columns_and_data() + + dynamic_table_region = DynamicTableRegion('dtr', [0, 1, 1], 'desc', table=table) + fetch_ids = [x[1] for x in dynamic_table_region[:3]] + self.assertEqual(fetch_ids, [1, 2, 2]) + + def test_dynamic_table_iteration(self): + table = self.with_columns_and_data() + + dynamic_table_region = DynamicTableRegion('dtr', [0, 1, 2, 3, 4], 'desc', table=table) + for ii, item in enumerate(dynamic_table_region): + self.assertEqual(table[ii], item) + + def test_nd_array_to_df(self): + data = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]]) + col = VectorData(name='name', description='desc', data=data) + df = DynamicTable('test', 'desc', np.arange(3, dtype='int'), (col, )).to_dataframe() + df2 = pd.DataFrame({'name': [x for x in data]}, + index=pd.Index(name='id', data=[0, 1, 2])) + pd.testing.assert_frame_equal(df, df2) + + +class TestDynamicTableRoundTrip(base.TestMapRoundTrip): + + def setUpContainer(self): + # this will get ignored + table = DynamicTable('table0', 'an example table') + table.add_column('foo', 'an int column') + table.add_column('bar', 'a float column') + table.add_column('baz', 'a string column') + table.add_column('qux', 'a boolean column') + table.add_row(foo=27, bar=28.0, baz="cat", qux=True) + table.add_row(foo=37, bar=38.0, baz="dog", qux=False) + return table + + def test_from_dataframe(self): + # this will get ignored + expected = DynamicTable('test_table', 'the expected table') + expected.add_column('a', '2d column') + expected.add_column('b', '1d column') + expected.add_row(a=[1, 2, 3], b='4') + expected.add_row(a=[1, 2, 3], b='5') + expected.add_row(a=[1, 2, 3], b='6') + + coldesc = {'a': '2d column', 'b': '1d column'} + + received = DynamicTable.from_dataframe(pd.DataFrame({ + 'a': [[1, 2, 3], + [1, 2, 3], + [1, 2, 3]], + 'b': ['4', '5', '6'] + }), 'test_table', table_description='the expected table', column_descriptions=coldesc) + self.assertContainerEqual(expected, received) diff --git a/tests/unit/test_container.py b/tests/unit/test_container.py index 27452d24c..cee76be05 100644 --- a/tests/unit/test_container.py +++ b/tests/unit/test_container.py @@ -1,6 +1,6 @@ import unittest2 as unittest -from hdmf.container import Container, Data +from hdmf.container import AbstractContainer, Container, Data class Subcontainer(Container): @@ -112,22 +112,11 @@ def test_reassign_container_source(self): def test_repr(self): parent_obj = Container('obj1') - self.assertRegex(str(parent_obj), r"") + self.assertRegex(str(parent_obj), r"obj1 hdmf.container.Container at 0x\d+") def test_type_hierarchy(self): - self.assertEqual(Container.type_hierarchy(), (Container, object)) - self.assertEqual(Subcontainer.type_hierarchy(), (Subcontainer, Container, object)) - - -class SubData(Data): - - def __init__(self, name, data): - super(SubData, self).__init__(name=name) - self.__data = data - - @property - def data(self): - return self.__data + self.assertEqual(Container.type_hierarchy(), (Container, AbstractContainer, object)) + self.assertEqual(Subcontainer.type_hierarchy(), (Subcontainer, Container, AbstractContainer, object)) class TestData(unittest.TestCase): @@ -135,26 +124,15 @@ class TestData(unittest.TestCase): def test_bool_true(self): """Test that __bool__ method works correctly on data with len """ - data_obj = SubData('my_data', [1, 2, 3, 4, 5]) + data_obj = Data('my_data', 
[1, 2, 3, 4, 5]) self.assertTrue(data_obj) def test_bool_false(self): """Test that __bool__ method works correctly on empty data """ - data_obj = SubData('my_data', '') + data_obj = Data('my_data', []) self.assertFalse(data_obj) - data_obj = SubData('my_data', []) - self.assertFalse(data_obj) - - def test_bool_no_len(self): - """Test that__bool__ method works correctly on data with no len - """ - data_obj = SubData('my_data', Container('')) - err_msg = '__bool__ must be implemented when data has no __len__' - with self.assertRaisesRegex(NotImplementedError, err_msg): - bool(data_obj) - if __name__ == '__main__': unittest.main()
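
For reference, a minimal usage sketch of the DynamicTable API exercised by the new tests in tests/unit/common/test_table.py. The table and column names here are made up for illustration; the calls themselves mirror the tests above.

    import pandas as pd
    from hdmf.common import DynamicTable, DynamicTableRegion

    # build a table column-by-column, then row-by-row
    table = DynamicTable('electrodes', 'an example table')
    table.add_column('impedance', 'impedance of the channel, in ohms')
    table.add_column('location', 'brain region of the channel')
    table.add_row(impedance=1.2e6, location='CA1')
    table.add_row(impedance=0.9e6, location='CA3')

    row = table[0]                # row as a tuple, with the id first
    col = table['impedance']      # the VectorData column
    cell = table[1, 'location']   # a single cell -> 'CA3'

    # a DynamicTableRegion selects rows of an existing table
    region = table.create_region('shank0', [0, 1], 'channels on shank 0')
    assert isinstance(region, DynamicTableRegion)

    # round-trip through pandas
    df = table.to_dataframe()
    table2 = DynamicTable.from_dataframe(df, 'electrodes_copy',
                                         table_description='copy of the electrodes table')

And a sketch of the __fields__ machinery that AbstractContainer.__gather_fields now provides to subclasses. Bucket and its field are hypothetical names used only to show the generated properties; they are not part of this patch.

    from hdmf.container import Container
    from hdmf.utils import docval, getargs, call_docval_func

    class Bucket(Container):
        # each entry becomes a read-only-after-set property backed by self.fields
        __fields__ = (
            {'name': 'description', 'doc': 'a description of this bucket'},
        )

        @docval({'name': 'name', 'type': str, 'doc': 'the name of this bucket'},
                {'name': 'description', 'type': str, 'doc': 'a description', 'default': None})
        def __init__(self, **kwargs):
            call_docval_func(super().__init__, kwargs)
            self.description = getargs('description', kwargs)

    b = Bucket('b1', description='an example')
    print(b.description)       # getter generated by __gather_fields
    # b.description = 'other'  # would raise AttributeError: a field may only be set once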