From e9c273774b3367c3e6acb7b538e696a60e575d31 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Tue, 12 Mar 2024 22:11:58 +0000 Subject: [PATCH 01/13] Add docs on spec language support (#1069) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CHANGELOG.md | 5 +++++ docs/source/index.rst | 1 + docs/source/spec_language_support.rst | 21 +++++++++++++++++++++ 3 files changed, 27 insertions(+) create mode 100644 docs/source/spec_language_support.rst diff --git a/CHANGELOG.md b/CHANGELOG.md index 1294aee02..cd7b69fcb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # HDMF Changelog +## HDMF 3.13.0 (Upcoming) + +### Enhancements +- Added docs page that lists limitations of support for the HDMF specification language. @rly [#1069](https://github.com/hdmf-dev/hdmf/pull/1069) + ## HDMF 3.12.2 (February 9, 2024) ### Bug fixes diff --git a/docs/source/index.rst b/docs/source/index.rst index e6a53d3ab..2fcd4778a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -45,6 +45,7 @@ If you use HDMF in your research, please use the following citation: building_api export validation + spec_language_support .. toctree:: :hidden: diff --git a/docs/source/spec_language_support.rst b/docs/source/spec_language_support.rst new file mode 100644 index 000000000..43a628093 --- /dev/null +++ b/docs/source/spec_language_support.rst @@ -0,0 +1,21 @@ + +.. _spec_language_support: + +=========================================== +Support for the HDMF Specification Language +=========================================== + +The HDMF API provides nearly full support for all features of the `HDMF Specification Language`_ +version 3.0.0, except for the following: + +1. Attributes containing multiple references (see `#833`_) +2. Certain text and integer values for quantity (see `#423`_, `#531`_) +3. Datasets that do not have a data_type_inc/data_type_def and contain either a reference dtype or a compound dtype (see `#737`_) +4. Passing dataset dtype and shape from parent data type to child data type (see `#320`_) + +.. _HDMF Specification Language: https://hdmf-schema-language.readthedocs.io +.. _#833: https://github.com/hdmf-dev/hdmf/issues/833 +.. _#423: https://github.com/hdmf-dev/hdmf/issues/423 +.. _#531: https://github.com/hdmf-dev/hdmf/issues/531 +.. _#737: https://github.com/hdmf-dev/hdmf/issues/737 +.. 
_#320: https://github.com/hdmf-dev/hdmf/issues/320 From f092cbbe5d5f110246c8b7518e118add26aa5203 Mon Sep 17 00:00:00 2001 From: Steph Prince <40640337+stephprince@users.noreply.github.com> Date: Thu, 14 Mar 2024 09:16:22 -0700 Subject: [PATCH 02/13] Warn when adding ragged arrays to DynamicTable without index argument (#1066) * add detection of ragged array inputs to table * add tests for ragged array inputs to table * add warnings for ragged inputs to table * update CHANGELOG.md * check only lists and tuples for raggedness * add flag to turn off ragged data checks * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CHANGELOG.md | 1 + src/hdmf/common/table.py | 26 +++++++++++-- src/hdmf/utils.py | 14 +++++++ tests/unit/common/test_table.py | 68 +++++++++++++++++++++++++++++++++ 4 files changed, 106 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cd7b69fcb..8eddf8270 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Enhancements - Added docs page that lists limitations of support for the HDMF specification language. @rly [#1069](https://github.com/hdmf-dev/hdmf/pull/1069) +- Added warning when using `add_row` or `add_column` to add a ragged array to `DynamicTable` without an index parameter. @stephprince [#1066](https://github.com/hdmf-dev/hdmf/pull/1066) ## HDMF 3.12.2 (February 9, 2024) diff --git a/src/hdmf/common/table.py b/src/hdmf/common/table.py index 5eeedcd86..3b67ff19d 100644 --- a/src/hdmf/common/table.py +++ b/src/hdmf/common/table.py @@ -15,7 +15,7 @@ from . import register_class, EXP_NAMESPACE from ..container import Container, Data from ..data_utils import DataIO, AbstractDataChunkIterator -from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type +from ..utils import docval, getargs, ExtenderMeta, popargs, pystr, AllowPositional, check_type, is_ragged from ..term_set import TermSetWrapper @@ -639,12 +639,16 @@ def __len__(self): {'name': 'id', 'type': int, 'doc': 'the ID for the row', 'default': None}, {'name': 'enforce_unique_id', 'type': bool, 'doc': 'enforce that the id in the table must be unique', 'default': False}, + {'name': 'check_ragged', 'type': bool, 'default': True, + 'doc': ('whether or not to check for ragged arrays when adding data to the table. ' + 'Set to False to avoid checking every element if performance issues occur.')}, allow_extra=True) def add_row(self, **kwargs): """ Add a row to the table. If *id* is not provided, it will auto-increment. """ - data, row_id, enforce_unique_id = popargs('data', 'id', 'enforce_unique_id', kwargs) + data, row_id, enforce_unique_id, check_ragged = popargs('data', 'id', 'enforce_unique_id', 'check_ragged', + kwargs) data = data if data is not None else kwargs bad_data = [] @@ -709,6 +713,11 @@ def add_row(self, **kwargs): c.add_vector(data[colname]) else: c.add_row(data[colname]) + if check_ragged and is_ragged(c.data): + warn(("Data has elements with different lengths and therefore cannot be coerced into an " + "N-dimensional array. Use the 'index' argument when creating a column to add rows " + "with different lengths."), + stacklevel=2) def __eq__(self, other): """Compare if the two DynamicTables contain the same data. @@ -748,6 +757,9 @@ def __eq__(self, other): 'doc': ('class to use to represent the column data. 
If table=True, this field is ignored and a ' 'DynamicTableRegion object is used. If enum=True, this field is ignored and a EnumData ' 'object is used.')}, + {'name': 'check_ragged', 'type': bool, 'default': True, + 'doc': ('whether or not to check for ragged arrays when adding data to the table. ' + 'Set to False to avoid checking every element if performance issues occur.')}, allow_extra=True) def add_column(self, **kwargs): # noqa: C901 """ @@ -760,7 +772,7 @@ def add_column(self, **kwargs): # noqa: C901 :raises ValueError: if the column has already been added to the table """ name, data = getargs('name', 'data', kwargs) - index, table, enum, col_cls= popargs('index', 'table', 'enum', 'col_cls', kwargs) + index, table, enum, col_cls, check_ragged = popargs('index', 'table', 'enum', 'col_cls', 'check_ragged', kwargs) if isinstance(index, VectorIndex): warn("Passing a VectorIndex in for index may lead to unexpected behavior. This functionality will be " @@ -823,6 +835,14 @@ def add_column(self, **kwargs): # noqa: C901 # once we have created the column create_vector_index = None if ckwargs.get('data', None) is not None: + + # if no index was provided, check that data is not ragged + if index is False and check_ragged and is_ragged(data): + warn(("Data has elements with different lengths and therefore cannot be coerced into an " + "N-dimensional array. Use the 'index' argument when adding a column of data with " + "different lengths."), + stacklevel=2) + # Check that we are asked to create an index if (isinstance(index, bool) or isinstance(index, int)) and index > 0 and len(data) > 0: # Iteratively flatten the data we use for the column based on the depth of the index to generate. diff --git a/src/hdmf/utils.py b/src/hdmf/utils.py index 57a4bb465..5e0b61539 100644 --- a/src/hdmf/utils.py +++ b/src/hdmf/utils.py @@ -954,6 +954,20 @@ def to_uint_array(arr): raise ValueError('Cannot convert array of dtype %s to uint.' % arr.dtype) +def is_ragged(data): + """ + Test whether a list of lists or array is ragged / jagged + """ + if isinstance(data, (list, tuple)): + lengths = [len(sub_data) if isinstance(sub_data, (list, tuple)) else 1 for sub_data in data] + if len(set(lengths)) > 1: + return True # ragged at this level + + return any(is_ragged(sub_data) for sub_data in data) # check next level + + return False + + class LabelledDict(dict): """A dict wrapper that allows querying by an attribute of the values and running a callable on removed items. diff --git a/tests/unit/common/test_table.py b/tests/unit/common/test_table.py index 7246a8ba8..d98add060 100644 --- a/tests/unit/common/test_table.py +++ b/tests/unit/common/test_table.py @@ -354,6 +354,74 @@ def test_add_column_multi_index(self): ] ) + def test_add_column_without_required_index(self): + """ + Add a column with different element lengths without specifying an index parameter + """ + table = self.with_spec() + table.add_row(foo=5, bar=50.0, baz='lizard') + table.add_row(foo=5, bar=50.0, baz='lizard') + + # testing adding column without a necessary index parameter + lol_data = [[1, 2, 3], [1, 2, 3, 4]] + str_data = [['a', 'b'], ['a', 'b', 'c']] + empty_data = [[1, 2], []] + multi_nested_data = [[[1, 2, 3], [1, 2, 3, 4]], [1, 2]] + tuple_data = ((1, 2, 3), (1, 2, 3, 4)) + + msg = ("Data has elements with different lengths and therefore cannot be coerced into an N-dimensional " + "array. 
Use the 'index' argument when adding a column of data with different lengths.") + with self.assertWarnsWith(UserWarning, msg): + table.add_column(name='col1', description='', data=lol_data,) + with self.assertWarnsWith(UserWarning, msg): + table.add_column(name='col2', description='', data=str_data,) + with self.assertWarnsWith(UserWarning, msg): + table.add_column(name='col3', description='', data=empty_data,) + with self.assertWarnsWith(UserWarning, msg): + table.add_column(name='col4', description='', data=multi_nested_data,) + with self.assertWarnsWith(UserWarning, msg): + table.add_column(name='col5', description='', data=tuple_data,) + + def test_add_column_without_required_index_and_no_ragged_check(self): + """ + Add a column with different element lengths without checking for raggedness + """ + lol_data = [[1, 2, 3], [1, 2, 3, 4]] + table = self.with_spec() + table.add_row(foo=5, bar=50.0, baz='lizard') + table.add_row(foo=5, bar=50.0, baz='lizard') + table.add_column(name='col1', description='', data=lol_data, check_ragged=False) + + def test_add_row_without_required_index(self): + """ + Add rows with different element lengths without specifying an index parameter + """ + + # test adding row of list data with different lengths without index parameter + msg = ("Data has elements with different lengths and therefore cannot be coerced into an N-dimensional " + "array. Use the 'index' argument when creating a column to add rows with different lengths.") + table = self.with_spec() + table.add_column(name='qux', description='qux column') + table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3]) + with self.assertWarnsWith(UserWarning, msg): + table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3 ,4]) + + # test adding row of tuple/str data with different lengths without index parameter + table = self.with_spec() + table.add_column(name='qux', description='qux column') + table.add_row(foo=5, bar=50.0, baz='lizard', qux=('a', 'b')) + with self.assertWarnsWith(UserWarning, msg): + table.add_row(foo=5, bar=50.0, baz='lizard', qux=('a', 'b', 'c')) + + def test_add_row_without_required_index_and_no_ragged_check(self): + """ + Add rows with different element lengths without checking for raggedness + """ + table = self.with_spec() + table.add_column(name='qux', description='qux column') + table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3]) + table.add_row(foo=5, bar=50.0, baz='lizard', qux=[1, 2, 3 ,4], check_ragged=False) + def test_add_column_auto_index_int(self): """ Add a column as a list of lists after we have already added data so that we need to create a single VectorIndex From ab18840f3e7014b19deb896e3e3d96df875eff5a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Mar 2024 22:38:20 +0000 Subject: [PATCH 03/13] [pre-commit.ci] pre-commit autoupdate (#1056) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Ryan Ly --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ad798b8f7..d48d8e48d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,7 @@ repos: # hooks: # - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.2.0 + rev: v0.3.2 hooks: - id: ruff # - repo: https://github.com/econchick/interrogate From 437990d1cedc71c34fa20f142a786643677f5f3e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" 
<49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 18 Mar 2024 22:44:48 +0000 Subject: [PATCH 04/13] Bump actions/add-to-project from 0.5.0 to 0.6.0 (#1063) Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Ryan Ly --- .github/workflows/project_action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/project_action.yml b/.github/workflows/project_action.yml index bfca0b3f5..5f13d9540 100644 --- a/.github/workflows/project_action.yml +++ b/.github/workflows/project_action.yml @@ -20,7 +20,7 @@ jobs: - name: Add to Developer Board env: TOKEN: ${{ steps.generate_token.outputs.token }} - uses: actions/add-to-project@v0.5.0 + uses: actions/add-to-project@v0.6.0 with: project-url: https://github.com/orgs/hdmf-dev/projects/7 github-token: ${{ env.TOKEN }} @@ -28,7 +28,7 @@ jobs: - name: Add to Community Board env: TOKEN: ${{ steps.generate_token.outputs.token }} - uses: actions/add-to-project@v0.5.0 + uses: actions/add-to-project@v0.6.0 with: project-url: https://github.com/orgs/hdmf-dev/projects/8 github-token: ${{ env.TOKEN }} From bd3e150238987468a9c479b9aea31df5837cadc2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 Mar 2024 16:00:47 -0700 Subject: [PATCH 05/13] [pre-commit.ci] pre-commit autoupdate (#1071) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d48d8e48d..786a3e4b7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,7 @@ repos: # hooks: # - id: black - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.2 + rev: v0.3.3 hooks: - id: ruff # - repo: https://github.com/econchick/interrogate From c79d2384196fc2b92d38d44725313c31ffce71a2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 20 Mar 2024 17:34:06 +0000 Subject: [PATCH 06/13] Bump black from 23.10.1 to 24.3.0 (#1075) Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index f61962728..1d856e4e7 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,7 +2,7 @@ # compute coverage, and create test environments. note that depending on the version of python installed, different # versions of requirements may be installed due to package incompatibilities. 
# -black==23.10.1 +black==24.3.0 codespell==2.2.6 coverage==7.3.2 pre-commit==3.5.0 From a6f51ff6083914579f1e44e9f47a122c3df950ed Mon Sep 17 00:00:00 2001 From: Matthew Avaylon Date: Wed, 20 Mar 2024 12:27:48 -0700 Subject: [PATCH 07/13] Release 3.13 (#1074) * Update CHANGELOG.md * Update release.md * updates * Update .github/PULL_REQUEST_TEMPLATE/release.md Co-authored-by: Steph Prince <40640337+stephprince@users.noreply.github.com> * Update pyproject.toml --------- Co-authored-by: Steph Prince <40640337+stephprince@users.noreply.github.com> --- .github/PULL_REQUEST_TEMPLATE/release.md | 2 +- CHANGELOG.md | 2 +- pyproject.toml | 1 + requirements-opt.txt | 12 ++++++------ src/hdmf/common/hdmf-common-schema | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE/release.md b/.github/PULL_REQUEST_TEMPLATE/release.md index 11bd20bfa..7c5ff5ece 100644 --- a/.github/PULL_REQUEST_TEMPLATE/release.md +++ b/.github/PULL_REQUEST_TEMPLATE/release.md @@ -10,7 +10,7 @@ Prepare for release of HDMF [version] and any other locations as needed - [ ] Update `pyproject.toml` as needed - [ ] Update `README.rst` as needed -- [ ] Update `src/hdmf/common/hdmf-common-schema` submodule as needed. Check the version number and commit SHA manually +- [ ] Update `src/hdmf/common/hdmf-common-schema` submodule as needed. Check the version number and commit SHA manually. Make sure we are using the latest release and not the latest commit on the `main` branch. - [ ] Update changelog (set release date) in `CHANGELOG.md` and any other docs as needed - [ ] Run tests locally including gallery tests, and inspect all warnings and outputs (`pytest && python test_gallery.py`) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8eddf8270..fb7a71e00 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # HDMF Changelog -## HDMF 3.13.0 (Upcoming) +## HDMF 3.13.0 (March 20, 2024) ### Enhancements - Added docs page that lists limitations of support for the HDMF specification language. @rly [#1069](https://github.com/hdmf-dev/hdmf/pull/1069) diff --git a/pyproject.toml b/pyproject.toml index ee8037be5..b60ae6943 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "License :: OSI Approved :: BSD License", "Development Status :: 5 - Production/Stable", "Operating System :: OS Independent", diff --git a/requirements-opt.txt b/requirements-opt.txt index 644fc80be..11cd23e17 100644 --- a/requirements-opt.txt +++ b/requirements-opt.txt @@ -1,8 +1,8 @@ # pinned dependencies that are optional. 
used to reproduce an entire development environment to use HDMF -tqdm==4.66.1 -zarr==2.16.1 -linkml-runtime==1.6.0; python_version >= "3.9" -schemasheets==0.1.24; python_version >= "3.9" -oaklib==0.5.20; python_version >= "3.9" -pydantic==1.10.13 # linkml-runtime 1.6.0 and related packages require pydantic<2 +tqdm==4.66.2 +zarr==2.17.1 +linkml-runtime==1.7.3; python_version >= "3.9" +schemasheets==0.2.1; python_version >= "3.9" +oaklib==0.5.31; python_version >= "3.9" +pydantic==2.6.4 # linkml-runtime 1.6.0 and related packages require pydantic<2 pyyaml==6.0.1; python_version >= "3.9" diff --git a/src/hdmf/common/hdmf-common-schema b/src/hdmf/common/hdmf-common-schema index 4d2ddd638..5b4cbb31d 160000 --- a/src/hdmf/common/hdmf-common-schema +++ b/src/hdmf/common/hdmf-common-schema @@ -1 +1 @@ -Subproject commit 4d2ddd6387c4e36f21f41964fe8873c083680b15 +Subproject commit 5b4cbb31dbafcff51ca70bf218f464b186568151 From 67b82621296debf8202a23da5fca0a5bc9d6b9cf Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Wed, 20 Mar 2024 13:36:02 -0700 Subject: [PATCH 08/13] Remove outdated comment in requirements-opt.txt (#1076) --- requirements-opt.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-opt.txt b/requirements-opt.txt index 11cd23e17..6b4e102f1 100644 --- a/requirements-opt.txt +++ b/requirements-opt.txt @@ -4,5 +4,5 @@ zarr==2.17.1 linkml-runtime==1.7.3; python_version >= "3.9" schemasheets==0.2.1; python_version >= "3.9" oaklib==0.5.31; python_version >= "3.9" -pydantic==2.6.4 # linkml-runtime 1.6.0 and related packages require pydantic<2 +pydantic==2.6.4 pyyaml==6.0.1; python_version >= "3.9" From 5c8506216995f995b891da1e6b596ee42b7dd948 Mon Sep 17 00:00:00 2001 From: Matthew Avaylon Date: Wed, 20 Mar 2024 16:16:43 -0700 Subject: [PATCH 09/13] Unwrap TermSetWrapper in ObjectMapper and not the IO (#1070) * Unwrap TermSetWrapper in ObjectMapper and no the IO * ruff * test * test * test * Update builders.py * Update manager.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update CHANGELOG.md * revert * test * test * test * test * tesT * tesT * tesT * tesT * move --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Ryan Ly --- CHANGELOG.md | 1 + src/hdmf/backends/hdf5/h5tools.py | 5 ---- src/hdmf/build/objectmapper.py | 6 ++++- .../build_tests/mapper_tests/test_build.py | 26 ++++++++++++++++++- 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fb7a71e00..f35a06cd1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## HDMF 3.13.0 (March 20, 2024) ### Enhancements +- Unwrap `TermSetWrapper` within the builder to support different backends more efficiently. @mavaylon1 [#1070](https://github.com/hdmf-dev/hdmf/pull/1070) - Added docs page that lists limitations of support for the HDMF specification language. @rly [#1069](https://github.com/hdmf-dev/hdmf/pull/1069) - Added warning when using `add_row` or `add_column` to add a ragged array to `DynamicTable` without an index parameter. 
@stephprince [#1066](https://github.com/hdmf-dev/hdmf/pull/1066) diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index 7a644f0b7..05ce36e13 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -17,7 +17,6 @@ from ...build import (Builder, GroupBuilder, DatasetBuilder, LinkBuilder, BuildManager, RegionBuilder, ReferenceBuilder, TypeMap, ObjectMapper) from ...container import Container -from ...term_set import TermSetWrapper from ...data_utils import AbstractDataChunkIterator from ...spec import RefSpec, DtypeSpec, NamespaceCatalog from ...utils import docval, getargs, popargs, get_data_shape, get_docval, StrDataset @@ -1103,10 +1102,6 @@ def write_dataset(self, **kwargs): # noqa: C901 data = data.data else: options['io_settings'] = {} - if isinstance(data, TermSetWrapper): - # This is for when the wrapped item is a dataset - # (refer to objectmapper.py for wrapped attributes) - data = data.value attributes = builder.attributes options['dtype'] = builder.dtype dset = None diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index b8e50d104..fed678d41 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -752,7 +752,11 @@ def build(self, **kwargs): % (container.__class__.__name__, container.name, repr(source))) try: # use spec_dtype from self.spec when spec_ext does not specify dtype - bldr_data, dtype = self.convert_dtype(spec, container.data, spec_dtype=spec_dtype) + if isinstance(container.data, TermSetWrapper): + data = container.data.value + else: + data = container.data + bldr_data, dtype = self.convert_dtype(spec, data, spec_dtype=spec_dtype) except Exception as ex: msg = 'could not resolve dtype for %s \'%s\'' % (type(container).__name__, container.name) raise Exception(msg) from ex diff --git a/tests/unit/build_tests/mapper_tests/test_build.py b/tests/unit/build_tests/mapper_tests/test_build.py index 8590f29f2..b90ad6f1a 100644 --- a/tests/unit/build_tests/mapper_tests/test_build.py +++ b/tests/unit/build_tests/mapper_tests/test_build.py @@ -1,7 +1,8 @@ from abc import ABCMeta, abstractmethod import numpy as np -from hdmf import Container, Data +from hdmf import Container, Data, TermSet, TermSetWrapper +from hdmf.common import VectorData, get_type_map from hdmf.build import ObjectMapper, BuildManager, TypeMap, GroupBuilder, DatasetBuilder from hdmf.build.warnings import DtypeConversionWarning from hdmf.spec import GroupSpec, AttributeSpec, DatasetSpec, SpecCatalog, SpecNamespace, NamespaceCatalog, Spec @@ -10,6 +11,29 @@ from tests.unit.helpers.utils import CORE_NAMESPACE +try: + import linkml_runtime # noqa: F401 + LINKML_INSTALLED = True +except ImportError: + LINKML_INSTALLED = False + + +class TestUnwrapTermSetWrapperBuild(TestCase): + """ + Test the unwrapping of TermSetWrapper on regular datasets within build. + """ + def setUp(self): + if not LINKML_INSTALLED: + self.skipTest("optional LinkML module is not installed") + + def test_unwrap(self): + manager = BuildManager(get_type_map()) + terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') + build = manager.build(VectorData(name='test_data', + description='description', + data=TermSetWrapper(value=['Homo sapiens'], termset= terms))) + + self.assertEqual(build.data, ['Homo sapiens']) # TODO: test build of extended group/dataset that modifies an attribute dtype (commented out below), shape, value, etc. # by restriction. also check that attributes cannot be deleted or scope expanded. 
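The change above can be exercised directly: a TermSetWrapper validates values against its TermSet when data is written, and with this patch the ObjectMapper unwraps the wrapper during build, so every backend receives the plain value without backend-specific handling. A minimal sketch mirroring the new test, assuming the example_test_term_set.yaml fixture from the test suite and an installed linkml-runtime:

    from hdmf import TermSet, TermSetWrapper
    from hdmf.build import BuildManager
    from hdmf.common import VectorData, get_type_map

    terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml')

    # Wrapping validates on write; appending a value outside the term set
    # (e.g. data.append('bad_data')) raises a ValueError.
    data = VectorData(name='test_data', description='description',
                      data=TermSetWrapper(value=['Homo sapiens'], termset=terms))

    # During build, the ObjectMapper unwraps the TermSetWrapper, so the
    # resulting builder holds the plain list rather than the wrapper.
    manager = BuildManager(get_type_map())
    builder = manager.build(data)
    assert builder.data == ['Homo sapiens']

Because unwrapping now happens once in the ObjectMapper instead of in each IO backend, the HDF5-specific TermSetWrapper branch removed from h5tools.py above is no longer needed, and other backends get the same behavior without duplicating that logic.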
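Patch 02 earlier in this series adds the is_ragged helper and the DynamicTable warnings; since its diff runs through several files, here is a condensed sketch of the resulting behavior, based on the tests that patch adds (the table and column names are illustrative):

    from hdmf.common import DynamicTable
    from hdmf.utils import is_ragged

    # is_ragged returns True when nested lists/tuples have unequal lengths
    # at any nesting level, i.e. when the data cannot be coerced into an
    # N-dimensional array.
    assert is_ragged([[1, 2, 3], [1, 2, 3, 4]])
    assert is_ragged([[[1, 2], [1]], [[3, 4], [5, 6]]])  # ragged one level down
    assert not is_ragged([[1, 2], [3, 4]])               # rectangular data

    table = DynamicTable(name='demo', description='ragged data demo')
    table.add_column(name='qux', description='qux column')
    table.add_row(qux=[1, 2, 3])
    table.add_row(qux=[1, 2, 3, 4])                # UserWarning: use 'index'
    table.add_row(qux=[1, 2], check_ragged=False)  # per-element check skipped

A warning (rather than an error) keeps existing code running while pointing users at the index argument, and check_ragged=False skips the per-element scan when writing large data that is already known to be well-formed.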
From 2c03baa50308061ae843d403623bfca5b6a6d84f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 25 Mar 2024 09:48:12 -0700 Subject: [PATCH 10/13] Bump actions/add-to-project from 0.6.0 to 0.6.1 (#1078) Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/project_action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/project_action.yml b/.github/workflows/project_action.yml index 5f13d9540..5d141d1d1 100644 --- a/.github/workflows/project_action.yml +++ b/.github/workflows/project_action.yml @@ -20,7 +20,7 @@ jobs: - name: Add to Developer Board env: TOKEN: ${{ steps.generate_token.outputs.token }} - uses: actions/add-to-project@v0.6.0 + uses: actions/add-to-project@v0.6.1 with: project-url: https://github.com/orgs/hdmf-dev/projects/7 github-token: ${{ env.TOKEN }} @@ -28,7 +28,7 @@ jobs: - name: Add to Community Board env: TOKEN: ${{ steps.generate_token.outputs.token }} - uses: actions/add-to-project@v0.6.0 + uses: actions/add-to-project@v0.6.1 with: project-url: https://github.com/orgs/hdmf-dev/projects/8 github-token: ${{ env.TOKEN }} From 000020232cbf6dcfb053c4d57d984445c2eaa0e7 Mon Sep 17 00:00:00 2001 From: Ryan Ly Date: Mon, 25 Mar 2024 10:16:36 -0700 Subject: [PATCH 11/13] Update GitHub release checklist (#1080) * Update release.md - Add a step to manually run all tests before merging - Minor updates to other steps * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update release.md --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Matthew Avaylon --- .github/PULL_REQUEST_TEMPLATE/release.md | 26 +++++++++++++++--------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE/release.md b/.github/PULL_REQUEST_TEMPLATE/release.md index 7c5ff5ece..86a7ad57d 100644 --- a/.github/PULL_REQUEST_TEMPLATE/release.md +++ b/.github/PULL_REQUEST_TEMPLATE/release.md @@ -1,6 +1,7 @@ Prepare for release of HDMF [version] ### Before merging: +- [ ] Make sure all PRs to be included in this release have been merged to `dev`. - [ ] Major and minor releases: Update package versions in `requirements.txt`, `requirements-dev.txt`, `requirements-doc.txt`, `requirements-opt.txt`, and `environment-ros3.yml` to the latest versions, and update dependency ranges in `pyproject.toml` and minimums in `requirements-min.txt` as needed. @@ -10,23 +11,28 @@ Prepare for release of HDMF [version] and any other locations as needed - [ ] Update `pyproject.toml` as needed - [ ] Update `README.rst` as needed -- [ ] Update `src/hdmf/common/hdmf-common-schema` submodule as needed. Check the version number and commit SHA manually. Make sure we are using the latest release and not the latest commit on the `main` branch. +- [ ] Update `src/hdmf/common/hdmf-common-schema` submodule as needed. Check the version number and commit SHA + manually. Make sure we are using the latest release and not the latest commit on the `main` branch. - [ ] Update changelog (set release date) in `CHANGELOG.md` and any other docs as needed - [ ] Run tests locally including gallery tests, and inspect all warnings and outputs - (`pytest && python test_gallery.py`) + (`pytest && python test_gallery.py`). Try to remove all warnings. 
- [ ] Run PyNWB tests locally including gallery and validation tests, and inspect all warnings and outputs - (`cd pynwb; python test.py -v > out.txt 2>&1`) + (`cd pynwb; git checkout dev; git pull; python test.py -v > out.txt 2>&1`) - [ ] Run HDMF-Zarr tests locally including gallery and validation tests, and inspect all warnings and outputs - (`cd hdmf-zarr; pytest && python test_gallery.py`) + (`cd hdmf-zarr; git checkout dev; git pull; pytest && python test_gallery.py`) - [ ] Test docs locally and inspect all warnings and outputs `cd docs; make clean && make html` -- [ ] Push changes to this PR and make sure all PRs to be included in this release have been merged -- [ ] Check that the readthedocs build for this PR succeeds (build latest to pull the new branch, then activate and - build docs for new branch): https://readthedocs.org/projects/hdmf/builds/ +- [ ] After pushing this branch to GitHub, manually trigger the "Run all tests" GitHub Actions workflow on this + branch by going to https://github.com/hdmf-dev/hdmf/actions/workflows/run_all_tests.yml, selecting + "Run workflow" on the right, selecting this branch, and clicking "Run workflow". Make sure all tests pass. +- [ ] Check that the readthedocs build for this PR succeeds (see the PR check) ### After merging: 1. Create release by following steps in `docs/source/make_a_release.rst` or use alias `git pypi-release [tag]` if set up 2. After the CI bot creates the new release (wait ~10 min), update the release notes on the [GitHub releases page](https://github.com/hdmf-dev/hdmf/releases) with the changelog -3. Check that the readthedocs "latest" and "stable" builds run and succeed -4. Update [conda-forge/hdmf-feedstock](https://github.com/conda-forge/hdmf-feedstock) with the latest version number - and SHA256 retrieved from PyPI > HDMF > Download Files > View hashes for the `.tar.gz` file. Re-render as needed +3. Check that the readthedocs "stable" build runs and succeeds +4. Either monitor [conda-forge/hdmf-feedstock](https://github.com/conda-forge/hdmf-feedstock) for the + regro-cf-autotick-bot bot to create a PR updating the version of HDMF to the latest PyPI release, usually within + 24 hours of release, or manually create a PR updating `recipe/meta.yaml` with the latest version number + and SHA256 retrieved from PyPI > HDMF > Download Files > View hashes for the `.tar.gz` file. Re-render and update + dependencies as needed. 
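The patch that follows introduces a TypeConfigurator and a global termset configuration: when a config is loaded, fields listed in it are automatically wrapped in a TermSetWrapper and validated on assignment. A sketch of the intended usage, drawn from the tests that patch adds and assuming its tests/unit/hdmf_config.yaml and example_test_term_set.yaml fixtures plus an installed linkml-runtime:

    from hdmf.common import (VectorData, load_type_config,
                             get_loaded_type_config, unload_type_config)

    # Load the global config, which maps data types to the TermSet that
    # governs a given field (here, the 'description' of VectorData).
    load_type_config(config_path='tests/unit/hdmf_config.yaml')

    # A valid term passes and is wrapped in a TermSetWrapper on the fly;
    # an out-of-set value would raise a ValueError.
    data = VectorData(name='foo', data=[0], description='Homo sapiens')
    assert data.description.value == 'Homo sapiens'

    # Types or fields missing from the config fall through with a warning.
    print(get_loaded_type_config())  # inspect the merged configuration
    unload_type_config()             # toggle validation back off

The configuration is global and shared across all type maps, so callers should unload it when done, as the patch's test fixtures do in tearDown.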
From 244d17a28ed436849b1973a3aaac8522d0ea922b Mon Sep 17 00:00:00 2001 From: Matthew Avaylon Date: Thu, 28 Mar 2024 10:08:58 -0700 Subject: [PATCH 12/13] Configuration File for TermSet validations (#1016) * config * rough draft * move * testing * check * new way of thinking draft * support multiple config files * testing * placeholder' * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update * Delete docs/gallery/example_config.yaml * clean up * clean up * clean up * checkpoint * need to clean * partial clean up * warn * yaml changes * revert * except * clean up * warning tests * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * tests * tests * tests * ruff * update * update * cov * tests * tests/clean * coverage' git push * coverage' git push * final clean ups * final clean ups * Update CHANGELOG.md * Update CHANGELOG.md * Update CHANGELOG.md * Update src/hdmf/container.py * Update src/hdmf/container.py * Update src/hdmf/term_set.py * Update src/hdmf/term_set.py * in progress * Update src/hdmf/container.py Co-authored-by: Ryan Ly * Update tests/unit/test_term_set.py Co-authored-by: Ryan Ly * Update tests/unit/test_term_set.py Co-authored-by: Ryan Ly * in progress * in progress * in progress * in progress * clean tests * checkpoint of updates * checkpoint of updates * checkpoint of updates * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * copy * clean up * clean * Update CHANGELOG.md * clean up * clean up * test copy * name * Update CHANGELOG.md * Update requirements-opt.txt * Update requirements-opt.txt * Update container.py Co-authored-by: Ryan Ly * Update container.py Co-authored-by: Ryan Ly * Update __init__.py Co-authored-by: Ryan Ly * Update manager.py Co-authored-by: Ryan Ly * clean * namespace * Update src/hdmf/common/__init__.py * Update src/hdmf/common/__init__.py Co-authored-by: Ryan Ly * Update __init__.py --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Ryan Ly --- CHANGELOG.md | 5 + requirements-opt.txt | 6 +- src/hdmf/__init__.py | 2 +- src/hdmf/build/manager.py | 18 ++- src/hdmf/common/__init__.py | 25 ++++ src/hdmf/container.py | 82 +++++++++++- src/hdmf/term_set.py | 87 ++++++++++-- tests/unit/common/test_common.py | 14 +- tests/unit/common/test_table.py | 17 ++- tests/unit/hdmf_config.yaml | 9 ++ tests/unit/hdmf_config2.yaml | 18 +++ tests/unit/test_container.py | 15 ++- tests/unit/test_term_set.py | 124 +++++++++++++++++- .../schemasheets/nwb_static_enums.yaml | 58 +++----- 14 files changed, 402 insertions(+), 78 deletions(-) create mode 100644 tests/unit/hdmf_config.yaml create mode 100644 tests/unit/hdmf_config2.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index f35a06cd1..22c21b0e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # HDMF Changelog +## HDMF 3.14.0 (Upcoming) + +### Enhancements +- Added `TermSetConfigurator` to automatically wrap fields with `TermSetWrapper` according to a configuration file. @mavaylon1 [#1016](https://github.com/hdmf-dev/hdmf/pull/1016) + ## HDMF 3.13.0 (March 20, 2024) ### Enhancements diff --git a/requirements-opt.txt b/requirements-opt.txt index 6b4e102f1..53fd11e3a 100644 --- a/requirements-opt.txt +++ b/requirements-opt.txt @@ -1,8 +1,6 @@ # pinned dependencies that are optional. 
used to reproduce an entire development environment to use HDMF tqdm==4.66.2 zarr==2.17.1 -linkml-runtime==1.7.3; python_version >= "3.9" +linkml-runtime==1.7.4; python_version >= "3.9" schemasheets==0.2.1; python_version >= "3.9" -oaklib==0.5.31; python_version >= "3.9" -pydantic==2.6.4 -pyyaml==6.0.1; python_version >= "3.9" +oaklib==0.5.32; python_version >= "3.9" diff --git a/src/hdmf/__init__.py b/src/hdmf/__init__.py index 2699a28af..6fc72a117 100644 --- a/src/hdmf/__init__.py +++ b/src/hdmf/__init__.py @@ -3,7 +3,7 @@ from .container import Container, Data, DataRegion, HERDManager from .region import ListSlicer from .utils import docval, getargs -from .term_set import TermSet, TermSetWrapper +from .term_set import TermSet, TermSetWrapper, TypeConfigurator @docval( diff --git a/src/hdmf/build/manager.py b/src/hdmf/build/manager.py index 03f2856b8..a26de3279 100644 --- a/src/hdmf/build/manager.py +++ b/src/hdmf/build/manager.py @@ -5,6 +5,7 @@ from .builders import DatasetBuilder, GroupBuilder, LinkBuilder, Builder, BaseBuilder from .classgenerator import ClassGenerator, CustomClassGenerator, MCIClassGenerator from ..container import AbstractContainer, Container, Data +from ..term_set import TypeConfigurator from ..spec import DatasetSpec, GroupSpec, NamespaceCatalog from ..spec.spec import BaseStorageSpec from ..utils import docval, getargs, ExtenderMeta, get_docval @@ -391,18 +392,23 @@ def data_type(self): class TypeMap: - ''' A class to maintain the map between ObjectMappers and AbstractContainer classes - ''' + """ + A class to maintain the map between ObjectMappers and AbstractContainer classes + """ @docval({'name': 'namespaces', 'type': NamespaceCatalog, 'doc': 'the NamespaceCatalog to use', 'default': None}, - {'name': 'mapper_cls', 'type': type, 'doc': 'the ObjectMapper class to use', 'default': None}) + {'name': 'mapper_cls', 'type': type, 'doc': 'the ObjectMapper class to use', 'default': None}, + {'name': 'type_config', 'type': TypeConfigurator, 'doc': 'The TypeConfigurator to use.', + 'default': None}) def __init__(self, **kwargs): - namespaces, mapper_cls = getargs('namespaces', 'mapper_cls', kwargs) + namespaces, mapper_cls, type_config = getargs('namespaces', 'mapper_cls', 'type_config', kwargs) if namespaces is None: namespaces = NamespaceCatalog() if mapper_cls is None: from .objectmapper import ObjectMapper # avoid circular import mapper_cls = ObjectMapper + if type_config is None: + type_config = TypeConfigurator() self.__ns_catalog = namespaces self.__mappers = dict() # already constructed ObjectMapper classes self.__mapper_cls = dict() # the ObjectMapper class to use for each container type @@ -410,6 +416,8 @@ def __init__(self, **kwargs): self.__data_types = dict() self.__default_mapper_cls = mapper_cls self.__class_generator = ClassGenerator() + self.type_config = type_config + self.register_generator(CustomClassGenerator) self.register_generator(MCIClassGenerator) @@ -422,7 +430,7 @@ def container_types(self): return self.__container_types def __copy__(self): - ret = TypeMap(copy(self.__ns_catalog), self.__default_mapper_cls) + ret = TypeMap(copy(self.__ns_catalog), self.__default_mapper_cls, self.type_config) ret.merge(self) return ret diff --git a/src/hdmf/common/__init__.py b/src/hdmf/common/__init__.py index e0782effe..248ca1095 100644 --- a/src/hdmf/common/__init__.py +++ b/src/hdmf/common/__init__.py @@ -20,6 +20,31 @@ # a global type map global __TYPE_MAP +@docval({'name': 'config_path', 'type': str, 'doc': 'Path to the configuration file.'}, + 
is_method=False) +def load_type_config(**kwargs): + """ + This method will either load the default config or the config provided by the path. + NOTE: This config is global and shared across all type maps. + """ + config_path = kwargs['config_path'] + __TYPE_MAP.type_config.load_type_config(config_path) + +def get_loaded_type_config(): + """ + This method returns the entire config file. + """ + if __TYPE_MAP.type_config.config is None: + msg = "No configuration is loaded." + raise ValueError(msg) + else: + return __TYPE_MAP.type_config.config + +def unload_type_config(): + """ + Unload the configuration file. + """ + return __TYPE_MAP.type_config.unload_type_config() # a function to register a container classes with the global map @docval({'name': 'data_type', 'type': str, 'doc': 'the data_type to get the spec for'}, diff --git a/src/hdmf/container.py b/src/hdmf/container.py index 521568d95..f93c06199 100644 --- a/src/hdmf/container.py +++ b/src/hdmf/container.py @@ -5,6 +5,7 @@ from typing import Type from uuid import uuid4 from warnings import warn +import os import h5py import numpy as np @@ -13,6 +14,7 @@ from .data_utils import DataIO, append_data, extend_data from .utils import docval, get_docval, getargs, ExtenderMeta, get_data_shape, popargs, LabelledDict +from .term_set import TermSet, TermSetWrapper def _set_exp(cls): """Set a class as being experimental""" @@ -34,7 +36,7 @@ class HERDManager: This class manages whether to set/attach an instance of HERD to the subclass. """ - @docval({'name': 'herd', 'type': 'hdmf.common.resources.HERD', + @docval({'name': 'herd', 'type': 'HERD', 'doc': 'The external resources to be used for the container.'},) def link_resources(self, **kwargs): """ @@ -75,7 +77,6 @@ def _setter(cls, field): Make a setter function for creating a :py:func:`property` """ name = field['name'] - if not field.get('settable', True): return None @@ -85,10 +86,82 @@ def setter(self, val): if name in self.fields: msg = "can't set attribute '%s' -- already set" % name raise AttributeError(msg) - self.fields[name] = val + self.fields[name] = self._field_config(arg_name=name, val=val) return setter + @property + def data_type(self): + """ + Return the spec data type associated with this container. + """ + return getattr(self, self._data_type_attr) + + + def _field_config(self, arg_name, val): + """ + This method will be called in the setter. The termset configuration will be used (if loaded) + to check for a defined TermSet associated with the field. If found, the value of the field + will be wrapped with a TermSetWrapper. + + Even though the path field in the configurator can be a list of paths, the config + itself is only one file. When a user loads custom configs, the config is appended/modified. + The modifications are not written to file, avoiding permanent modifications. + """ + # load termset configuration file from global Config + from hdmf.common import get_type_map # circular import + type_map = get_type_map() + configurator = type_map.type_config + + if len(configurator.path)>0: + # The type_map has a config always set; however, when toggled off, the config path is empty. + CUR_DIR = os.path.dirname(os.path.realpath(configurator.path[0])) + termset_config = configurator.config + else: + return val + # check to see that the namespace for the container is in the config + if self.namespace not in type_map.container_types: + msg = "%s not found within loaded configuration." 
% self.namespace + warn(msg) + return val + else: + # check to see that the container type is in the config under the namespace + config_namespace = termset_config['namespaces'][self.namespace] + data_type = self.data_type + + if data_type not in config_namespace['data_types']: + msg = '%s not found within the configuration for %s' % (data_type, self.namespace) + warn(msg) + return val + else: + for attr in config_namespace['data_types'][data_type]: + obj_mapper = type_map.get_map(self) + + # get the spec according to attr name in schema + # Note: this is the name for the field in the config + spec = obj_mapper.get_attr_spec(attr) + + # In the case of dealing with datasets directly or not defined in the spec. + # (Data/VectorData/DynamicTable/etc) + if spec is None: + msg = "Spec not found for %s." % attr + warn(msg) + return val + else: + # If the val has been manually wrapped then skip checking the config for the attr + if isinstance(val, TermSetWrapper): + msg = "Field value already wrapped with TermSetWrapper." + warn(msg) + return val + else: + # From the spec, get the mapped attribute name + mapped_attr_name = obj_mapper.get_attribute(spec) + termset_path = os.path.join(CUR_DIR, + config_namespace['data_types'][data_type][mapped_attr_name]['termset']) + termset = TermSet(term_schema_path=termset_path) + val = TermSetWrapper(value=val, termset=termset) + return val + @classmethod def _getter(cls, field): """ @@ -389,7 +462,7 @@ def set_modified(self, **kwargs): def children(self): return tuple(self.__children) - @docval({'name': 'child', 'type': 'hdmf.container.Container', + @docval({'name': 'child', 'type': 'Container', 'doc': 'the child Container for this Container', 'default': None}) def add_child(self, **kwargs): warn(DeprecationWarning('add_child is deprecated. Set the parent attribute instead.')) @@ -787,7 +860,6 @@ class Data(AbstractContainer): """ A class for representing dataset containers """ - @docval({'name': 'name', 'type': str, 'doc': 'the name of this container'}, {'name': 'data', 'type': ('scalar_data', 'array_data', 'data'), 'doc': 'the source of the data'}) def __init__(self, **kwargs): diff --git a/src/hdmf/term_set.py b/src/hdmf/term_set.py index f7169bdfd..1464f505c 100644 --- a/src/hdmf/term_set.py +++ b/src/hdmf/term_set.py @@ -5,6 +5,7 @@ import warnings import numpy as np from .data_utils import append_data, extend_data +from ruamel.yaml import YAML class TermSet: @@ -162,12 +163,12 @@ def __schemasheets_convert(self): This method returns a path to the new schema to be viewed via SchemaView. """ try: - import yaml from linkml_runtime.utils.schema_as_dict import schema_as_dict from schemasheets.schemamaker import SchemaMaker except ImportError: # pragma: no cover msg = "Install schemasheets." raise ValueError(msg) + schema_maker = SchemaMaker() tsv_file_paths = glob.glob(self.schemasheets_folder + "/*.tsv") schema = schema_maker.create_schema(tsv_file_paths) @@ -175,6 +176,7 @@ def __schemasheets_convert(self): schemasheet_schema_path = os.path.join(self.schemasheets_folder, f"{schema_dict['name']}.yaml") with open(schemasheet_schema_path, "w") as f: + yaml=YAML(typ='safe') yaml.dump(schema_dict, f) return schemasheet_schema_path @@ -262,13 +264,6 @@ def __getitem__(self, val): """ return self.__value[val] - # uncomment when DataChunkIterator objects can be wrapped by TermSet - # def __next__(self): - # """ - # Return the next item of a wrapped iterator. 
- # """ - # return self.__value.__next__() - # def __len__(self): return len(self.__value) @@ -304,3 +299,79 @@ def extend(self, arg): else: msg = ('"%s" is not in the term set.' % ', '.join([str(item) for item in bad_data])) raise ValueError(msg) + +class TypeConfigurator: + """ + This class allows users to toggle on/off a global configuration for defined data types. + When toggled on, every instance of a configuration file supported data type will be validated + according to the corresponding TermSet. + """ + @docval({'name': 'path', 'type': str, 'doc': 'Path to the configuration file.', 'default': None}) + def __init__(self, **kwargs): + self.config = None + if kwargs['path'] is None: + self.path = [] + else: + self.path = [kwargs['path']] + self.load_type_config(config_path=self.path[0]) + + @docval({'name': 'data_type', 'type': str, + 'doc': 'The desired data type within the configuration file.'}, + {'name': 'namespace', 'type': str, + 'doc': 'The namespace for the data type.'}) + def get_config(self, data_type, namespace): + """ + Return the config for that data type in the given namespace. + """ + try: + namespace_config = self.config['namespaces'][namespace] + except KeyError: + msg = 'The namespace %s was not found within the configuration.' % namespace + raise ValueError(msg) + + try: + type_config = namespace_config['data_types'][data_type] + return type_config + except KeyError: + msg = '%s was not found within the configuration for that namespace.' % data_type + raise ValueError(msg) + + @docval({'name': 'config_path', 'type': str, 'doc': 'Path to the configuration file.'}) + def load_type_config(self,config_path): + """ + Load the configuration file for validation on the fields defined for the objects within the file. + """ + with open(config_path, 'r') as config: + yaml=YAML(typ='safe') + termset_config = yaml.load(config) + if self.config is None: # set the initial config/load after config has been unloaded + self.config = termset_config + if len(self.path)==0: # for loading after an unloaded config + self.path.append(config_path) + else: # append/replace to the existing config + if config_path in self.path: + msg = 'This configuration file path already exists within the configurator.' + raise ValueError(msg) + else: + for namespace in termset_config['namespaces']: + if namespace not in self.config['namespaces']: # append namespace config if not present + self.config['namespaces'][namespace] = termset_config['namespaces'][namespace] + else: # check for any needed overrides within existing namespace configs + for data_type in termset_config['namespaces'][namespace]['data_types']: + # NOTE: these two branches effectively do the same thing, but are split for clarity. + if data_type in self.config['namespaces'][namespace]['data_types']: + replace_config = termset_config['namespaces'][namespace]['data_types'][data_type] + self.config['namespaces'][namespace]['data_types'][data_type] = replace_config + else: # append to config + new_config = termset_config['namespaces'][namespace]['data_types'][data_type] + self.config['namespaces'][namespace]['data_types'][data_type] = new_config + + # append path to self.path + self.path.append(config_path) + + def unload_type_config(self): + """ + Remove validation according to termset configuration file. 
+ """ + self.path = [] + self.config = None diff --git a/tests/unit/common/test_common.py b/tests/unit/common/test_common.py index 76c99d44a..e20614852 100644 --- a/tests/unit/common/test_common.py +++ b/tests/unit/common/test_common.py @@ -1,5 +1,5 @@ from hdmf import Data, Container -from hdmf.common import get_type_map +from hdmf.common import get_type_map, load_type_config, unload_type_config from hdmf.testing import TestCase @@ -11,3 +11,15 @@ def test_base_types(self): self.assertIs(cls, Container) cls = tm.get_dt_container_cls('Data', 'hdmf-common') self.assertIs(cls, Data) + + def test_copy_ts_config(self): + path = 'tests/unit/hdmf_config.yaml' + load_type_config(config_path=path) + tm = get_type_map() + config = {'namespaces': {'hdmf-common': {'version': '3.12.2', + 'data_types': {'VectorData': {'description': {'termset': 'example_test_term_set.yaml'}}, + 'VectorIndex': {'data': '...'}}}}} + + self.assertEqual(tm.type_config.config, config) + self.assertEqual(tm.type_config.path, [path]) + unload_type_config() diff --git a/tests/unit/common/test_table.py b/tests/unit/common/test_table.py index d98add060..f2d03332f 100644 --- a/tests/unit/common/test_table.py +++ b/tests/unit/common/test_table.py @@ -17,8 +17,7 @@ EnumData, DynamicTableRegion, get_manager, - SimpleMultiContainer, -) + SimpleMultiContainer) from hdmf.testing import TestCase, H5RoundTripMixin, remove_test_file from hdmf.utils import StrDataset from hdmf.data_utils import DataChunkIterator @@ -32,9 +31,9 @@ try: import linkml_runtime # noqa: F401 - LINKML_INSTALLED = True + REQUIREMENTS_INSTALLED = True except ImportError: - LINKML_INSTALLED = False + REQUIREMENTS_INSTALLED = False class TestDynamicTable(TestCase): @@ -131,7 +130,7 @@ def test_constructor_all_columns_are_iterators(self): # now test that when we supply id's that the error goes away _ = DynamicTable(name="TestTable", description="", columns=[column], id=list(range(3))) - @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + @unittest.skipIf(not REQUIREMENTS_INSTALLED, "optional LinkML module is not installed") def test_add_col_validate(self): terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') col1 = VectorData( @@ -150,7 +149,7 @@ def test_add_col_validate(self): expected_df.index.name = 'id' pd.testing.assert_frame_equal(species.to_dataframe(), expected_df) - @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + @unittest.skipIf(not REQUIREMENTS_INSTALLED, "optional LinkML module is not installed") def test_add_col_validate_bad_data(self): terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') col1 = VectorData( @@ -165,7 +164,7 @@ def test_add_col_validate_bad_data(self): data=TermSetWrapper(value=['bad data'], termset=terms)) - @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + @unittest.skipIf(not REQUIREMENTS_INSTALLED, "optional LinkML module is not installed") def test_add_row_validate(self): terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') col1 = VectorData( @@ -187,7 +186,7 @@ def test_add_row_validate(self): expected_df.index.name = 'id' pd.testing.assert_frame_equal(species.to_dataframe(), expected_df) - @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + @unittest.skipIf(not REQUIREMENTS_INSTALLED, "optional LinkML module is not installed") def test_add_row_validate_bad_data_one_col(self): terms = 
TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') col1 = VectorData( @@ -204,7 +203,7 @@ def test_add_row_validate_bad_data_one_col(self): with self.assertRaises(ValueError): species.add_row(Species_1='bad', Species_2='Ursus arctos horribilis') - @unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed") + @unittest.skipIf(not REQUIREMENTS_INSTALLED, "optional LinkML module is not installed") def test_add_row_validate_bad_data_all_col(self): terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml') col1 = VectorData( diff --git a/tests/unit/hdmf_config.yaml b/tests/unit/hdmf_config.yaml new file mode 100644 index 000000000..92ec2f321 --- /dev/null +++ b/tests/unit/hdmf_config.yaml @@ -0,0 +1,9 @@ +namespaces: + hdmf-common: + version: 3.12.2 + data_types: + VectorData: + description: + termset: example_test_term_set.yaml + VectorIndex: + data: ... diff --git a/tests/unit/hdmf_config2.yaml b/tests/unit/hdmf_config2.yaml new file mode 100644 index 000000000..0aecacf51 --- /dev/null +++ b/tests/unit/hdmf_config2.yaml @@ -0,0 +1,18 @@ +namespaces: + hdmf-common: + version: 3.12.2 + data_types: + Data: + description: + termset: example_test_term_set.yaml + EnumData: + description: + termset: example_test_term_set.yaml + VectorData: + description: ... + namespace2: + version: 0 + data_types: + MythicData: + description: + termset: example_test_term_set.yaml diff --git a/tests/unit/test_container.py b/tests/unit/test_container.py index b5a2d87e8..9ac81ba13 100644 --- a/tests/unit/test_container.py +++ b/tests/unit/test_container.py @@ -58,6 +58,11 @@ def test_new(self): self.assertFalse(child_obj._in_construct_mode) self.assertTrue(child_obj.modified) + def test_get_data_type(self): + obj = Container('obj1') + dt = obj.data_type + self.assertEqual(dt, 'Container') + def test_new_object_id_none(self): """Test that passing object_id=None to __new__ is OK and results in a non-None object ID being assigned. 
""" @@ -519,7 +524,7 @@ class EmptyFields(AbstractContainer): self.assertTupleEqual(EmptyFields.get_fields_conf(), tuple()) props = TestAbstractContainerFieldsConf.find_all_properties(EmptyFields) - expected = ['all_objects', 'children', 'container_source', 'fields', 'modified', + expected = ['all_objects', 'children', 'container_source', 'data_type', 'fields', 'modified', 'name', 'object_id', 'parent', 'read_io'] self.assertListEqual(props, expected) @@ -540,8 +545,8 @@ def __init__(self, **kwargs): self.assertTupleEqual(NamedFields.get_fields_conf(), expected) props = TestAbstractContainerFieldsConf.find_all_properties(NamedFields) - expected = ['all_objects', 'children', 'container_source', 'field1', 'field2', - 'fields', 'modified', 'name', 'object_id', + expected = ['all_objects', 'children', 'container_source', 'data_type', + 'field1', 'field2', 'fields', 'modified', 'name', 'object_id', 'parent', 'read_io'] self.assertListEqual(props, expected) @@ -622,8 +627,8 @@ class NamedFieldsChild(NamedFields): self.assertTupleEqual(NamedFieldsChild.get_fields_conf(), expected) props = TestAbstractContainerFieldsConf.find_all_properties(NamedFieldsChild) - expected = ['all_objects', 'children', 'container_source', 'field1', 'field2', - 'fields', 'modified', 'name', 'object_id', + expected = ['all_objects', 'children', 'container_source', 'data_type', + 'field1', 'field2', 'fields', 'modified', 'name', 'object_id', 'parent', 'read_io'] self.assertListEqual(props, expected) diff --git a/tests/unit/test_term_set.py b/tests/unit/test_term_set.py index b4a469438..99bd6bf59 100644 --- a/tests/unit/test_term_set.py +++ b/tests/unit/test_term_set.py @@ -1,9 +1,12 @@ import os +import numpy as np -from hdmf.term_set import TermSet, TermSetWrapper +from hdmf import Container +from hdmf.term_set import TermSet, TermSetWrapper, TypeConfigurator from hdmf.testing import TestCase, remove_test_file -from hdmf.common import VectorData -import numpy as np +from hdmf.common import (VectorIndex, VectorData, unload_type_config, + get_loaded_type_config, load_type_config) +from hdmf.utils import popargs CUR_DIR = os.path.dirname(os.path.realpath(__file__)) @@ -215,3 +218,118 @@ def test_wrapper_extend_error(self): data_obj = VectorData(name='species', description='...', data=self.wrapped_list) with self.assertRaises(ValueError): data_obj.extend(['bad_data']) + +class TestTypeConfig(TestCase): + def setUp(self): + if not REQUIREMENTS_INSTALLED: + self.skipTest("optional LinkML module is not installed") + + def tearDown(self): + unload_type_config() + + def test_get_loaded_type_config_error(self): + with self.assertRaises(ValueError): + get_loaded_type_config() + + def test_config_path(self): + path = 'tests/unit/hdmf_config.yaml' + tc = TypeConfigurator(path=path) + self.assertEqual(tc.path, [path]) + + def test_get_config(self): + path = 'tests/unit/hdmf_config.yaml' + tc = TypeConfigurator(path=path) + self.assertEqual(tc.get_config('VectorData', 'hdmf-common'), + {'description': {'termset': 'example_test_term_set.yaml'}}) + + def test_get_config_namespace_error(self): + path = 'tests/unit/hdmf_config.yaml' + tc = TypeConfigurator(path=path) + with self.assertRaises(ValueError): + tc.get_config('VectorData', 'hdmf-common11') + + def test_get_config_container_error(self): + path = 'tests/unit/hdmf_config.yaml' + tc = TypeConfigurator(path=path) + with self.assertRaises(ValueError): + tc.get_config('VectorData11', 'hdmf-common') + + def test_already_loaded_path_error(self): + path = 'tests/unit/hdmf_config.yaml' + 
+        tc = TypeConfigurator(path=path)
+        with self.assertRaises(ValueError):
+            tc.load_type_config(config_path=path)
+
+    def test_load_two_unique_configs(self):
+        path = 'tests/unit/hdmf_config.yaml'
+        path2 = 'tests/unit/hdmf_config2.yaml'
+        tc = TypeConfigurator(path=path)
+        tc.load_type_config(config_path=path2)
+        config = {'namespaces': {'hdmf-common': {'version': '3.12.2',
+                  'data_types': {'VectorData': {'description': '...'},
+                                 'VectorIndex': {'data': '...'},
+                                 'Data': {'description': {'termset': 'example_test_term_set.yaml'}},
+                                 'EnumData': {'description': {'termset': 'example_test_term_set.yaml'}}}},
+                  'namespace2': {'version': 0,
+                  'data_types': {'MythicData': {'description': {'termset': 'example_test_term_set.yaml'}}}}}}
+        self.assertEqual(tc.path, [path, path2])
+        self.assertEqual(tc.config, config)
+
+
+class ExtensionContainer(Container):
+    __fields__ = ("description",)
+
+    def __init__(self, **kwargs):
+        description, namespace = popargs('description', 'namespace', kwargs)
+        self.namespace = namespace
+        super().__init__(**kwargs)
+        self.description = description
+
+
+class TestGlobalTypeConfig(TestCase):
+    def setUp(self):
+        if not REQUIREMENTS_INSTALLED:
+            self.skipTest("optional LinkML module is not installed")
+        load_type_config(config_path='tests/unit/hdmf_config.yaml')
+
+    def tearDown(self):
+        unload_type_config()
+
+    def test_load_config(self):
+        config = get_loaded_type_config()
+        self.assertEqual(config,
+                         {'namespaces': {'hdmf-common': {'version': '3.12.2',
+                          'data_types': {'VectorData': {'description': {'termset': 'example_test_term_set.yaml'}},
+                                         'VectorIndex': {'data': '...'}}}}})
+
+    def test_validate_with_config(self):
+        data = VectorData(name='foo', data=[0], description='Homo sapiens')
+        self.assertEqual(data.description.value, 'Homo sapiens')
+
+    def test_namespace_warn(self):
+        with self.assertWarns(Warning):
+            ExtensionContainer(name='foo',
+                               namespace='foo',
+                               description='Homo sapiens')
+
+    def test_container_type_warn(self):
+        with self.assertWarns(Warning):
+            ExtensionContainer(name='foo',
+                               namespace='hdmf-common',
+                               description='Homo sapiens')
+
+    def test_already_wrapped_warn(self):
+        terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml')
+        with self.assertWarns(Warning):
+            VectorData(name='foo',
+                       data=[0],
+                       description=TermSetWrapper(value='Homo sapiens', termset=terms))
+
+    def test_warn_field_not_in_spec(self):
+        col1 = VectorData(name='col1',
+                          description='Homo sapiens',
+                          data=['1a', '1b', '1c', '2a'])
+        with self.assertWarns(Warning):
+            VectorIndex(name='col1_index',
+                        target=col1,
+                        data=[3, 4])
diff --git a/tests/unit/test_term_set_input/schemasheets/nwb_static_enums.yaml b/tests/unit/test_term_set_input/schemasheets/nwb_static_enums.yaml
index 222205959..52af4b8a7 100644
--- a/tests/unit/test_term_set_input/schemasheets/nwb_static_enums.yaml
+++ b/tests/unit/test_term_set_input/schemasheets/nwb_static_enums.yaml
@@ -2,8 +2,7 @@ classes:
   BrainSample:
     slot_usage:
       cell_type: {}
-    slots:
-    - cell_type
+    slots: [cell_type]
 default_prefix: TEMP
 default_range: string
 description: this schema demonstrates the use of static enums
@@ -11,42 +10,27 @@ enums:
   NeuronOrGlialCellTypeEnum:
     description: Enumeration to capture various cell types found in the brain.
     permissible_values:
-      ASTROCYTE:
-        description: Characteristic star-shaped glial cells in the brain and spinal
-          cord.
-        meaning: CL:0000127
-      INTERNEURON:
-        description: Neurons whose axons (and dendrites) are limited to a single brain
-          area.
-        meaning: CL:0000099
-      MICROGLIAL_CELL:
-        description: Microglia are the resident immune cells of the brain and constantly
-          patrol the cerebral microenvironment to respond to pathogens and damage.
-        meaning: CL:0000129
-      MOTOR_NEURON:
-        description: Neurons whose cell body is located in the motor cortex, brainstem
-          or the spinal cord, and whose axon (fiber) projects to the spinal cord or
-          outside of the spinal cord to directly or indirectly control effector organs,
-          mainly muscles and glands.
-        meaning: CL:0000100
-      OLIGODENDROCYTE:
-        description: Type of neuroglia whose main functions are to provide support
-          and insulation to axons within the central nervous system (CNS) of jawed
-          vertebrates.
-        meaning: CL:0000128
-      PYRAMIDAL_NEURON:
-        description: Neurons with a pyramidal shaped cell body (soma) and two distinct
-          dendritic trees.
-        meaning: CL:0000598
+      ASTROCYTE: {description: Characteristic star-shaped glial cells in the brain
+          and spinal cord., meaning: 'CL:0000127'}
+      INTERNEURON: {description: Neurons whose axons (and dendrites) are limited to
+          a single brain area., meaning: 'CL:0000099'}
+      MICROGLIAL_CELL: {description: Microglia are the resident immune cells of the
+          brain and constantly patrol the cerebral microenvironment to respond to
+          pathogens and damage., meaning: 'CL:0000129'}
+      MOTOR_NEURON: {description: 'Neurons whose cell body is located in the motor
+          cortex, brainstem or the spinal cord, and whose axon (fiber) projects to
+          the spinal cord or outside of the spinal cord to directly or indirectly
+          control effector organs, mainly muscles and glands.', meaning: 'CL:0000100'}
+      OLIGODENDROCYTE: {description: Type of neuroglia whose main functions are to
+          provide support and insulation to axons within the central nervous system
+          (CNS) of jawed vertebrates., meaning: 'CL:0000128'}
+      PYRAMIDAL_NEURON: {description: Neurons with a pyramidal shaped cell body (soma)
+          and two distinct dendritic trees., meaning: 'CL:0000598'}
 id: https://w3id.org/linkml/examples/nwb_static_enums
-imports:
-- linkml:types
+imports: ['linkml:types']
 name: nwb_static_enums
-prefixes:
-  CL: http://purl.obolibrary.org/obo/CL_
-  TEMP: https://example.org/TEMP/
-  linkml: https://w3id.org/linkml/
+prefixes: {CL: 'http://purl.obolibrary.org/obo/CL_', TEMP: 'https://example.org/TEMP/',
+  linkml: 'https://w3id.org/linkml/'}
 slots:
-  cell_type:
-    required: true
+  cell_type: {required: true}
 title: static enums example

From d85d0cbc36d2e0fdb25e8fbea14d58ba7bf24a40 Mon Sep 17 00:00:00 2001
From: Matthew Avaylon
Date: Thu, 4 Apr 2024 00:57:28 -0700
Subject: [PATCH 13/13] Compound Dataset Support for TermSetWrapper (#1061)

* concept

* ruff

* test

* clean up

* doc

* ruff

* Update CHANGELOG.md

* Update term_set.py

* Update term_set.py

* Update CHANGELOG.md

* Update CHANGELOG.md

* Update CHANGELOG.md

* Update docs/gallery/plot_term_set.py

* Update src/hdmf/term_set.py

Co-authored-by: Ryan Ly

* checkpoint

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* tests

* tests

* test

* Update plot_term_set.py

* Update term_set.py

* Update tests/unit/common/test_table.py

Co-authored-by: Ryan Ly

* Update tests/unit/common/test_table.py

Co-authored-by: Ryan Ly

* data

* data

* method

* tests

* tests

* Update CHANGELOG.md

* Update CHANGELOG.md

* Update data_utils.py

* Update data_utils.py

* Update data_utils.py

* Update test_table.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update data_utils.py
* Update data_utils.py

* test

---------

Co-authored-by: Ryan Ly
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 CHANGELOG.md                    | 27 +++++-----
 docs/gallery/plot_term_set.py   | 14 +++++
 src/hdmf/data_utils.py          |  5 +-
 src/hdmf/term_set.py            | 64 +++++++++++++++++-----
 tests/unit/common/test_table.py | 95 +++++++++++++++++++++++++++++++++
 tests/unit/test_term_set.py     | 11 ++--
 6 files changed, 185 insertions(+), 31 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 22c21b0e8..e72015601 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@

 ### Enhancements
 - Added `TermSetConfigurator` to automatically wrap fields with `TermSetWrapper` according to a configuration file. @mavaylon1 [#1016](https://github.com/hdmf-dev/hdmf/pull/1016)
+- Updated `TermSetWrapper` to support validating a single field within a compound array. @mavaylon1 [#1061](https://github.com/hdmf-dev/hdmf/pull/1061)

 ## HDMF 3.13.0 (March 20, 2024)

@@ -138,8 +139,8 @@ will increase the minor version number to 3.10.0. See the 3.9.1 release notes be
 ## HDMF 3.6.0 (May 12, 2023)

 ### New features and minor improvements
-- Updated `ExternalResources` to have `FileTable` and new methods to query data. the `ResourceTable` has been removed along with methods relating to `Resource`. @mavaylon [#850](https://github.com/hdmf-dev/hdmf/pull/850)
-- Updated hdmf-common-schema version to 1.6.0. @mavaylon [#850](https://github.com/hdmf-dev/hdmf/pull/850)
+- Updated `ExternalResources` to have `FileTable` and new methods to query data. The `ResourceTable` has been removed along with methods relating to `Resource`. @mavaylon1 [#850](https://github.com/hdmf-dev/hdmf/pull/850)
+- Updated hdmf-common-schema version to 1.6.0. @mavaylon1 [#850](https://github.com/hdmf-dev/hdmf/pull/850)
 - Added testing of HDMF-Zarr on PR and nightly. @rly [#859](https://github.com/hdmf-dev/hdmf/pull/859)
 - Replaced `setup.py` with `pyproject.toml`. @rly [#844](https://github.com/hdmf-dev/hdmf/pull/844)
 - Use `ruff` instead of `flake8`. @rly [#844](https://github.com/hdmf-dev/hdmf/pull/844)
@@ -153,7 +154,7 @@ will increase the minor version number to 3.10.0. See the 3.9.1 release notes be
 [#853](https://github.com/hdmf-dev/hdmf/pull/853)

 ### Documentation and tutorial enhancements:
-- Updated `ExternalResources` how to tutorial to include the new features. @mavaylon [#850](https://github.com/hdmf-dev/hdmf/pull/850)
+- Updated `ExternalResources` how to tutorial to include the new features. @mavaylon1 [#850](https://github.com/hdmf-dev/hdmf/pull/850)

 ## HDMF 3.5.6 (April 28, 2023)

@@ -193,13 +194,13 @@ will increase the minor version number to 3.10.0. See the 3.9.1 release notes be

 ### Bug fixes
 - Fixed issue with conda CI. @rly [#823](https://github.com/hdmf-dev/hdmf/pull/823)
-- Fixed issue with deprecated `pkg_resources`. @mavaylon [#822](https://github.com/hdmf-dev/hdmf/pull/822)
-- Fixed `hdmf.common` deprecation warning. @mavaylon [#826]((https://github.com/hdmf-dev/hdmf/pull/826)
+- Fixed issue with deprecated `pkg_resources`. @mavaylon1 [#822](https://github.com/hdmf-dev/hdmf/pull/822)
+- Fixed `hdmf.common` deprecation warning. @mavaylon1 [#826](https://github.com/hdmf-dev/hdmf/pull/826)

 ### Internal improvements
 - A number of typos fixed and Github action running codespell to ensure that no typo sneaks in [#825](https://github.com/hdmf-dev/hdmf/pull/825) was added.
-- Added additional documentation for `__fields__` in `AbstactContainer`. @mavaylon [#827](https://github.com/hdmf-dev/hdmf/pull/827)
+- Added additional documentation for `__fields__` in `AbstractContainer`. @mavaylon1 [#827](https://github.com/hdmf-dev/hdmf/pull/827)
-- Updated warning message for broken links. @mavaylon [#829](https://github.com/hdmf-dev/hdmf/pull/829)
+- Updated warning message for broken links. @mavaylon1 [#829](https://github.com/hdmf-dev/hdmf/pull/829)

 ## HDMF 3.5.1 (January 26, 2023)

@@ -218,9 +219,9 @@ will increase the minor version number to 3.10.0. See the 3.9.1 release notes be
 - Added ``HDMFIO.__del__`` to ensure that I/O objects are being closed on delete. @oruebel [#811](https://github.com/hdmf-dev/hdmf/pull/811)

 ### Minor improvements
-- Added support for reading and writing `ExternalResources` to and from denormalized TSV files. @mavaylon [#799](https://github.com/hdmf-dev/hdmf/pull/799)
-- Changed the name of `ExternalResources.export_to_sqlite` to `ExternalResources.to_sqlite`. @mavaylon [#799](https://github.com/hdmf-dev/hdmf/pull/799)
-- Updated the tutorial for `ExternalResources`. @mavaylon [#799](https://github.com/hdmf-dev/hdmf/pull/799)
+- Added support for reading and writing `ExternalResources` to and from denormalized TSV files. @mavaylon1 [#799](https://github.com/hdmf-dev/hdmf/pull/799)
+- Changed the name of `ExternalResources.export_to_sqlite` to `ExternalResources.to_sqlite`. @mavaylon1 [#799](https://github.com/hdmf-dev/hdmf/pull/799)
+- Updated the tutorial for `ExternalResources`. @mavaylon1 [#799](https://github.com/hdmf-dev/hdmf/pull/799)
 - Added `message` argument for assert methods defined by `hdmf.testing.TestCase` to allow developers to include custom error messages with asserts. @oruebel [#812](https://github.com/hdmf-dev/hdmf/pull/812)
 - Clarify the expected chunk shape behavior for `DataChunkIterator`. @oruebel [#813](https://github.com/hdmf-dev/hdmf/pull/813)

@@ -361,7 +362,7 @@ the fields (i.e., when the constructor sets some fields to fixed values). @rly
 - Plotted results in external resources tutorial. @oruebel (#667)
 - Added support for Python 3.10. @rly (#679)
 - Updated requirements. @rly @TheChymera (#681)
-- Improved testing for `ExternalResources`. @mavaylon (#673)
+- Improved testing for `ExternalResources`. @mavaylon1 (#673)
 - Improved docs for export. @rly (#674)
 - Enhanced data chunk iteration speeds through new ``GenericDataChunkIterator`` class. @CodyCBakerPhD (#672)
 - Enhanced issue template forms on GitHub. @CodyCBakerPHD (#700)
@@ -437,7 +438,7 @@ the fields (i.e., when the constructor sets some fields to fixed values). @rly
 - Allow passing ``index=True`` to ``DynamicTable.to_dataframe()`` to support returning `DynamicTableRegion`
   columns as indices or Pandas DataFrame. @rly (#579)
 - Improve ``DynamicTable`` documentation. @rly (#639)
-- Updated external resources tutorial. @mavaylon (#611)
+- Updated external resources tutorial. @mavaylon1 (#611)

 ### Breaking changes and deprecations
 - Previously, when using ``DynamicTable.__getitem__`` or ``DynamicTable.get`` to access a selection of a
@@ -522,7 +523,7 @@ the fields (i.e., when the constructor sets some fields to fixed values). @rly
   - Add experimental namespace to HDMF common schema. New data types should go in the experimental namespace
     (hdmf-experimental) prior to being added to the core (hdmf-common) namespace. The purpose of this is to
     provide a place to test new data types that may break backward compatibility as they are refined.
     @ajtritt (#545)
-  - `ExternalResources` was changed to support storing both names and URIs for resources. @mavaylon (#517, #548)
+  - `ExternalResources` was changed to support storing both names and URIs for resources. @mavaylon1 (#517, #548)
   - The `VocabData` data type was replaced by `EnumData` to provide more flexible support for data from a set of
     fixed values.
   - Added `AlignedDynamicTable`, which defines a `DynamicTable` that supports storing a collection of sub-tables.
diff --git a/docs/gallery/plot_term_set.py b/docs/gallery/plot_term_set.py
index c1f7c7257..8bf2375aa 100644
--- a/docs/gallery/plot_term_set.py
+++ b/docs/gallery/plot_term_set.py
@@ -67,6 +67,7 @@
 """
 from hdmf.common import DynamicTable, VectorData
 import os
+import numpy as np

 try:
     import linkml_runtime  # noqa: F401
@@ -129,6 +130,19 @@
     data=TermSetWrapper(value=['Homo sapiens'], termset=terms)
 )

+######################################################
+# Validate Compound Data with TermSetWrapper
+# ----------------------------------------------------
+# :py:class:`~hdmf.term_set.TermSetWrapper` can also wrap compound data.
+# The user sets the field within the compound data type that is to be validated
+# against the termset.
+c_data = np.array([('Homo sapiens', 24)], dtype=[('species', 'U50'), ('age', 'i4')])
+data = VectorData(
+    name='species',
+    description='...',
+    data=TermSetWrapper(value=c_data, termset=terms, field='species')
+)
+
 ######################################################
 # Validate Attributes with TermSetWrapper
 # ----------------------------------------------------
diff --git a/src/hdmf/data_utils.py b/src/hdmf/data_utils.py
index 2df66106d..23f0b4019 100644
--- a/src/hdmf/data_utils.py
+++ b/src/hdmf/data_utils.py
@@ -20,7 +20,10 @@ def append_data(data, arg):
         data.append(arg)
         return data
     elif isinstance(data, np.ndarray):
-        return np.append(data, np.expand_dims(arg, axis=0), axis=0)
+        if len(data.dtype) > 0:  # data is a structured array
+            return np.append(data, arg)
+        else:  # arg is a scalar or row vector
+            return np.append(data, np.expand_dims(arg, axis=0), axis=0)
     elif isinstance(data, h5py.Dataset):
         shape = list(data.shape)
         shape[0] += 1
diff --git a/src/hdmf/term_set.py b/src/hdmf/term_set.py
index 1464f505c..0f42819b0 100644
--- a/src/hdmf/term_set.py
+++ b/src/hdmf/term_set.py
@@ -216,19 +216,26 @@ class TermSetWrapper:
         {'name': 'value', 'type': (list, np.ndarray, dict, str, tuple),
          'doc': 'The target item that is wrapped, either data or attribute.'},
+        {'name': 'field', 'type': str, 'default': None,
+         'doc': 'The field within a compound array.'}
     )
     def __init__(self, **kwargs):
         self.__value = kwargs['value']
         self.__termset = kwargs['termset']
+        self.__field = kwargs['field']
         self.__validate()

     def __validate(self):
-        # check if list, tuple, array
-        if isinstance(self.__value, (list, np.ndarray, tuple)): # TODO: Future ticket on DataIO support
-            values = self.__value
-        # create list if none of those -> mostly for attributes
+        if self.__field is not None:
+            values = self.__value[self.__field]
         else:
-            values = [self.__value]
+            # check if list, tuple, array
+            if isinstance(self.__value, (list, np.ndarray, tuple)):
+                values = self.__value
+            # create list if none of those -> mostly for scalar attributes
+            else:
+                values = [self.__value]
+
         # iteratively validate
         bad_values = []
         for term in values:
@@ -243,6 +250,10 @@ def __validate(self):
     def value(self):
         return self.__value

+    @property
+    def field(self):
+        return self.__field
+
     @property
     def termset(self):
         return self.__termset
@@ -273,26 +284,55 @@ def __iter__(self):
         """
         return self.__value.__iter__()

+    def __multi_validation(self, data):
+        """
+        Internal bulk validation used by both append and extend.
+        Appending to a numpy array (np.append) behaves like list extend rather than list append,
+        so a single appended structured array (compound data) can carry multiple items, each of
+        which needs to be validated.
+        """
+        bad_values = []
+        for item in data:
+            if not self.termset.validate(term=item):
+                bad_values.append(item)
+        return bad_values
+
     def append(self, arg):
         """
         This append resolves the wrapper to use the append of the container using the wrapper.
         """
-        if self.termset.validate(term=arg):
-            self.__value = append_data(self.__value, arg)
+        if isinstance(arg, np.ndarray):
+            if self.__field is not None:  # compound array
+                values = arg[self.__field]
+            else:
+                msg = "Array needs to be a structured array with compound dtype. If this does not apply, use extend."
+                raise ValueError(msg)
         else:
-            msg = ('"%s" is not in the term set.' % arg)
+            values = [arg]
+
+        bad_values = self.__multi_validation(values)
+
+        if len(bad_values) != 0:
+            msg = ('"%s" is not in the term set.' % ', '.join([str(value) for value in bad_values]))
             raise ValueError(msg)

+        self.__value = append_data(self.__value, arg)
+
     def extend(self, arg):
         """
         This extend resolves the wrapper to use the extend of the container using the wrapper.
         """
-        bad_data = []
-        for item in arg:
-            if not self.termset.validate(term=item):
-                bad_data.append(item)
+        if isinstance(arg, np.ndarray):
+            if self.__field is not None:  # compound array
+                values = arg[self.__field]
+            else:
+                values = arg
+        else:
+            values = arg
+
+        bad_data = self.__multi_validation(values)

         if len(bad_data) == 0:
             self.__value = extend_data(self.__value, arg)
diff --git a/tests/unit/common/test_table.py b/tests/unit/common/test_table.py
index f2d03332f..00b3c14a3 100644
--- a/tests/unit/common/test_table.py
+++ b/tests/unit/common/test_table.py
@@ -220,6 +220,101 @@ def test_add_row_validate_bad_data_all_col(self):
         with self.assertRaises(ValueError):
             species.add_row(Species_1='bad data', Species_2='bad data')

+    def test_compound_data_append(self):
+        c_data = np.array([('Homo sapiens', 24)], dtype=[('species', 'U50'), ('age', 'i4')])
+        c_data2 = np.array([('Mus musculus', 24)], dtype=[('species', 'U50'), ('age', 'i4')])
+        compound_vector_data = VectorData(
+            name='Species_1',
+            description='...',
+            data=c_data
+        )
+        compound_vector_data.append(c_data2)
+
+        np.testing.assert_array_equal(compound_vector_data.data, np.append(c_data, c_data2))
+
+    @unittest.skipIf(not REQUIREMENTS_INSTALLED, "optional LinkML module is not installed")
+    def test_array_append_error(self):
+        c_data = np.array(['Homo sapiens'])
+        c_data2 = np.array(['Mus musculus'])
+
+        terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml')
+        vectordata_termset = VectorData(
+            name='Species_1',
+            description='...',
+            data=TermSetWrapper(value=c_data, termset=terms)
+        )
+
+        with self.assertRaises(ValueError):
+            vectordata_termset.append(c_data2)
+
+    def test_compound_data_extend(self):
+        c_data = np.array([('Homo sapiens', 24)], dtype=[('species', 'U50'), ('age', 'i4')])
+        c_data2 = np.array([('Mus musculus', 24)], dtype=[('species', 'U50'), ('age', 'i4')])
+        compound_vector_data = VectorData(
+            name='Species_1',
+            description='...',
+            data=c_data
+        )
+        compound_vector_data.extend(c_data2)
+
+        np.testing.assert_array_equal(compound_vector_data.data, np.vstack((c_data, c_data2)))
+
+    @unittest.skipIf(not REQUIREMENTS_INSTALLED, "optional LinkML module is not installed")
+    def test_add_ref_wrapped_array_append(self):
+        data = np.array(['Homo sapiens'])
+        data2 = 'Mus musculus'
+        terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml')
+        vector_data = VectorData(
+            name='Species_1',
+            description='...',
+            data=TermSetWrapper(value=data, termset=terms)
+        )
+        vector_data.append(data2)
+
+        np.testing.assert_array_equal(vector_data.data.data, np.append(data, data2))
+
+    @unittest.skipIf(not REQUIREMENTS_INSTALLED, "optional LinkML module is not installed")
+    def test_add_ref_wrapped_array_extend(self):
+        data = np.array(['Homo sapiens'])
+        data2 = np.array(['Mus musculus'])
+        terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml')
+        vector_data = VectorData(
+            name='Species_1',
+            description='...',
+            data=TermSetWrapper(value=data, termset=terms)
+        )
+        vector_data.extend(data2)
+
+        np.testing.assert_array_equal(vector_data.data.data, np.vstack((data, data2)))
+
+    @unittest.skipIf(not REQUIREMENTS_INSTALLED, "optional LinkML module is not installed")
+    def test_add_ref_wrapped_compound_data_append(self):
+        c_data = np.array([('Homo sapiens', 24)], dtype=[('species', 'U50'), ('age', 'i4')])
+        c_data2 = np.array([('Mus musculus', 24)], dtype=[('species', 'U50'), ('age', 'i4')])
+        terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml')
+        compound_vector_data = VectorData(
+            name='Species_1',
+            description='...',
+            data=TermSetWrapper(value=c_data, field='species', termset=terms)
+        )
+        compound_vector_data.append(c_data2)
+
+        np.testing.assert_array_equal(compound_vector_data.data.data, np.append(c_data, c_data2))
+
+    @unittest.skipIf(not REQUIREMENTS_INSTALLED, "optional LinkML module is not installed")
+    def test_add_ref_wrapped_compound_data_extend(self):
+        c_data = np.array([('Homo sapiens', 24)], dtype=[('species', 'U50'), ('age', 'i4')])
+        c_data2 = np.array([('Mus musculus', 24)], dtype=[('species', 'U50'), ('age', 'i4')])
+        terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml')
+        compound_vector_data = VectorData(
+            name='Species_1',
+            description='...',
+            data=TermSetWrapper(value=c_data, field='species', termset=terms)
+        )
+        compound_vector_data.extend(c_data2)
+
+        np.testing.assert_array_equal(compound_vector_data.data.data, np.vstack((c_data, c_data2)))
+
     def test_constructor_bad_columns(self):
         columns = ['bad_column']
         msg = "'columns' must be a list of dict, VectorData, DynamicTableRegion, or VectorIndex"
diff --git a/tests/unit/test_term_set.py b/tests/unit/test_term_set.py
index 99bd6bf59..1d7721f1b 100644
--- a/tests/unit/test_term_set.py
+++ b/tests/unit/test_term_set.py
@@ -155,21 +155,22 @@ def setUp(self):
         self.wrapped_array = TermSetWrapper(value=np.array(['Homo sapiens']), termset=self.termset)
         self.wrapped_list = TermSetWrapper(value=['Homo sapiens'], termset=self.termset)

+        c_data = np.array([('Homo sapiens', 24)], dtype=[('species', 'U50'), ('age', 'i4')])
+        self.wrapped_comp_array = TermSetWrapper(value=c_data,
+                                                 termset=self.termset,
+                                                 field='species')
+
         self.np_data = VectorData(
             name='Species_1',
             description='...',
             data=self.wrapped_array
         )
-        self.list_data = VectorData(
-            name='Species_1',
-            description='...',
-            data=self.wrapped_list
-        )

     def test_properties(self):
         self.assertEqual(self.wrapped_array.value, ['Homo sapiens'])
         self.assertEqual(self.wrapped_array.termset.view_set, self.termset.view_set)
         self.assertEqual(self.wrapped_array.dtype, 'U12')  # this covers __getattr__
+        self.assertEqual(self.wrapped_comp_array.field, 'species')

     def test_get_item(self):
         self.assertEqual(self.np_data.data[0], 'Homo sapiens')
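
Editor's note: the patch above adds field-aware validation to TermSetWrapper. The standalone
sketch below illustrates that field-selection logic outside of HDMF, for readers who want to
see the mechanism without the optional LinkML dependency. The VALID_SPECIES set and the
validate_field helper are hypothetical stand-ins for a real TermSet and for
TermSetWrapper.__validate; they are not part of the patch series.

import numpy as np

# Hypothetical stand-in for a TermSet; a real TermSet checks terms against
# a LinkML schema such as example_test_term_set.yaml.
VALID_SPECIES = {'Homo sapiens', 'Mus musculus'}

def validate_field(arr, field=None):
    # Mirror the patched logic: when a field is given, validate only that
    # column of the structured (compound) array; otherwise validate every item.
    values = arr[field] if field is not None else arr
    bad_values = [v for v in values if v not in VALID_SPECIES]
    if len(bad_values) != 0:
        raise ValueError('"%s" is not in the term set.' % ', '.join(map(str, bad_values)))

c_data = np.array([('Homo sapiens', 24)], dtype=[('species', 'U50'), ('age', 'i4')])
validate_field(c_data, field='species')  # passes; the 'age' column is ignored

bad = np.array([('Dragon', 7)], dtype=[('species', 'U50'), ('age', 'i4')])
try:
    validate_field(bad, field='species')
except ValueError as err:
    print(err)  # "Dragon" is not in the term set.

Selecting the column first (arr[field]) is what lets a single wrapped compound array reuse the
same bulk-validation path that extend uses for flat arrays.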