Fix incorrect dtype precision upgrade for VectorIndex (#631)

hdmf-dev · Jun 16, 2021 · 929ec93 · 929ec93
1 parent 36db0e8
commit 929ec93
Show file tree

Hide file tree

Showing 3 changed files with 81 additions and 8 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,14 +1,19 @@
 # HDMF Changelog
 
-## HDMF 2.5.8 (Upcoming)
+## HDMF 2.5.8 (June 16, 2021)
 
 ### Minor improvements
 - Improve Sphinx documentation. @rly (#627)
 
+### Bug fix
+- Fix error with representing an indexed table column when the `VectorIndex` dtype precision is upgraded more
+  than one step, e.g., uint8 to uint32. This can happen when, for example, a single `add_row` call is used to
+  add more than 65535 elements to an empty indexed column. @rly (#631)
+
 ## HDMF 2.5.7 (June 4, 2021)
 
 ### Bug fix
-- Fix generation of extension classes that extend MultiContainerInterface and use a custom _fieldsname. @rly (#626)
+- Fix generation of extension classes that extend `MultiContainerInterface` and use a custom _fieldsname. @rly (#626)
 
 ## HDMF 2.5.6 (May 19, 2021)
 

diff --git a/src/hdmf/common/table.py b/src/hdmf/common/table.py
@@ -107,28 +107,33 @@ def add_vector(self, arg, **kwargs):
 
     def __check_precision(self, idx):
         """
-        Check precision of current dataset and, if
-        necessary, adjust precision to accommodate new value.
+        Check precision of current dataset and, if necessary, adjust precision to accommodate new value.
 
         Returns:
             unsigned integer encoding of idx
         """
         if idx > self.__maxval:
-            nbits = (np.log2(self.__maxval + 1) * 2)
+            while idx > self.__maxval:
+                nbits = (np.log2(self.__maxval + 1) * 2)  # 8->16, 16->32, 32->64
+                if nbits == 128:  # pragma: no cover
+                    msg = ('Cannot store more than 18446744073709551615 elements in a VectorData. Largest dtype '
+                           'allowed for VectorIndex is uint64.')
+                    raise ValueError(msg)
+                self.__maxval = 2 ** nbits - 1
             self.__uint = np.dtype('uint%d' % nbits).type
-            self.__maxval = 2 ** nbits - 1
             self.__adjust_precision(self.__uint)
         return self.__uint(idx)
 
     def __adjust_precision(self, uint):
         """
-        Adjust precision of data to specificied unsigned integer precision
+        Adjust precision of data to specificied unsigned integer precision.
         """
         if isinstance(self.data, list):
             for i in range(len(self.data)):
                 self.data[i] = uint(self.data[i])
         elif isinstance(self.data, np.ndarray):
-            self._VectorIndex__data = self.data.astype(uint)
+            # use self._Data__data to work around restriction on resetting self.data
+            self._Data__data = self.data.astype(uint)
         else:
             raise ValueError("cannot adjust precision of type %s to %s", (type(self.data), uint))
 

diff --git a/tests/unit/common/test_table.py b/tests/unit/common/test_table.py
@@ -1838,3 +1838,66 @@ def test_dtr_references(self):
                                      'y': [read_group1, read_group2]},
                                     index=pd.Index(data=[102, 103], name='id'))
             pd.testing.assert_frame_equal(ret, expected)
+
+
+class TestVectorIndexDtype(TestCase):
+
+    def set_up_array_index(self):
+        data = VectorData(name='data', description='desc')
+        index = VectorIndex(name='index', data=np.array([]), target=data)
+        return index
+
+    def set_up_list_index(self):
+        data = VectorData(name='data', description='desc')
+        index = VectorIndex(name='index', data=[], target=data)
+        return index
+
+    def test_array_inc_precision(self):
+        index = self.set_up_array_index()
+        index.add_vector(np.empty((255, )))
+        self.assertEqual(index.data[0], 255)
+        self.assertEqual(index.data.dtype, np.uint8)
+
+    def test_array_inc_precision_1step(self):
+        index = self.set_up_array_index()
+        index.add_vector(np.empty((65535, )))
+        self.assertEqual(index.data[0], 65535)
+        self.assertEqual(index.data.dtype, np.uint16)
+
+    def test_array_inc_precision_2steps(self):
+        index = self.set_up_array_index()
+        index.add_vector(np.empty((65536, )))
+        self.assertEqual(index.data[0], 65536)
+        self.assertEqual(index.data.dtype, np.uint32)
+
+    def test_array_prev_data_inc_precision_2steps(self):
+        index = self.set_up_array_index()
+        index.add_vector(np.empty((255, )))  # dtype is still uint8
+        index.add_vector(np.empty((65536, )))
+        self.assertEqual(index.data[0], 255)  # make sure the 255 is upgraded
+        self.assertEqual(index.data.dtype, np.uint32)
+
+    def test_list_inc_precision(self):
+        index = self.set_up_list_index()
+        index.add_vector(list(range(255)))
+        self.assertEqual(index.data[0], 255)
+        self.assertEqual(type(index.data[0]), np.uint8)
+
+    def test_list_inc_precision_1step(self):
+        index = self.set_up_list_index()
+        index.add_vector(list(range(65535)))
+        self.assertEqual(index.data[0], 65535)
+        self.assertEqual(type(index.data[0]), np.uint16)
+
+    def test_list_inc_precision_2steps(self):
+        index = self.set_up_list_index()
+        index.add_vector(list(range(65536)))
+        self.assertEqual(index.data[0], 65536)
+        self.assertEqual(type(index.data[0]), np.uint32)
+
+    def test_list_prev_data_inc_precision_2steps(self):
+        index = self.set_up_list_index()
+        index.add_vector(list(range(255)))
+        index.add_vector(list(range(65536 - 255)))
+        self.assertEqual(index.data[0], 255)  # make sure the 255 is upgraded
+        self.assertEqual(type(index.data[0]), np.uint32)