Merge pull request #14 from scikit-hep/feature-persistence

persistence
scikit-hep · Oct 24, 2018 · d23a327 · d23a327
2 parents dd30cc6 + 141ff9d
commit d23a327
Show file tree

Hide file tree

Showing 25 changed files with 1,958 additions and 867 deletions.
diff --git a/awkward/__init__.py b/awkward/__init__.py
@@ -39,7 +39,9 @@
 
 from awkward.generate import fromiter
 
+from awkward.persist import serialize, deserialize, save, load, tohdf5, fromhdf5
+
 # convenient access to the version number
 from awkward.version import __version__
 
-__all__ = ["ChunkedArray", "AppendableArray", "IndexedArray", "ByteIndexedArray", "SparseArray", "JaggedArray", "ByteJaggedArray", "MaskedArray", "BitMaskedArray", "IndexedMaskedArray", "Methods", "ObjectArray", "Table", "UnionArray", "VirtualArray", "fromiter", "__version__"]
+__all__ = ["ChunkedArray", "AppendableArray", "IndexedArray", "ByteIndexedArray", "SparseArray", "JaggedArray", "ByteJaggedArray", "MaskedArray", "BitMaskedArray", "IndexedMaskedArray", "Methods", "ObjectArray", "Table", "UnionArray", "VirtualArray", "fromiter", "serialize", "deserialize", "save", "load", "tohdf5", "fromhdf5", "__version__"]
diff --git a/awkward/array/base.py b/awkward/array/base.py
@@ -30,6 +30,8 @@
 
 import types
 
+import awkward.persist
+import awkward.type
 import awkward.util
 
 class AwkwardArray(awkward.util.NDArrayOperatorsMixin):
@@ -38,6 +40,16 @@ def __array__(self, *args, **kwargs):
         # raise Exception("{0} {1}".format(args, kwargs))
         return awkward.util.numpy.array(self, *args, **kwargs)
 
+    def __getstate__(self):
+        payload = {}    # handed to awkward.persist.serialize, then returned as the pickle state
+        awkward.persist.serialize(self, payload)
+        return payload
+
+    def __setstate__(self, state):
+        restored = awkward.persist.deserialize(state)
+        self.__dict__.update(restored.__dict__)    # adopt the deserialized object's attributes...
+        self.__class__ = restored.__class__        # ...and take on its concrete class
+
     def __iter__(self):
         for i in range(len(self)):
             yield self[i]
@@ -51,6 +63,18 @@ def __str__(self):
     def __repr__(self):
         return "<{0} {1} at {2:012x}>".format(self.__class__.__name__, str(self), id(self))
 
+    @property
+    def type(self):
+        return awkward.type.ArrayType(*(self._getshape() + (awkward.type._resolve(self._gettype({}), {}),)))    # ArrayType(*shape, resolved element type); relies on _getshape() returning a tuple
+
+    @property
+    def dtype(self):
+        return self.type.dtype    # read from the ArrayType built by the `type` property
+
+    @property
+    def shape(self):
+        return self.type.shape    # read from the ArrayType built by the `type` property
+
     def _try_tolist(self, x):
         try:
             return x.tolist()
@@ -84,9 +108,6 @@ def tolist(self):
                 out.append(self._try_tolist(x))
         return out
 
-    def _valid(self):
-        pass
-
     def valid(self):
         try:
             self._valid()

diff --git a/awkward/array/chunked.py b/awkward/array/chunked.py
@@ -29,6 +29,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import awkward.array.base
+import awkward.persist
 import awkward.type
 import awkward.util
 
@@ -76,6 +77,15 @@ def ones_like(self, **overrides):
         mine = self._mine(overrides)
         return self.copy([awkward.util.numpy.ones_like(x) if isinstance(x, awkward.util.numpy.ndarray) else x.ones_like(**overrides) for x in self._chunks], counts=list(self._counts), **mine)
 
+    def __awkward_persist__(self, ident, fill, **kwargs):
+        self.knowcounts()
+        self._valid()
+        name = self.__class__.__name__
+        chunks = [fill(chunk, name + ".chunk", **kwargs) for count, chunk in zip(self._counts, self._chunks) if count > 0]
+        counts = fill(awkward.util.numpy.array([count for count in self._counts if count > 0]), name + ".counts", **kwargs)
+        # chunks whose count is not positive are left out of both the chunk list and the counts array
+        return {"id": ident, "call": ["awkward", name], "args": [{"list": chunks}, counts]}
+
     @property
     def chunks(self):
         return self._chunks
@@ -129,11 +139,11 @@ def knowcounts(self, until=None):
     def knowtype(self, at):
         if not 0 <= at < len(self._chunks):
             raise ValueError("cannot knowtype at chunkid {0} with {1} chunks".format(at, len(self._chunks)))
-        tpe = awkward.type.fromarray(self._chunks[at])
-        if tpe.takes == 0:
+        chunk = self._chunks[at]
+        if len(chunk) == 0:    # empty chunk: store the () placeholder without calling awkward.type.fromarray
             self._types[at] = ()
         else:
-            self._types[at] = tpe.to
+            self._types[at] = awkward.type.fromarray(chunk).to
         return self._types[at]
 
     def global2chunkid(self, index, return_normalized=False):
@@ -219,7 +229,7 @@ def local2global(self, index, chunkid):
             else:
                 raise TypeError("local2global requires index and chunkid to be integers or arrays of integers")
 
-    def _type(self):
+    def _gettype(self, seen):
         for tpe in self._types:
             if tpe is not None and tpe is not ():
                 break
@@ -230,24 +240,24 @@ def _type(self):
                     break
             else:
                 tpe = awkward.util.DEFAULTTYPE
-        return awkward.type.ArrayType(len(self), tpe)
 
-    @property
-    def type(self):
-        return self._valid()
+        for i in range(len(self._types)):
+            if self._types[i] is None or self._types[i] is () or self._types[i] is tpe:
+                pass
+            elif self._types[i] == tpe:       # valid if all chunks have the same high-level type
+                self._types[i] = tpe          # once checked, make them identically equal for faster checking next time
+            else:
+                raise TypeError("chunks do not have matching types:\n\n{0}\n\nversus\n\n{1}".format(awkward.type._str(tpe, indent="    "), awkward.type._str(self._types[i], indent="    ")))
+
+        return tpe
+
+    def _getshape(self):
+        return (len(self),)    # 1-tuple holding only the total length
 
     def __len__(self):
         self.knowcounts()
         return self.offsets[-1]
 
-    @property
-    def shape(self):
-        return self.type.shape
-
-    @property
-    def dtype(self):
-        return self.type.dtype
-
     def _slices(self):
         # perhaps this should be a (public) @staticmethod that finds the largest possible slices to serve no more than one chunk each from a set of ChunkedArrays
         self.knowcounts()
@@ -260,17 +270,7 @@ def _valid(self):
         for i, count in enumerate(self._counts):
             if count != len(self._chunks[i]):
                 raise ValueError("count[{0}] does not agree with len(chunk[{0}])".format(i))
-
-        tpe = self._type()
-        for i in range(len(self._types)):
-            if self._types[i] is None or self._types[i] is () or self._types[i] is tpe.to:
-                pass
-            elif self._types[i] == tpe.to:    # valid if all chunks have the same high-level type
-                self._types[i] = tpe.to       # once checked, make them identically equal for faster checking next time
-            else:
-                raise TypeError("chunks do not have matching types:\n\n{0}\n\nversus\n\n{1}".format(tpe.to.__str__(indent="    "), self._types[i].__str__(indent="    ")))
-
-        return tpe
+        self._gettype({})
 
     def __str__(self):
         if self.countsknown:
@@ -633,6 +633,23 @@ def _mine(self, overrides):
         mine["dtype"] = overrides.pop("dtype", self._dtype)
         return mine
 
+    def __awkward_persist__(self, ident, fill, **kwargs):
+        self._valid()
+        name = self.__class__.__name__
+
+        kept = []    # chunks with a positive count, in their original order
+        for count, chunk in zip(self._counts, self._chunks):
+            if count <= 0:
+                continue
+            # a chunk longer than its count is cut down to its first `count` items
+            kept.append(chunk[:count] if count < len(chunk) else chunk)
+
+        return {"id": ident,
+                "call": ["awkward", name],
+                "args": [{"tuple": list(self._chunkshape)},
+                         {"call": ["awkward.persist", "json2dtype"], "args": [awkward.persist.dtype2json(self._dtype)]},
+                         {"list": [fill(chunk, name + ".chunk", **kwargs) for chunk in kept]}]}
+
     @property
     def chunkshape(self):
         return self._chunkshape
@@ -644,7 +661,7 @@ def chunkshape(self, value):
         else:
             try:
                 for x in value:
-                    assert isinstance(x, awkward.util.integer) and value > 0
+                    assert isinstance(x, awkward.util.integer) and x > 0
             except TypeError:
                 raise TypeError("chunkshape must be an integer or a tuple of integers")
             except AssertionError:
@@ -696,13 +713,11 @@ def offsets(self):
         import awkward.array.jagged
         return awkward.array.jagged.counts2offsets(self._counts)
 
-    @property
-    def type(self):
-        return awkward.type.ArrayType(*(self.shape + (self._dtype,)))
+    def _gettype(self, seen):
+        return self._dtype    # element type is the stored dtype; `seen` is not consulted here
 
-    @property
-    def shape(self):
-        return (len(self),) + self._chunkshape[1:]
+    def _getshape(self):
+        return (sum(self._counts),) + tuple(self._chunkshape[1:])    # must be a tuple (the `type` property concatenates it with the element type); keeps the inner dimensions the removed `shape` property reported
 
     def _valid(self):
         pass