diff --git a/awkward/__init__.py b/awkward/__init__.py index d935910f..6606ebe8 100644 --- a/awkward/__init__.py +++ b/awkward/__init__.py @@ -37,11 +37,13 @@ from awkward.array.union import UnionArray from awkward.array.virtual import VirtualArray +from awkward.derived.strings import StringArray + from awkward.generate import fromiter -from awkward.persist import serialize, deserialize, save, load, tohdf5, fromhdf5 +from awkward.persist import serialize, deserialize, save, load, hdf5 # convenient access to the version number from awkward.version import __version__ -__all__ = ["ChunkedArray", "AppendableArray", "IndexedArray", "ByteIndexedArray", "SparseArray", "JaggedArray", "ByteJaggedArray", "MaskedArray", "BitMaskedArray", "IndexedMaskedArray", "Methods", "ObjectArray", "Table", "UnionArray", "VirtualArray", "fromiter", "serialize", "deserialize", "save", "load", "tohdf5", "fromhdf5", "__version__"] +__all__ = ["ChunkedArray", "AppendableArray", "IndexedArray", "ByteIndexedArray", "SparseArray", "JaggedArray", "ByteJaggedArray", "MaskedArray", "BitMaskedArray", "IndexedMaskedArray", "Methods", "ObjectArray", "Table", "UnionArray", "VirtualArray", "StringArray", "fromiter", "serialize", "deserialize", "save", "load", "hdf5", "__version__"] diff --git a/awkward/array/base.py b/awkward/array/base.py index 532243e9..fed473f9 100644 --- a/awkward/array/base.py +++ b/awkward/array/base.py @@ -35,6 +35,10 @@ import awkward.util class AwkwardArray(awkward.util.NDArrayOperatorsMixin): + """ + AwkwardArray: abstract base class + """ + def __array__(self, *args, **kwargs): # hitting this function is usually undesirable; uncomment to search for performance bugs # raise Exception("{0} {1}".format(args, kwargs)) @@ -187,6 +191,10 @@ def minby(self, function): return self[function(*args, **kwargs).argmin()] class AwkwardArrayWithContent(AwkwardArray): + """ + AwkwardArrayWithContent: abstract base class + """ + def __setitem__(self, where, what): if isinstance(where, awkward.util.string): self._content[where] = what diff --git a/awkward/array/chunked.py b/awkward/array/chunked.py index 344303b2..9fa37caf 100644 --- a/awkward/array/chunked.py +++ b/awkward/array/chunked.py @@ -34,6 +34,10 @@ import awkward.util class ChunkedArray(awkward.array.base.AwkwardArray): + """ + ChunkedArray + """ + def __init__(self, chunks, counts=[]): self.chunks = chunks self.counts = counts @@ -77,14 +81,13 @@ def ones_like(self, **overrides): mine = self._mine(overrides) return self.copy([awkward.util.numpy.ones_like(x) if isinstance(x, awkward.util.numpy.ndarray) else x.ones_like(**overrides) for x in self._chunks], counts=list(self._counts), **mine) - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self.knowcounts() self._valid() - n = self.__class__.__name__ return {"id": ident, - "call": ["awkward", n], - "args": [{"list": [fill(x, n + ".chunk", **kwargs) for c, x in zip(self._counts, self._chunks) if c > 0]}, - fill(awkward.util.numpy.array([c for c in self._counts if c > 0]), n + ".counts", **kwargs)]} + "call": ["awkward", self.__class__.__name__], + "args": [{"list": [fill(x, self.__class__.__name__ + ".chunk", prefix, suffix, schemasuffix, storage, compression, **kwargs) for c, x in zip(self._counts, self._chunks) if c > 0]}, + {"json": [int(c) for c in self._counts if c > 0]}]} @property def chunks(self): @@ -611,6 +614,10 @@ def pandas(self): raise NotImplementedError class AppendableArray(ChunkedArray): + """ + AppendableArray + """ + def __init__(self, chunkshape, dtype, chunks=[]): self.chunkshape = chunkshape self.dtype = dtype @@ -633,10 +640,9 @@ def _mine(self, overrides): mine["dtype"] = overrides.pop("dtype", self._dtype) return mine - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ - + chunks = [] for c, x in zip(self._counts, self._chunks): if 0 < c < len(x): @@ -645,10 +651,10 @@ def __awkward_persist__(self, ident, fill, **kwargs): chunks.append(x) return {"id": ident, - "call": ["awkward", n], - "args": [{"tuple": list(self._chunkshape)}, - {"call": ["awkward.persist", "json2dtype"], "args": [awkward.persist.dtype2json(self._dtype)]}, - {"list": [fill(x, n + ".chunk", **kwargs) for x in chunks]}]} + "call": ["awkward", self.__class__.__name__], + "args": [{"tuple": [{"json": int(x)} for x in self._chunkshape]}, + {"dtype": awkward.persist.dtype2json(self._dtype)}, + {"list": [fill(x, self.__class__.__name__ + ".chunk", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in chunks]}]} @property def chunkshape(self): diff --git a/awkward/array/indexed.py b/awkward/array/indexed.py index c7cfa9d5..a90702fa 100644 --- a/awkward/array/indexed.py +++ b/awkward/array/indexed.py @@ -46,6 +46,10 @@ def invert(permutation): return out class IndexedArray(awkward.array.base.AwkwardArrayWithContent): + """ + IndexedArray + """ + def __init__(self, index, content): self.index = index self.content = content @@ -87,13 +91,12 @@ def ones_like(self, **overrides): else: return self.copy(content=self._content.ones_like(**overrides)) - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ return {"id": ident, - "call": ["awkward", n], - "args": [fill(self._index, n + ".index", **kwargs), - fill(self._content, n + ".content", **kwargs)]} + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._index, self.__class__.__name__ + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs)]} @property def index(self): @@ -205,6 +208,10 @@ def pandas(self): return self._content[self._index].pandas() class ByteIndexedArray(IndexedArray): + """ + ByteIndexedArray + """ + def __init__(self, index, content, dtype): super(ByteIndexedArray, self).__init__(index, content) self.dtype = dtype @@ -258,14 +265,13 @@ def ones_like(self, **overrides): else: return self.copy(content=self._content.ones_like(**overrides), **mine) - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ return {"id": ident, - "call": ["awkward", n], - "args": [fill(self._index, n + ".index", **kwargs), - fill(self._content, n + ".content", **kwargs), - {"call": ["awkward.persist", "json2dtype"], "args": [awkward.persist.dtype2json(self._dtype)]}]} + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._index, self.__class__.__name__ + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + {"dtype": awkward.persist.dtype2json(self._dtype)}]} @property def content(self): @@ -365,6 +371,10 @@ def pandas(self): raise NotImplementedError class SparseArray(awkward.array.base.AwkwardArrayWithContent): + """ + SparseArray + """ + def __init__(self, length, index, content, default=None): self.length = length self.index = index @@ -423,22 +433,23 @@ def ones_like(self, **overrides): else: return self.copy(content=self._content.ones_like(**overrides), **mine) - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ - if self._default is None or isinstance(self._default, (numbers.Real, awkward.util.numpy.integer, awkward.util.numpy.floating)): - default = self._default - elif isinstance(self._default, awkward.util.numpy.ndarray): - default = fill(self._default, n + ".default") + if self._default is None: + default = {"json": self._default} + elif isinstance(self._default, (numbers.Integral, awkward.util.numpy.integer)): + default = {"json": int(self._default)} + elif isinstance(self._default, (numbers.Real, awkward.util.numpy.floating)) and awkward.util.numpy.isfinite(self._default): + default = {"json": float(self._default)} else: - default = {"call": ["pickle", "loads"], "args": pickle.dumps(self._default)} + default = fill(self._default, self.__class__.__name__ + ".default", prefix, suffix, schemasuffix, storage, compression, **kwargs) return {"id": ident, - "call": ["awkward", n], - "args": [self._length, - fill(self._index, n + ".index", **kwargs), - fill(self._content, n + ".content", **kwargs), + "call": ["awkward", self.__class__.__name__], + "args": [{"json": int(self._length)}, + fill(self._index, self.__class__.__name__ + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), default]} @property diff --git a/awkward/array/jagged.py b/awkward/array/jagged.py index ed4a405e..888adce8 100644 --- a/awkward/array/jagged.py +++ b/awkward/array/jagged.py @@ -102,6 +102,10 @@ def uniques2offsetsparents(uniques): return offsets, parents class JaggedArray(awkward.array.base.AwkwardArrayWithContent): + """ + JaggedArray + """ + def __init__(self, starts, stops, content): self.starts = starts self.stops = stops @@ -234,21 +238,19 @@ def ones_like(self, **overrides): else: return self.copy(content=self._content.ones_like(**overrides)) - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ if offsetsaliased(self._starts, self._stops) and len(self._starts) > 0 and self._starts[0] == 0: return {"id": ident, - "call": ["awkward", n, "fromcounts"], - "args": [fill(self.counts, n + ".counts", **kwargs), - fill(self._content, n + ".content", **kwargs)]} - + "call": ["awkward", self.__class__.__name__, "fromcounts"], + "args": [fill(self.counts, self.__class__.__name__ + ".counts", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs)]} else: return {"id": ident, - "call": ["awkward", n], - "args": [fill(self._starts, n + ".starts", **kwargs), - fill(self._stops, n + ".stops", **kwargs), - fill(self._content, n + ".content", **kwargs)]} + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._starts, self.__class__.__name__ + ".starts", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._stops, self.__class__.__name__ + ".stops", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs)]} @property def starts(self): @@ -663,19 +665,17 @@ def recurse(x): for i in range(len(inputs)): if isinstance(inputs[i], JaggedArray): - if good is None: - inputs[i] = inputs[i].content - else: - inputs[i] = inputs[i].content[good] + inputs[i] = inputs[i].flatten() result = getattr(ufunc, method)(*inputs, **kwargs) + counts = stops - starts if isinstance(result, tuple): - return tuple(awkward.array.objects.Methods.maybemixin(type(x), JaggedArray)(starts, stops, x) if isinstance(x, (awkward.util.numpy.ndarray, awkward.array.base.AwkwardBase)) else x for x in result) + return tuple(awkward.array.objects.Methods.maybemixin(type(x), JaggedArray).fromcounts(counts, x) if isinstance(x, (awkward.util.numpy.ndarray, awkward.array.base.AwkwardBase)) else x for x in result) elif method == "at": return None else: - return awkward.array.objects.Methods.maybemixin(type(result), JaggedArray)(starts, stops, result) + return awkward.array.objects.Methods.maybemixin(type(result), JaggedArray).fromcounts(counts, result) @staticmethod def aligned(*jaggedarrays): @@ -1093,6 +1093,10 @@ def pandas(self): return out class ByteJaggedArray(JaggedArray): + """ + ByteJaggedArray + """ + def __init__(self, starts, stops, content, subdtype): super(ByteJaggedArray, self).__init__(starts, stops, content) self.subdtype = subdtype @@ -1145,23 +1149,21 @@ def deepcopy(self, starts=None, stops=None, content=None, subdtype=None): out.subdtype = subdtype return out - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ if offsetsaliased(self._starts, self._stops) and len(self._starts) > 0 and self._starts[0] == 0: return {"id": ident, - "call": ["awkward", n, "fromcounts"], - "args": [fill(self.counts, n + ".counts", **kwargs), - fill(self._content, n + ".content", **kwargs), - {"call": ["awkward.persist", "json2dtype"], "args": [awkward.persist.dtype2json(self._subdtype)]}]} - + "call": ["awkward", self.__class__.__name__, "fromcounts"], + "args": [fill(self.counts, self.__class__.__name__ + ".counts", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + {"dtype": awkward.persist.dtype2json(self._subdtype)}]} else: return {"id": ident, - "call": ["awkward", n], - "args": [fill(self._starts, n + ".starts", **kwargs), - fill(self._stops, n + ".stops", **kwargs), - fill(self._content, n + ".content", **kwargs), - {"call": ["awkward.persist", "json2dtype"], "args": [awkward.persist.dtype2json(self._subdtype)]}]} + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._starts, self.__class__.__name__ + ".starts", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._stops, self.__class__.__name__ + ".stops", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + {"dtype": awkward.persist.dtype2json(self._subdtype)}]} @property def content(self): diff --git a/awkward/array/masked.py b/awkward/array/masked.py index f09d57ec..e470100b 100644 --- a/awkward/array/masked.py +++ b/awkward/array/masked.py @@ -36,6 +36,10 @@ import awkward.util class MaskedArray(awkward.array.base.AwkwardArrayWithContent): + """ + MaskedArray + """ + ### WTF were the designers of numpy.ma thinking? # @staticmethod # def is_masked(x): @@ -106,14 +110,13 @@ def ones_like(self, **overrides): else: return self.copy(content=self._content.ones_like(**overrides), **mine) - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ return {"id": ident, - "call": ["awkward", n], - "args": [fill(self._mask, n + ".mask", **kwargs), - fill(self._content, n + ".content", **kwargs), - self._maskedwhen]} + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._mask, self.__class__.__name__ + ".mask", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + {"json": bool(self._maskedwhen)}]} @property def mask(self): @@ -279,6 +282,10 @@ def pandas(self): raise NotImplementedError class BitMaskedArray(MaskedArray): + """ + BitMaskedArray + """ + def __init__(self, mask, content, maskedwhen=True, lsborder=False): super(BitMaskedArray, self).__init__(mask, content, maskedwhen=maskedwhen) self.lsborder = lsborder @@ -300,15 +307,14 @@ def _mine(self, overrides): mine["lsborder"] = overrides.pop("lsborder", self._lsborder) return mine - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ return {"id": ident, - "call": ["awkward", n], - "args": [fill(self._mask, n + ".mask", **kwargs), - fill(self._content, n + ".content", **kwargs), - self._maskedwhen, - self._lsborder]} + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._mask, self.__class__.__name__ + ".mask", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + {"json": bool(self._maskedwhen)}, + {"json": bool(self._lsborder)}]} @property def mask(self): @@ -507,6 +513,10 @@ def pandas(self): raise NotImplementedError class IndexedMaskedArray(MaskedArray): + """ + IndexedMaskedArray + """ + def __init__(self, mask, content, maskedwhen=-1): super(IndexedMaskedArray, self).__init__(mask, content, maskedwhen=maskedwhen) self._isvalid = False @@ -525,14 +535,13 @@ def copy(self, mask=None, content=None, maskedwhen=None): out._maskedwhen = maskedwhen return out - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ return {"id": ident, - "call": ["awkward", n], - "args": [fill(self._mask, n + ".mask", **kwargs), - fill(self._content, n + ".content", **kwargs), - self._maskedwhen]} + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._mask, self.__class__.__name__ + ".mask", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + {"json": int(self._maskedwhen)}]} @property def mask(self): diff --git a/awkward/array/objects.py b/awkward/array/objects.py index f90ff978..31fd10ce 100644 --- a/awkward/array/objects.py +++ b/awkward/array/objects.py @@ -29,13 +29,17 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import importlib -import pickle import awkward.array.base +import awkward.persist import awkward.type import awkward.util class Methods(object): + """ + Methods: abstract mix-in + """ + @staticmethod def mixin(methods, awkwardtype): assert issubclass(methods, Methods) @@ -55,7 +59,11 @@ def maybemixin(sample, awkwardtype): return awkwardtype class ObjectArray(awkward.array.base.AwkwardArrayWithContent): - def __init__(self, content, generator, *args, **kwargs): + """ + ObjectArray + """ + + def __init__(self, content, generator, args=(), kwargs={}): self.content = content self.generator = generator self.args = args @@ -112,32 +120,14 @@ def ones_like(self, **overrides): else: return self.copy(content=self._content.ones_like(**overrides), **mine) - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - - if self._generator.__module__ == "__main__": - raise TypeError("cannot persist ObjectArray: its generator is defined in __main__, which won't be available in a subsequent session") - if hasattr(self._generator, "__qualname__"): - spec = [self._generator.__module__] + self._generator.__qualname__.split(".") - else: - spec = [self._generator.__module__, self._generator.__name__] - - gen, genname = importlib.import_module(spec[0]), spec[1:] - while len(genname) > 0: - gen, genname = getattr(gen, genname[0]), genname[1:] - if gen is not self._generator: - raise TypeError("cannot persist ObjectArray: its generator cannot be found via its __name__ (Python 2) or __qualname__ (Python 3)") - - n = self.__class__.__name__ - out = {"id": ident, - "call": ["awkward", n], - "args": [fill(self._content, n + ".content", **kwargs), {"function": spec}]} - if len(self._args) > 0: - out["*"] = {"call": ["pickle", "loads"], "args": [pickle.dumps(self._args)]} - if len(self._kwargs) > 0: - out["**"] = {"call": ["pickle", "loads"], "args": [pickle.dumps(self._kwargs)]} - - return out + return {"id": ident, + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._content, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._generator, self.__class__.__name__ + ".generator", prefix, suffix, schemasuffix, storage, compression, **kwargs), + {"tuple": [fill(x, self.__class__.__name__ + ".args", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._args]}, + {"dict": {n: fill(x, self.__class__.__name__ + ".kwargs", prefix, suffix, schemasuffix, storage, compression, **kwargs) for n, x in self._kwargs.items()}}]} @property def content(self): @@ -175,16 +165,13 @@ def kwargs(self): def kwargs(self, value): if not isinstance(value, dict): raise TypeError("kwargs must be a dict") - self._kwargs = value + self._kwargs = dict(value) def __len__(self): return len(self._content) def _gettype(self, seen): - if len(self._content.shape) == 1: - return self._generator - else: - return awkward.type.ArrayType(*(self._content.shape[1:] + (self._generator,))) + return self._generator def _getshape(self): return (len(self._content),) diff --git a/awkward/array/table.py b/awkward/array/table.py index 32218e4c..753accec 100644 --- a/awkward/array/table.py +++ b/awkward/array/table.py @@ -35,9 +35,17 @@ import awkward.util class Table(awkward.array.base.AwkwardArray): + """ + Table + """ + ##################### class Row class Row(object): + """ + Table.Row + """ + __slots__ = ["_table", "_index"] def __init__(self, table, index): @@ -234,11 +242,11 @@ def ones_like(self, **overrides): out[n] = x.ones_like(**overrides) return out - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() return {"id": ident, "call": ["awkward", self.__class__.__name__, "frompairs"], - "args": [{"pairs": [[n, fill(x, self.__class__.__name__ + ".content", **kwargs)] for n, x in self._content.items()]}]} + "args": [{"pairs": [[n, fill(x, self.__class__.__name__ + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs)] for n, x in self._content.items()]}]} @property def base(self): diff --git a/awkward/array/union.py b/awkward/array/union.py index 34eb0e5b..0be9f873 100644 --- a/awkward/array/union.py +++ b/awkward/array/union.py @@ -33,6 +33,10 @@ import awkward.util class UnionArray(awkward.array.base.AwkwardArray): + """ + UnionArray + """ + def __init__(self, tags, index, contents): self.tags = tags self.index = index @@ -95,21 +99,20 @@ def issequential(self): return False return True - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ if self.issequential: return {"id": ident, - "call": ["awkward", n, "fromtags"], - "args": [fill(self._tags, n + ".tags", **kwargs), - {"list": [fill(x, n + ".contents", **kwargs) for x in self._contents]}]} + "call": ["awkward", self.__class__.__name__, "fromtags"], + "args": [fill(self._tags, self.__class__.__name__ + ".tags", prefix, suffix, schemasuffix, storage, compression, **kwargs), + {"list": [fill(x, self.__class__.__name__ + ".contents", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._contents]}]} else: return {"id": ident, - "call": ["awkward", n], - "args": [fill(self._tags, n + ".tags", **kwargs), - fill(self._index, n + ".index", **kwargs), - {"list": [fill(x, n + ".contents", **kwargs) for x in self._contents]}]} + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._tags, self.__class__.__name__ + ".tags", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self._index, self.__class__.__name__ + ".index", prefix, suffix, schemasuffix, storage, compression, **kwargs), + {"list": [fill(x, self.__class__.__name__ + ".contents", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._contents]}]} @property def tags(self): diff --git a/awkward/array/virtual.py b/awkward/array/virtual.py index e7de4a98..78febb5e 100644 --- a/awkward/array/virtual.py +++ b/awkward/array/virtual.py @@ -36,6 +36,10 @@ import awkward.util class VirtualArray(awkward.array.base.AwkwardArray): + """ + VirtualArray + """ + class TransientKey(object): def __init__(self, id): self._id = id @@ -50,8 +54,10 @@ def __ne__(self, other): def __getstate__(self): raise RuntimeError("VirtualArray.TransientKeys are not unique across processes, and hence should not be serialized") - def __init__(self, generator, cache=None, persistentkey=None, type=None, persistvirtual=True): + def __init__(self, generator, args=(), kwargs={}, cache=None, persistentkey=None, type=None, persistvirtual=True): self.generator = generator + self.args = args + self.kwargs = kwargs self.cache = cache self.persistentkey = persistentkey self.type = type @@ -60,9 +66,11 @@ def __init__(self, generator, cache=None, persistentkey=None, type=None, persist self._setitem = None self._delitem = None - def copy(self, generator=None, cache=None, persistentkey=None, type=None, persistvirtual=None): + def copy(self, generator=None, args=None, kwargs=None, cache=None, persistentkey=None, type=None, persistvirtual=None): out = self.__class__.__new__(self.__class__) out._generator = self._generator + out._args = self._args + out._kwargs = self._kwargs out._cache = self._cache out._persistentkey = self._persistentkey out._type = self._type @@ -78,6 +86,10 @@ def copy(self, generator=None, cache=None, persistentkey=None, type=None, persis out._delitem = list(self._delitem) if generator is not None: out.generator = generator + if args is not None: + out.args = args + if kwargs is not None: + out.kwargs = kwargs if cache is not None: out.cache = cache if persistentkey is not None: @@ -88,8 +100,8 @@ def copy(self, generator=None, cache=None, persistentkey=None, type=None, persis out.persistvirtual = persistvirtual return out - def deepcopy(self, generator=None, cache=None, persistentkey=None, type=None, persistvirtual=None): - out = self.copy(generator=generator, cache=cache, persistentkey=persistentkey, type=type, persistvirtual=persistvirtual) + def deepcopy(self, generator=None, args=None, kwargs=None, cache=None, persistentkey=None, type=None, persistvirtual=None): + out = self.copy(generator=generator, args=arge, kwargs=kwargs, cache=cache, persistentkey=persistentkey, type=type, persistvirtual=persistvirtual) out._array = awkward.util.deepcopy(out._array) if out._setitem is not None: for n in list(out._setitem): @@ -114,39 +126,31 @@ def ones_like(self, **overrides): else: return self.array.ones_like(**overrides) - def __awkward_persist__(self, ident, fill, **kwargs): + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): self._valid() - n = self.__class__.__name__ - + if self._persistvirtual: - if self._generator.__module__ == "__main__": - raise TypeError("cannot persist VirtualArray: its generator is defined in __main__, which won't be available in a subsequent session") - if hasattr(self._generator, "__qualname__"): - spec = [self._generator.__module__] + self._generator.__qualname__.split(".") - else: - spec = [self._generator.__module__, self._generator.__name__] - - gen, genname = importlib.import_module(spec[0]), spec[1:] - while len(genname) > 0: - gen, genname = getattr(gen, genname[0]), genname[1:] - if gen is not self._generator: - raise TypeError("cannot persist VirtualArray: its generator cannot be found via its __name__ (Python 2) or __qualname__ (Python 3)") - out = {"id": ident, - "call": ["awkward", n], - "args": [{"function": spec}], + "call": ["awkward", self.__class__.__name__], + "args": [fill(self._generator, self.__class__.__name__ + ".generator", prefix, suffix, schemasuffix, storage, compression, **kwargs), + {"tuple": [fill(x, self.__class__.__name__ + ".args", prefix, suffix, schemasuffix, storage, compression, **kwargs) for x in self._args]}, + {"dict": {n: fill(x, self.__class__.__name__ + ".kwargs", prefix, suffix, schemasuffix, storage, compression, **kwargs) for n, x in self._kwargs.items()}}], "cacheable": True} others = {} if self._persistentkey is not None: - others["persistentkey"] = self._persistentkey + try: + others["persistentkey"] = {"json": awkward.persist.jsonable(self._persistentkey)} + except TypeError: + others["persistentkey"] = {"python": awkward.persist.frompython(self._persistentkey)} + if self._type is not None: - others["type"] = {"call": ["awkward.persist", "json2type"], "args": [awkward.persist.type2json(self._type)], "whitelistable": True} + others["type"] = {"call": ["awkward.persist", "json2type"], "args": [{"json": awkward.persist.type2json(self._type)}], "whitelistable": True} if len(others) > 0: out["kwargs"] = others return out else: - return fill(self.array, n + ".array", **kwargs) + return fill(self.array, self.__class__.__name__ + ".array", prefix, suffix, schemasuffix, storage, compression, **kwargs) @property def generator(self): @@ -155,9 +159,30 @@ def generator(self): @generator.setter def generator(self, value): if not callable(value): - raise TypeError("generator must be a callable (of zero arguments)") + raise TypeError("generator must be a callable") self._generator = value + + @property + def args(self): + return self._args + + @args.setter + def args(self, value): + if not isinstance(value, tuple): + value = (value,) + self._args = value + + @property + def kwargs(self): + return self._kwargs + + @kwargs.setter + def kwargs(self, value): + if not isinstance(value, dict): + raise TypeError("kwargs must be a dict") + self._kwargs = dict(value) + @property def cache(self): return self._cache @@ -269,7 +294,7 @@ def ismaterialized(self): return self._array is not None and self._array in self._cache def materialize(self): - array = awkward.util.toarray(self._generator(), awkward.util.DEFAULTTYPE) + array = awkward.util.toarray(self._generator(*self._args, **self._kwargs), awkward.util.DEFAULTTYPE) if self._setitem is not None: for n, x in self._setitem.items(): array[n] = x @@ -277,8 +302,11 @@ def materialize(self): for n in self._delitem: del array[n] - if self._type is not None and self._type != awkward.type.fromarray(array): - raise TypeError("materialized array has type\n\n{0}\n\nexpected type\n\n{1}".format(awkward.type._str(awkward.type.fromarray(array), indent=" "), awkward.type._str(self._type, indent=" "))) + if self._type is not None: + materializedtype = awkward.type.fromarray(array) + if ((isinstance(self._type, awkward.type.Type) and not self._type._eq(materializedtype, set(), ignoremask=True)) or + (not isinstance(self._type, awkward.type.Type) and not self._type == materializedtype)): + raise TypeError("materialized array has type\n\n{0}\n\nexpected type\n\n{1}".format(awkward.type._str(awkward.type.fromarray(array), indent=" "), awkward.type._str(self._type, indent=" "))) if self._cache is None: # states (1), (2), and (6) diff --git a/awkward/arrow.py b/awkward/arrow.py new file mode 100644 index 00000000..4c3bcc58 --- /dev/null +++ b/awkward/arrow.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python + +# Copyright (c) 2018, DIANA-HEP +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import awkward.array.chunked +import awkward.array.indexed +import awkward.array.jagged +import awkward.array.masked +import awkward.array.table +import awkward.array.virtual +import awkward.derived.strings +import awkward.type +import awkward.util + +ARROW_BITMASKTYPE = awkward.util.numpy.uint8 +ARROW_INDEXTYPE = awkward.util.numpy.int32 +ARROW_TAGTYPE = awkward.util.numpy.uint8 +ARROW_CHARTYPE = awkward.util.numpy.uint8 + +def schema2type(schema): + import pyarrow + + def recurse(tpe, nullable): + if isinstance(tpe, pyarrow.lib.DictionaryType): + out = recurse(tpe.dictionary.type, nullable) + if nullable: + return awkward.type.OptionType(out) + else: + return out + + elif isinstance(tpe, pyarrow.lib.StructType): + out = None + for i in range(tpe.num_children): + x = awkward.type.ArrayType(tpe[i].name, recurse(tpe[i].type, tpe[i].nullable)) + if out is None: + out = x + else: + out = out & x + if nullable: + return awkward.type.OptionType(out) + else: + return out + + elif isinstance(tpe, pyarrow.lib.ListType): + out = awkward.type.ArrayType(float("inf"), recurse(tpe.value_type, nullable)) + if nullable: + return awkward.type.OptionType(out) + else: + return out + + elif isinstance(tpe, pyarrow.lib.UnionType): + out = None + for i in range(tpe.num_children): + x = recurse(tpe[i].type, nullable) + if out is None: + out = x + else: + out = out | x + if nullable: + return awkward.type.OptionType(out) + else: + return out + + elif tpe == pyarrow.string(): + if nullable: + return awkward.type.OptionType(str) + else: + return str + + elif tpe == pyarrow.binary(): + if nullable: + return awkward.type.OptionType(bytes) + else: + return bytes + + elif tpe == pyarrow.bool_(): + out = awkward.util.numpy.dtype(bool) + if nullable: + return awkward.type.OptionType(out) + else: + return out + + elif isinstance(tpe, pyarrow.lib.DataType): + if nullable: + return awkward.type.OptionType(tpe.to_pandas_dtype()) + else: + return tpe.to_pandas_dtype() + + else: + raise NotImplementedError(repr(tpe)) + + out = None + for name in schema.names: + field = schema.field_by_name(name) + mytype = awkward.type.ArrayType(name, recurse(field.type, field.nullable)) + if out is None: + out = mytype + else: + out = out & mytype + + return out + +def view(obj): + import pyarrow + + def popbuffers(tpe, buffers): + if isinstance(tpe, pyarrow.lib.DictionaryType): + content = view(tpe.dictionary) + index = popbuffers(tpe.index_type, buffers) + if isinstance(index, awkward.array.masked.BitMaskedArray): + return awkward.array.masked.BitMaskedArray(index.mask, awkward.array.indexed.IndexedArray(index.content, content), maskedwhen=index.maskedwhen, lsborder=index.lsborder) + else: + return awkward.array.indexed.IndexedArray(index, content) + + elif isinstance(tpe, pyarrow.lib.StructType): + pairs = [] + for i in range(tpe.num_children - 1, -1, -1): + pairs.insert(0, (tpe[i].name, popbuffers(tpe[i].type, buffers))) + out = awkward.array.table.Table.frompairs(pairs) + mask = buffers.pop() + if mask is not None: + mask = awkward.util.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, out, maskedwhen=False, lsborder=True) + else: + return out + + elif isinstance(tpe, pyarrow.lib.ListType): + content = popbuffers(tpe.value_type, buffers) + offsets = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_INDEXTYPE) + out = awkward.array.jagged.JaggedArray.fromoffsets(offsets, content) + mask = buffers.pop() + if mask is not None: + mask = awkward.util.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, out, maskedwhen=False, lsborder=True) + else: + return out + + elif isinstance(tpe, pyarrow.lib.UnionType) and tpe.mode == "sparse": + contents = [] + for i in range(tpe.num_children - 1, -1, -1): + contents.insert(0, popbuffers(tpe[i].type, buffers)) + assert buffers.pop() is None + tags = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_TAGTYPE) + index = awkward.util.numpy.arange(len(tags), dtype=ARROW_INDEXTYPE) + out = awkward.array.union.UnionArray(tags, index, contents) + mask = buffers.pop() + if mask is not None: + mask = awkward.util.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, out, maskedwhen=False, lsborder=True) + else: + return out + + elif isinstance(tpe, pyarrow.lib.UnionType) and tpe.mode == "dense": + contents = [] + for i in range(tpe.num_children - 1, -1, -1): + contents.insert(0, popbuffers(tpe[i].type, buffers)) + index = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_INDEXTYPE) + tags = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_TAGTYPE) + out = awkward.array.union.UnionArray(tags, index, contents) + mask = buffers.pop() + if mask is not None: + mask = awkward.util.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, out, maskedwhen=False, lsborder=True) + else: + return out + + elif tpe == pyarrow.string(): + content = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_CHARTYPE) + offsets = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_INDEXTYPE) + out = awkward.derived.strings.StringArray.fromoffsets(offsets, content, encoding="utf-8") + mask = buffers.pop() + if mask is not None: + mask = awkward.util.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, out, maskedwhen=False, lsborder=True) + else: + return out + + elif tpe == pyarrow.binary(): + content = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_CHARTYPE) + offsets = awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_INDEXTYPE) + out = awkward.derived.strings.StringArray.fromoffsets(offsets, content, encoding=None) + mask = buffers.pop() + if mask is not None: + mask = awkward.util.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, out, maskedwhen=False, lsborder=True) + else: + return out + + elif tpe == pyarrow.bool_(): + out = awkward.util.numpy.unpackbits(awkward.util.numpy.frombuffer(buffers.pop(), dtype=ARROW_CHARTYPE)).view(awkward.util.BOOLTYPE) + out = out.reshape(-1, 8)[:,::-1].reshape(-1) # lsborder=True + mask = buffers.pop() + if mask is not None: + mask = awkward.util.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, out, maskedwhen=False, lsborder=True) + else: + return out + + elif isinstance(tpe, pyarrow.lib.DataType): + out = awkward.util.numpy.frombuffer(buffers.pop(), dtype=tpe.to_pandas_dtype()) + mask = buffers.pop() + if mask is not None: + mask = awkward.util.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE) + return awkward.array.masked.BitMaskedArray(mask, out, maskedwhen=False, lsborder=True) + else: + return out + + else: + raise NotImplementedError(repr(tpe)) + + if isinstance(obj, pyarrow.lib.Array): + buffers = obj.buffers() + out = popbuffers(obj.type, buffers)[:len(obj)] + assert len(buffers) == 0 + return out + + elif isinstance(obj, pyarrow.lib.ChunkedArray): + chunks = [x for x in obj.chunks if len(x) > 0] + if len(chunks) == 1: + return chunks[0] + else: + return awkward.array.chunked.ChunkedArray([view(x) for x in chunks], counts=[len(x) for x in chunks]) + + elif isinstance(obj, pyarrow.lib.RecordBatch): + out = awkward.array.table.Table() + for n, x in zip(obj.schema.names, obj.columns): + out[n] = view(x) + return out + + elif isinstance(obj, pyarrow.lib.Table): + chunks = [] + counts = [] + for batch in obj.to_batches(): + chunk = view(batch) + if len(chunk) > 0: + chunks.append(chunk) + counts.append(len(chunk)) + if len(chunks) == 1: + return chunks[0] + else: + return awkward.array.chunked.ChunkedArray(chunks, counts=counts) + + else: + raise NotImplementedError(type(obj)) + +class ParquetFile(object): + def __init__(self, file, cache=None, metadata=None, common_metadata=None): + self.file = file + self.cache = cache + self.metadata = metadata + self.common_metadata = common_metadata + self._init() + + def _init(self): + import pyarrow.parquet + self.parquetfile = pyarrow.parquet.ParquetFile(self.file, metadata=self.metadata, common_metadata=self.common_metadata) + self.type = schema2type(self.parquetfile.schema.to_arrow_schema()) + + def __getstate__(self): + return {"file": self.file, "metadata": self.metadata, "common_metadata": self.common_metadata} + + def __setstate__(self, state): + self.file = state["file"] + self.cache = None + self.metadata = state["metadata"] + self.common_metadata = state["common_metadata"] + self._init() + + def __call__(self, rowgroup, column): + return view(self.parquetfile.read_row_group(rowgroup, columns=[column]))[column] + +def fromparquet(file, cache=None, persistvirtual=False, metadata=None, common_metadata=None): + parquetfile = ParquetFile(file, cache=cache, metadata=metadata, common_metadata=common_metadata) + columns = parquetfile.type.columns + + chunks = [] + counts = [] + for i in range(parquetfile.parquetfile.num_row_groups): + numrows = parquetfile.parquetfile.metadata.row_group(i).num_rows + if numrows > 0: + chunk = awkward.array.table.Table() + for n in columns: + chunk[n] = awkward.array.virtual.VirtualArray(parquetfile, (i, n), cache=cache, type=awkward.type.ArrayType(numrows, parquetfile.type[n]), persistvirtual=persistvirtual) + chunks.append(chunk) + counts.append(numrows) + + return awkward.array.chunked.ChunkedArray(chunks, counts) diff --git a/awkward/derived/__init__.py b/awkward/derived/__init__.py new file mode 100644 index 00000000..e831ea82 --- /dev/null +++ b/awkward/derived/__init__.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python + +# Copyright (c) 2018, DIANA-HEP +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/awkward/derived/strings.py b/awkward/derived/strings.py new file mode 100644 index 00000000..ab1798c1 --- /dev/null +++ b/awkward/derived/strings.py @@ -0,0 +1,363 @@ +#!/usr/bin/env python + +# Copyright (c) 2018, DIANA-HEP +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import codecs + +import awkward.util +import awkward.array.jagged +import awkward.array.objects + +class StringMethods(object): + """ + StringMethods + """ + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + if method != "__call__": + raise NotImplemented + + if ufunc is awkward.util.numpy.equal or ufunc is awkward.util.numpy.not_equal: + if len(inputs) < 2: + raise ValueError("invalid number of arguments") + left, right = inputs[0], inputs[1] + + if isinstance(left, (str, bytes)): + left = StringArray.fromstr(len(right), left) + elif isinstance(left, awkward.util.numpy.ndarray) and (left.dtype.kind == "U" or left.dtype.kind == "S"): + left = StringArray.fromnumpy(left) + elif isinstance(left, awkward.util.numpy.ndarray) and left.dtype == awkward.util.numpy.dtype(object): + left = StringArray.fromiter(left) + elif not isinstance(left, StringMethods): + return awkward.util.numpy.zeros(len(right), dtype=awkward.util.BOOLTYPE) + + if isinstance(right, (str, bytes)): + right = StringArray.fromstr(len(left), right) + elif isinstance(right, awkward.util.numpy.ndarray) and (right.dtype.kind == "U" or right.dtype.kind == "S"): + right = StringArray.fromnumpy(right) + elif isinstance(right, awkward.util.numpy.ndarray) and right.dtype == awkward.util.numpy.dtype(object): + right = StringArray.fromiter(right) + elif not isinstance(right, StringMethods): + return awkward.util.numpy.zeros(len(left), dtype=awkward.util.BOOLTYPE) + + left = awkward.array.jagged.JaggedArray(left.starts, left.stops, left.content) + right = awkward.array.jagged.JaggedArray(right.starts, right.stops, right.content) + + maybeequal = (left.counts == right.counts) + + leftmask = left[maybeequal] + rightmask = right[maybeequal] + + reallyequal = (leftmask == rightmask).count_nonzero() == leftmask.counts + + out = awkward.util.numpy.zeros(len(left), dtype=awkward.util.BOOLTYPE) + out[maybeequal] = reallyequal + + if ufunc is awkward.util.numpy.equal: + return out + else: + return awkward.util.numpy.logical_not(out) + + else: + return super(StringMethods, self).__array_ufunc__(ufunc, method, *inputs, **kwargs) + +def tostring(x, decoder): + if decoder is None: + return x.tostring() + else: + return decoder(x, errors="replace")[0] + +class StringArray(StringMethods, awkward.array.objects.ObjectArray): + """ + StringArray + """ + + def __init__(self, starts, stops, content, encoding="utf-8"): + self._content = awkward.array.jagged.ByteJaggedArray(starts, stops, content, awkward.util.CHARTYPE) + self._generator = tostring + self._kwargs = {} + self.encoding = encoding + + @classmethod + def fromstr(cls, length, string, encoding="utf-8"): + if encoding is not None: + encoder = codecs.getencoder(encoding) + string = encoder(string)[0] + content = awkward.util.numpy.empty(length * len(string), dtype=awkward.util.CHARTYPE) + for i, x in string: + content[0::length] = ord(x) + counts = awkward.util.numpy.empty(length, dtype=awkward.util.INDEXTYPE) + counts[:] = length + return cls.fromcounts(counts, content, encoding) + + @classmethod + def fromnumpy(cls, array): + if array.dtype.kind == "S": + encoding = None + elif array.dtype.kind == "U": + encoding = "utf-32le" + else: + raise TypeError("not a string array") + + starts = awkward.util.numpy.arange( 0, len(array) * array.dtype.itemsize, array.dtype.itemsize) + stops = awkward.util.numpy.arange(array.dtype.itemsize, (len(array) + 1) * array.dtype.itemsize, array.dtype.itemsize) + content = array.view(awkward.util.CHARTYPE) + + shorter = awkward.util.numpy.ones(len(array), dtype=awkward.util.BOOLTYPE) + if array.dtype.kind == "S": + for checkat in range(array.dtype.itemsize - 1, -1, -1): + shorter &= (content[checkat::array.dtype.itemsize] == 0) + stops[shorter] -= 1 + if not shorter.any(): + break + + elif array.dtype.kind == "U": + content2 = content.view(awkward.util.numpy.uint32) + itemsize2 = array.dtype.itemsize >> 2 # itemsize // 4 + for checkat in range(itemsize2 - 1, -1, -1): + shorter &= (content2[checkat::itemsize2] == 0) # all four bytes are zero + stops[shorter] -= 4 + if not shorter.any(): + break + + out = cls.__new__(cls) + out._content = awkward.array.jagged.ByteJaggedArray(starts, stops, content, awkward.util.CHARTYPE) + out._generator = tostring + out._kwargs = {} + out.encoding = encoding + return out + + @classmethod + def fromiter(cls, iterable, encoding="utf-8"): + if encoding is None: + encoded = iterable + else: + encoder = codecs.getencoder(encoding) + encoded = [encoder(x)[0] for x in iterable] + counts = [len(x) for x in encoded] + content = awkward.util.numpy.empty(sum(counts), dtype=awkward.util.CHARTYPE) + i = 0 + for x in encoded: + content[i : i + len(x)] = awkward.util.numpy.frombuffer(x, dtype=awkward.util.CHARTYPE) + i += len(x) + return cls.fromcounts(counts, content, encoding) + + @classmethod + def fromoffsets(cls, offsets, content, encoding="utf-8"): + out = cls.__new__(cls) + out._content = awkward.array.jagged.ByteJaggedArray.fromoffsets(offsets, content, awkward.util.CHARTYPE) + out._generator = tostring + out._kwargs = {} + out.encoding = encoding + return out + + @classmethod + def fromcounts(cls, counts, content, encoding="utf-8"): + out = cls.__new__(cls) + out._content = awkward.array.jagged.ByteJaggedArray.fromcounts(counts, content, awkward.util.CHARTYPE) + out._generator = tostring + out._kwargs = {} + out.encoding = encoding + return out + + @classmethod + def fromparents(cls, parents, content, encoding="utf-8"): + out = cls.__new__(cls) + out._content = awkward.array.jagged.ByteJaggedArray.fromparents(parents, content, awkward.util.CHARTYPE) + out._generator = tostring + out._kwargs = {} + out.encoding = encoding + return out + + @classmethod + def fromuniques(cls, uniques, content, encoding="utf-8"): + out = cls.__new__(cls) + out._content = awkward.array.jagged.ByteJaggedArray.fromuniques(uniques, content, awkward.util.CHARTYPE) + out._generator = tostring + out._kwargs = {} + out.encoding = encoding + return out + + @classmethod + def fromjagged(cls, jagged, encoding="utf-8"): + if jagged.content.type.to != awkward.util.CHARTYPE: + raise TypeError("jagged array must have CHARTYPE ({0})".format(str(awkward.util.CHARTYPE))) + out = cls.__new__(cls) + out._content = jagged + out._generator = tostring + out._kwargs = {} + out.encoding = encoding + return out + + def copy(self, starts=None, stops=None, content=None, encoding=None): + out = self.__class__.__new__(self.__class__) + out._content = awkward.array.jagged.ByteJaggedArray(self.starts, self.stops, self.content, awkward.util.CHARTYPE) + out._generator = self._generator + out._args = self._args + out._kwargs = self._kwargs + out._encoding = self._encoding + if starts is not None: + out.starts = starts + if stops is not None: + out.stops = stops + if content is not None: + out.content = content + if encoding is not None: + out.encoding = encoding + return out + + def deepcopy(self, starts=None, stops=None, content=None, encoding=None): + out = self.copy(starts=starts, stops=stops, content=content, encoding=encoding) + out._content._starts = awkward.util.deepcopy(out._content._starts) + out._content._stops = awkward.util.deepcopy(out._content._stops) + out._content._content = awkward.util.deepcopy(out._content._content) + return out + + def empty_like(self, **overrides): + mine = {} + mine["encoding"] = overrides.pop("encoding", self._encoding) + jagged = self._content.empty_like(**overrides) + return self.copy(jagged.starts, jagged.stops, jagged.content, **mine) + + def zeros_like(self, **overrides): + mine = {} + mine["encoding"] = overrides.pop("encoding", self._encoding) + jagged = self._content.zeros_like(**overrides) + return self.copy(jagged.starts, jagged.stops, jagged.content, **mine) + + def ones_like(self, **overrides): + mine = {} + mine["encoding"] = overrides.pop("encoding", self._encoding) + jagged = self._content.ones_like(**overrides) + return self.copy(jagged.starts, jagged.stops, jagged.content, **mine) + + def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs): + self._valid() + n = self.__class__.__name__ + if awkward.array.jagged.offsetsaliased(self.starts, self.stops) and len(self.starts) > 0 and self.starts[0] == 0: + return {"id": ident, + "call": ["awkward", n, "fromcounts"], + "args": [fill(self.counts, n + ".counts", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self.content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + self._encoding]} + else: + return {"id": ident, + "call": ["awkward", n], + "args": [fill(self.starts, n + ".starts", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self.stops, n + ".stops", prefix, suffix, schemasuffix, storage, compression, **kwargs), + fill(self.content, n + ".content", prefix, suffix, schemasuffix, storage, compression, **kwargs), + self._encoding]} + + @property + def starts(self): + return self._content.starts + + @starts.setter + def starts(self, value): + self._content.starts = value + + @property + def stops(self): + return self._content.stops + + @stops.setter + def stops(self, value): + self._content.stops = value + + @property + def content(self): + return self._content.content + + @content.setter + def content(self, value): + self._content.content = value + + @property + def args(self): + return self._args + + @property + def kwargs(self): + return {} + + @property + def encoding(self): + return self._encoding + + @encoding.setter + def encoding(self, value): + if value is None: + decodefcn = None + else: + decodefcn = codecs.getdecoder(value) + self._encoding = value + self._args = (decodefcn,) + + @property + def offsets(self): + return self._content.offsets + + @property + def counts(self): + return self._content.counts + + @property + def parents(self): + return self._content.parents + + @property + def index(self): + return self._content.index + + def _gettype(self, seen): + if self._encoding is None: + return bytes + else: + return str + + def __getitem__(self, where): + if awkward.util.isstringslice(where): + raise IndexError("cannot index StringArray with string or sequence of strings") + + if isinstance(where, tuple) and len(where) == 0: + return self + if not isinstance(where, tuple): + where = (where,) + head, tail = where[0], where[1:] + + if isinstance(head, awkward.util.integer): + return super(StringArray, self).__getitem__(where) + + elif tail == (): + out = self._content[where] + return self.__class__(out.starts, out.stops, out.content, self.encoding) + + else: + out = self._content[where] + return self.__class__(out.starts, out.stops, out.content, self.encoding) diff --git a/awkward/persist.py b/awkward/persist.py index c167997d..4224715b 100644 --- a/awkward/persist.py +++ b/awkward/persist.py @@ -28,26 +28,26 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import base64 import fnmatch import importlib import json import numbers import os +import pickle import zipfile import zlib try: - from collections.abc import Mapping + from collections.abc import Mapping, MutableMapping except ImportError: - from collections import Mapping - -import numpy + from collections import Mapping, MutableMapping import awkward.type import awkward.util import awkward.version compression = [ - {"minsize": 8192, "types": [numpy.bool_, numpy.bool, numpy.integer], "contexts": "*", "pair": (zlib.compress, ("zlib", "decompress"))}, + {"minsize": 8192, "types": [awkward.util.numpy.bool_, awkward.util.numpy.bool, awkward.util.numpy.integer], "contexts": "*", "pair": (zlib.compress, ("zlib", "decompress"))}, ] partner = { @@ -60,6 +60,12 @@ ["awkward", "Table"], ["awkward.persist", "*"]] +def frompython(obj): + return base64.b64encode(pickle.dumps(obj)).decode("ascii") + +def topython(string): + return pickle.loads(base64.b64decode(string.encode("ascii"))) + def spec2function(obj, whitelist=whitelist): for white in whitelist: for n, p in zip(obj, white): @@ -91,7 +97,7 @@ def recurse(obj): return [recurse(x) for x in obj] else: return obj - return numpy.dtype(recurse(obj)) + return awkward.util.numpy.dtype(recurse(obj)) def type2json(obj): if isinstance(obj, awkward.type.Type): @@ -134,7 +140,7 @@ def recurse(obj): else: return out - elif isinstance(obj, numpy.dtype): + elif isinstance(obj, awkward.util.numpy.dtype): return {"dtype": dtype2json(obj)} elif callable(obj): @@ -205,8 +211,38 @@ def recurse(obj): return awkward.type._resolve(recurse(obj), {}) +def jsonable(obj): + if obj is None: + return obj + + elif isinstance(obj, dict) and all(isinstance(n, str) for n in obj): + return {n: jsonable(x) for n, x in obj.items()} + + elif isinstance(obj, list): + return [jsonable(x) for x in obj] + + elif isinstance(obj, str): + return str(obj) + + elif isinstance(obj, (bool, awkward.util.numpy.bool_, awkward.util.numpy.bool)): + return bool(obj) # policy: eliminate Numpy types + + elif isinstance(obj, (numbers.Integral, awkward.util.numpy.integer)): + return int(obj) # policy: eliminate Numpy types + + elif isinstance(obj, (numbers.Real, awkward.util.numpy.floating)) and awkward.util.numpy.finite(obj): + return float(obj) # policy: eliminate Numpy types + + else: + raise TypeError("object cannot be losslessly serialized as JSON") + def serialize(obj, storage, name=None, delimiter="-", suffix=None, schemasuffix=None, compression=compression, **kwargs): import awkward.array.base + import awkward.array.virtual + + for n in kwargs: + if n not in (): + raise TypeError("unrecognized serialization option: {0}".format(repr(n))) if name is None or name == "": name = "" @@ -241,56 +277,81 @@ def serialize(obj, storage, name=None, delimiter="-", suffix=None, schemasuffix= x = {"pair": x} minsize = x.get("minsize", 0) - types = x.get("types", (object,)) - if not isinstance(types, tuple): - types = (types,) + tpes = x.get("types", (object,)) + if not isinstance(tpes, tuple): + tpes = (tpes,) contexts = x.get("contexts", "*") pair = x["pair"] - normalized.append({"minsize": minsize, "types": types, "contexts": contexts, "pair": pair}) + normalized.append({"minsize": minsize, "types": tpes, "contexts": contexts, "pair": pair}) seen = {} - def fill(obj, context, **kwargs): + def fill(obj, context, prefix, suffix, schemasuffix, storage, compression, **kwargs): if id(obj) in seen: return {"ref": seen[id(obj)]} ident = len(seen) seen[id(obj)] = ident - if type(obj) is numpy.ndarray and len(obj.shape) != 0: + if type(obj) is awkward.util.numpy.dtype: + return {"dtype": dtype2json(obj)} + + elif type(obj) is awkward.util.numpy.ndarray and len(obj.shape) != 0: if len(obj.shape) > 1: - dtype = dtype2json(numpy.dtype((obj.dtype, obj.shape[1:]))) + dtype = awkward.util.numpy.dtype((obj.dtype, obj.shape[1:])) else: - dtype = dtype2json(obj.dtype) + dtype = obj.dtype for policy in normalized: - minsize, types, contexts, pair = policy["minsize"], policy["types"], policy["contexts"], policy["pair"] - if obj.nbytes >= minsize and issubclass(obj.dtype.type, tuple(types)) and any(fnmatch.fnmatchcase(context, p) for p in contexts): + minsize, tpes, contexts, pair = policy["minsize"], policy["types"], policy["contexts"], policy["pair"] + if obj.nbytes >= minsize and issubclass(obj.dtype.type, tuple(tpes)) and any(fnmatch.fnmatchcase(context, p) for p in contexts): compress, decompress = pair storage[prefix + str(ident) + suffix] = compress(obj) return {"id": ident, "call": ["numpy", "frombuffer"], "args": [{"call": decompress, "args": [{"read": str(ident) + suffix}]}, - {"call": ["awkward.persist", "json2dtype"], "args": [dtype]}, - len(obj)]} + {"dtype": dtype2json(dtype)}, + {"json": len(obj)}]} else: storage[prefix + str(ident) + suffix] = obj.tostring() return {"id": ident, "call": ["numpy", "frombuffer"], "args": [{"read": str(ident) + suffix}, - {"call": ["awkward.persist", "json2dtype"], "args": [dtype]}, - len(obj)]} + {"dtype": dtype2json(dtype)}, + {"json": len(obj)}]} elif hasattr(obj, "__awkward_persist__"): - return obj.__awkward_persist__(ident, fill, **kwargs) + return obj.__awkward_persist__(ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs) else: - raise TypeError("cannot serialize {0} instance (has no __awkward_persist__ method)".format(type(obj))) + if hasattr(obj, "__module__") and (hasattr(obj, "__qualname__") or hasattr(obj, "__name__")) and obj.__module__ != "__main__": + if hasattr(obj, "__qualname__"): + spec = [obj.__module__] + obj.__qualname__.split(".") + else: + spec = [obj.__module__, obj.__name__] + + gen, genname = importlib.import_module(spec[0]), spec[1:] + while len(genname) > 0: + gen, genname = getattr(gen, genname[0]), genname[1:] + + if gen is obj: + return {"id": ident, "function": spec} + + try: + obj = jsonable(obj) + except TypeError: + try: + return {"id": ident, "python": awkward.persist.frompython(obj)} + + except Exception as err: + raise TypeError("could not persist component as an array, awkward-array, importable function/class, JSON, or pickle; pickle error is\n\n {0}: {1}".format(err.__class__.__name__, str(err))) + else: + return {"id": ident, "json": obj} schema = {"awkward": awkward.version.__version__, - "schema": fill(obj, "", **kwargs)} + "schema": fill(obj, "", prefix, suffix, schemasuffix, storage, compression, **kwargs)} if prefix != "": schema["prefix"] = prefix @@ -301,7 +362,7 @@ def deserialize(storage, name="", whitelist=whitelist, cache=None): import awkward.array.virtual schema = storage[name] - if isinstance(schema, numpy.ndarray): + if isinstance(schema, awkward.util.numpy.ndarray): schema = schema.tostring() if isinstance(schema, bytes): schema = schema.decode("ascii") @@ -335,43 +396,138 @@ def unfill(schema): kwargs.update({n: unfill(x) for n, x in schema["**"].items()}) out = gen(*args, **kwargs) - if "id" in schema: - seen[schema["id"]] = out - return out elif "read" in schema: if schema.get("absolute", False): - return storage[schema["read"]] + out = storage[schema["read"]] else: - return storage[prefix + schema["read"]] + out = storage[prefix + schema["read"]] elif "list" in schema: - return [unfill(x) for x in schema["list"]] + out = [unfill(x) for x in schema["list"]] elif "tuple" in schema: - return tuple(unfill(x) for x in schema["tuple"]) + out = tuple(unfill(x) for x in schema["tuple"]) elif "pairs" in schema: - return [(n, unfill(x)) for n, x in schema["pairs"]] + out = [(n, unfill(x)) for n, x in schema["pairs"]] + + elif "dict" in schema: + out = {n: unfill(x) for n, x in schema["dict"].items()} + + elif "dtype" in schema: + out = json2dtype(schema["dtype"]) elif "function" in schema: - return spec2function(schema["function"], whitelist=whitelist) + out = spec2function(schema["function"], whitelist=whitelist) + + elif "json" in schema: + out = schema["json"] + + elif "python" in schema: + out = topython(schema["python"]) elif "ref" in schema: if schema["ref"] in seen: - return seen[schema["ref"]] + out = seen[schema["ref"]] else: - return awkward.array.virtual.VirtualArray(lambda: seen[schema["ref"]]) - + out = awkward.array.virtual.VirtualArray(lambda: seen[schema["ref"]]) + else: - return schema + raise ValueError("unrecognized JSON object with fields {0}".format(", ".join(repr(x) for x in schema))) + + if "id" in schema: + seen[schema["id"]] = out + return out + + elif isinstance(schema, list): + raise ValueError("unrecognized JSON list with length {0}".format(len(schema))) else: - return schema + raise ValueError("unrecognized JSON object: {0}".format(repr(schema))) return unfill(schema["schema"]) -def save(file, mode="a", options=None, **arrays): +def keys(storage, name="", subschemas=True): + schema = storage[name] + if isinstance(schema, awkward.util.numpy.ndarray): + schema = schema.tostring() + if isinstance(schema, bytes): + schema = schema.decode("ascii") + schema = json.loads(schema) + + prefix = schema.get("prefix", "") + + def recurse(schema): + if isinstance(schema, dict): + if "call" in schema and isinstance(schema["call"], list) and len(schema["call"]) > 0: + for x in schema.get("args", []): + for y in recurse(x): + yield y + for x in schema.get("kwargs", {}).values(): + for y in recurse(x): + yield y + for x in schema.get("*", []): + for y in recurse(x): + yield y + for x in schema.get("**", {}).values(): + for y in recurse(x): + yield y + + elif "read" in schema: + if schema.get("absolute", False): + yield schema["read"] + else: + yield prefix + schema["read"] + + elif "list" in schema: + for x in schema["list"]: + for y in recurse(x): + yield y + + elif "tuple" in schema: + for x in schema["tuple"]: + for y in recurse(x): + yield y + + elif "pairs" in schema: + for n, x in schema["pairs"]: + for y in recurse(x): + yield y + + elif "dict" in schema: + for x in schema["dict"].values(): + for y in recurse(x): + yield y + + elif "dtype" in schema: + pass + + elif "function" in schema: + pass + + elif "json" in schema: + pass + + elif "python" in schema: + pass + + elif "ref" in schema: + pass + + yield name + for x in recurse(schema["schema"]): + yield x + +def save(file, array, name=None, mode="a", **options): + if isinstance(array, dict): + arrays = array + else: + arrays = {"": array} + + if name is not None: + arrays = {name + n: x for n, x in arrays.items()} + arraynames = list(arrays) for i in range(len(arraynames)): for j in range(i + 1, len(arraynames)): @@ -391,8 +547,7 @@ def save(file, mode="a", options=None, **arrays): file = file + ".akd" alloptions = {"delimiter": "-", "suffix": ".raw", "schemasuffix": ".json", "compression": compression} - if options is not None: - alloptions.update(options) + alloptions.update(options) options = alloptions class Wrap(object): @@ -411,26 +566,32 @@ def __setitem__(self, where, what): for name, array in arrays.items(): serialize(array, wrapped, name=name, **options) -class load(Mapping): - def __init__(self, file, options=None): +def load(file, **options): + f = Load(file, **options) + if list(f) == [""]: + out = f[""] + f.close() + return out + else: + return f + +class Load(Mapping): + def __init__(self, file, **options): class Wrap(object): def __init__(self): self.f = zipfile.ZipFile(file, mode="r") def __getitem__(self, where): return self.f.read(where) - def close(self): - self.f.close() self._file = Wrap() alloptions = {"schemasuffix": ".json", "whitelist": whitelist, "cache": None} - if options is not None: - alloptions.update(options) + alloptions.update(options) self.schemasuffix = alloptions.pop("schemasuffix") self.options = alloptions def __getitem__(self, where): - return deserialize(self._file, name=where + self.schemasuffix, **self.options) + return deserialize(self._file, name=where + self.schemasuffix, whitelist=self.options["whitelist"], cache=self.options["cache"]) def __iter__(self): for n in self._file.f.namelist(): @@ -444,57 +605,69 @@ def __len__(self): count += 1 return count + def __repr__(self): + return "".format(len(self)) + + def close(self): + self._file.f.close() + def __del__(self): - self._file.close() + self.close() -def tohdf5(group, options=None, **arrays): - alloptions = {"compression": compression} - if options is not None: - alloptions.update(options) - options = alloptions - options["delimiter"] = "/" - options["schemasuffix"] = "/schema.json" + def __enter__(self, *args, **kwds): + return self - class Wrap(object): - def __init__(self): - self.g = group - def __setitem__(self, where, what): - self.g[where] = numpy.frombuffer(what, dtype=numpy.uint8) + def __exit__(self, *args, **kwds): + self.close() - f = Wrap() - for name, array in arrays.items(): - group.create_group(name) - serialize(array, f, name=name, **options) +class hdf5(MutableMapping): + def __init__(self, group, **options): + alloptions = {"compression": compression, "whitelist": whitelist, "cache": None} + alloptions.update(options) + self.options = alloptions + self.options["delimiter"] = "/" + self.options["schemasuffix"] = "/schema.json" -class fromhdf5(Mapping): - def __init__(self, group, options=None): class Wrap(object): def __init__(self): self.g = group def __getitem__(self, where): return self.g[where].value + def __setitem__(self, where, what): + self.g[where] = awkward.util.numpy.frombuffer(what, dtype=awkward.util.numpy.uint8) self._group = Wrap() - alloptions = {"whitelist": whitelist, "cache": None} - if options is not None: - alloptions.update(options) - self.options = alloptions - def __getitem__(self, where): - return deserialize(self._group, name=where + "/schema.json", **self.options) + return deserialize(self._group, name=where + self.options["schemasuffix"], whitelist=self.options["whitelist"], cache=self.options["cache"]) + + def __setitem__(self, where, what): + options = dict(self.options) + if "whitelist" in options: + del options["whitelist"] + if "cache" in options: + del options["cache"] + self._group.g.create_group(where) + serialize(what, self._group, name=where, **options) + + def __delitem__(self, where): + for subname in keys(self._group, name=where + self.options["schemasuffix"]): + del self._group.g[subname] + del self._group.g[where] def __iter__(self): + schemaname = self.options["schemasuffix"].split("/")[-1] for subname in self._group.g: - if "schema.json" in self._group.g[subname]: + if schemaname in self._group.g[subname]: yield subname def __len__(self): + schemaname = self.options["schemasuffix"].split("/")[-1] count = 0 for subname in self._group.g: - if "schema.json" in self._group.g[subname]: - count += 0 + if schemaname in self._group.g[subname]: + count += 1 return count def __repr__(self): - return "".format(repr(self._group.g.name)) + return "".format(repr(self._group.g.name), len(self)) diff --git a/awkward/type.py b/awkward/type.py index b0092ed2..74064335 100644 --- a/awkward/type.py +++ b/awkward/type.py @@ -206,7 +206,7 @@ def __eq__(self, other): else: one = Type._canonical(Type._copy(self, {}), set()) two = Type._canonical(Type._copy(other, {}), set()) - return one._eq(two, set()) + return one._eq(two, set(), ignoremask=False) def __ne__(self, other): return not self.__eq__(other) @@ -329,14 +329,20 @@ def _substr(self, labeled, seen, indent): to = str(self._to) return takes + to - def _eq(self, other, seen): + def _eq(self, other, seen, ignoremask=False): if self is other: return True elif id(self) in seen: return False else: seen.add(id(self)) - return isinstance(other, ArrayType) and self._takes == other._takes and self._to == other._to + if isinstance(other, ArrayType) and self._takes == other._takes: + if isinstance(self._to, Type): + return self._to._eq(other._to, seen, ignoremask=ignoremask) + else: + return self._to == other._to + else: + return False def __hash__(self): return hash((ArrayType, self._takes, self._to)) @@ -363,6 +369,10 @@ def dtype(self): out.append((n, x.dtype)) return awkward.util.numpy.dtype(out) + @property + def columns(self): + return list(self._fields) + def _isnumpy(self, seen): if id(self) in seen: return False @@ -413,14 +423,25 @@ def _substr(self, labeled, seen, indent): out.append(("{0}{1:%ds} -> {2}" % width).format(indent, repr(n), to)) return "\n".join(out).lstrip(" ") - def _eq(self, other, seen): + def _eq(self, other, seen, ignoremask=False): if self is other: return True elif id(self) in seen: return False else: seen.add(id(self)) - return isinstance(other, TableType) and [(n, self._fields[n]) for n in sorted(self._fields)] == [(n, other._fields[n]) for n in sorted(other._fields)] + if isinstance(other, TableType) and sorted(self._fields) == sorted(other._fields): + for n in self._fields: + if isinstance(self._fields[n], Type): + if not self._fields[n]._eq(other._fields[n], seen, ignoremask=ignoremask): + return False + else: + if not self._fields[n] == other._fields[n]: + return False + else: + return True # nothing failed in the loop over fields + else: + return False def __hash__(self): return hash((TableType, tuple((n, self._fields[n]) for n in sorted(self._fields)))) @@ -483,14 +504,25 @@ def lstrip(x): out = [x + " " * (width - len(lstrip(x.split("\n")[-1]))) for x in subs] return "(" + (" |\n" + indent + " ").join(out) + " )" - def _eq(self, other, seen): + def _eq(self, other, seen, ignoremask=False): if self is other: return True elif id(self) in seen: return False else: seen.add(id(self)) - return isinstance(other, UnionType) and set(self._possibilities) == set(other._possibilities) + if isinstance(other, UnionType) and len(self._possibilities) == len(other._possibilities): + for x, y in zip(sorted(self._possibilities), sorted(self._possibilities)): + if isinstance(x, Type): + if not x._eq(y, seen, ignoremask=ignoremask): + return False + else: + if not x == y: + return False + else: + return True # nothing failed in the loop over possibilities + else: + return False def __hash__(self): return hash((UnionType, tuple(self._possibilities))) @@ -508,7 +540,7 @@ def type(self, value): if isinstance(value, Type): self._type = value else: - self._type = self._finaltype(self._type) + self._type = self._finaltype(value) @property def shape(self): @@ -543,14 +575,22 @@ def _substr(self, labeled, seen, indent): type = str(self._type) return "?({0})".format(type) - def _eq(self, other, seen): + def _eq(self, other, seen, ignoremask=False): if self is other: return True elif id(self) in seen: return False else: seen.add(id(self)) - return isinstance(other, OptionType) and self._type == other._type + if isinstance(other, OptionType) and self._type._eq(other._type, seen, ignoremask=ignoremask): + return True + if ignoremask: # applied asymmetrically; only the left can ignore mask + if isinstance(self._type, Type): + return self._type._eq(other, seen, ignoremask=ignoremask) + else: + return self._type == other + else: + return False def __hash__(self): return hash((OptionType, self._type)) diff --git a/awkward/util.py b/awkward/util.py index a7a3477d..25f4d690 100644 --- a/awkward/util.py +++ b/awkward/util.py @@ -83,6 +83,7 @@ def is_intstring(x): TAGTYPE = numpy.dtype(numpy.uint8) MASKTYPE = numpy.dtype(numpy.bool_) BITMASKTYPE = numpy.dtype(numpy.uint8) +BOOLTYPE = numpy.dtype(numpy.bool_) def toarray(value, defaultdtype, passthrough=None): import awkward.array.base diff --git a/awkward/version.py b/awkward/version.py index 3ce216aa..af1a48e2 100644 --- a/awkward/version.py +++ b/awkward/version.py @@ -30,7 +30,7 @@ import re -__version__ = "0.3.0" +__version__ = "0.4.0" version = __version__ version_info = tuple(re.split(r"[-\.]", __version__)) diff --git a/tests/samples/features-0_11_1.parquet b/tests/samples/features-0_11_1.parquet new file mode 100644 index 00000000..ca8e09aa Binary files /dev/null and b/tests/samples/features-0_11_1.parquet differ diff --git a/tests/test_arrow.py b/tests/test_arrow.py new file mode 100644 index 00000000..78b9f42c --- /dev/null +++ b/tests/test_arrow.py @@ -0,0 +1,296 @@ +#!/usr/bin/env python + +# Copyright (c) 2018, DIANA-HEP +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import unittest + +import numpy +try: + import pyarrow + import pyarrow.parquet +except ImportError: + pyarrow = None + +import awkward.arrow +from awkward import * + +class Test(unittest.TestCase): + def runTest(self): + pass + + def test_arrow_array(self): + if pyarrow is not None: + a = pyarrow.array([1.1, 2.2, 3.3, 4.4, 5.5]) + assert awkward.arrow.view(a).tolist() == [1.1, 2.2, 3.3, 4.4, 5.5] + + def test_arrow_boolean(self): + if pyarrow is not None: + a = pyarrow.array([True, True, False, False, True]) + assert awkward.arrow.view(a).tolist() == [True, True, False, False, True] + + def test_arrow_array_null(self): + if pyarrow is not None: + a = pyarrow.array([1.1, 2.2, 3.3, None, 4.4, 5.5]) + assert awkward.arrow.view(a).tolist() == [1.1, 2.2, 3.3, None, 4.4, 5.5] + + def test_arrow_nested_array(self): + if pyarrow is not None: + a = pyarrow.array([[1.1, 2.2, 3.3], [], [4.4, 5.5]]) + assert awkward.arrow.view(a).tolist() == [[1.1, 2.2, 3.3], [], [4.4, 5.5]] + + def test_arrow_nested_nested_array(self): + if pyarrow is not None: + a = pyarrow.array([[[1.1, 2.2], [3.3], []], [], [[4.4, 5.5]]]) + assert awkward.arrow.view(a).tolist() == [[[1.1, 2.2], [3.3], []], [], [[4.4, 5.5]]] + + def test_arrow_nested_array_null(self): + if pyarrow is not None: + a = pyarrow.array([[1.1, 2.2, None], [], [4.4, 5.5]]) + assert awkward.arrow.view(a).tolist() == [[1.1, 2.2, None], [], [4.4, 5.5]] + + def test_arrow_null_nested_array_null(self): + if pyarrow is not None: + a = pyarrow.array([[1.1, 2.2, None], [], None, [4.4, 5.5]]) + assert awkward.arrow.view(a).tolist() == [[1.1, 2.2, None], [], None, [4.4, 5.5]] + + def test_arrow_chunked_array(self): + if pyarrow is not None: + a = pyarrow.chunked_array([pyarrow.array([1.1, 2.2, 3.3, 4.4, 5.5]), pyarrow.array([]), pyarrow.array([6.6, 7.7, 8.8])]) + assert awkward.arrow.view(a).tolist() == [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8] + + def test_arrow_struct(self): + if pyarrow is not None: + a = pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]) + assert awkward.arrow.view(a).tolist() == [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}] + + def test_arrow_struct_null(self): + if pyarrow is not None: + a = pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}]) + assert awkward.arrow.view(a).tolist() == [{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}] + + def test_arrow_null_struct(self): + if pyarrow is not None: + a = pyarrow.array([{"x": 1, "y": 1.1}, None, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]) + assert awkward.arrow.view(a).tolist() == [{"x": 1, "y": 1.1}, None, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}] + + def test_arrow_null_struct_null(self): + if pyarrow is not None: + a = pyarrow.array([{"x": 1, "y": 1.1}, None, {"x": 2, "y": None}, {"x": 3, "y": 3.3}]) + assert awkward.arrow.view(a).tolist() == [{"x": 1, "y": 1.1}, None, {"x": 2, "y": None}, {"x": 3, "y": 3.3}] + + def test_arrow_chunked_struct(self): + if pyarrow is not None: + a = pyarrow.chunked_array([pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]), pyarrow.array([]), pyarrow.array([{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}])]) + assert awkward.arrow.view(a).tolist() == [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}, {"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}] + + def test_arrow_nested_struct(self): + if pyarrow is not None: + a = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) + assert awkward.arrow.view(a).tolist() == [[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]] + + def test_arrow_nested_struct_null(self): + if pyarrow is not None: + a = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) + assert awkward.arrow.view(a).tolist() == [[{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]] + + def test_arrow_null_nested_struct(self): + if pyarrow is not None: + a = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], None, [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) + assert awkward.arrow.view(a).tolist() == [[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], None, [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]] + + def test_arrow_null_nested_struct_null(self): + if pyarrow is not None: + a = pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}], None, [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]]) + assert awkward.arrow.view(a).tolist() == [[{"x": 1, "y": 1.1}, {"x": 2, "y": None}, {"x": 3, "y": 3.3}], None, [], [{"x": 4, "y": 4.4}, {"x": 5, "y": 5.5}]] + + def test_arrow_struct_nested(self): + if pyarrow is not None: + a = pyarrow.array([{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [3, 3], "y": 3.3}]) + assert awkward.arrow.view(a).tolist() == [{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [3, 3], "y": 3.3}] + + def test_arrow_struct_nested_null(self): + if pyarrow is not None: + a = pyarrow.array([{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [None, 3], "y": 3.3}]) + assert awkward.arrow.view(a).tolist() == [{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [None, 3], "y": 3.3}] + + def test_arrow_nested_struct_nested(self): + if pyarrow is not None: + a = pyarrow.array([[{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [3, 3], "y": 3.3}], [], [{"x": [4, 4, 4], "y": 4.4}, {"x": [5, 5, 5, 5], "y": 5.5}]]) + assert awkward.arrow.view(a).tolist() == [[{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [3, 3], "y": 3.3}], [], [{"x": [4, 4, 4], "y": 4.4}, {"x": [5, 5, 5, 5], "y": 5.5}]] + + def test_arrow_null_nested_struct_nested_null(self): + if pyarrow is not None: + a = pyarrow.array([[{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [None, 3], "y": 3.3}], None, [], [{"x": [4, 4, 4], "y": 4.4}, {"x": [5, 5, 5, 5], "y": 5.5}]]) + assert awkward.arrow.view(a).tolist() == [[{"x": [], "y": 1.1}, {"x": [2], "y": 2.2}, {"x": [None, 3], "y": 3.3}], None, [], [{"x": [4, 4, 4], "y": 4.4}, {"x": [5, 5, 5, 5], "y": 5.5}]] + + def test_arrow_strings(self): + if pyarrow is not None: + a = pyarrow.array(["one", "two", "three", u"fo\u2014ur", "five"]) + assert awkward.arrow.view(a).tolist() == ["one", "two", "three", u"fo\u2014ur", "five"] + + def test_arrow_strings_null(self): + if pyarrow is not None: + a = pyarrow.array(["one", "two", None, u"fo\u2014ur", "five"]) + assert awkward.arrow.view(a).tolist() == ["one", "two", None, u"fo\u2014ur", "five"] + + def test_arrow_binary(self): + if pyarrow is not None: + a = pyarrow.array([b"one", b"two", b"three", b"four", b"five"]) + assert awkward.arrow.view(a).tolist() == [b"one", b"two", b"three", b"four", b"five"] + + def test_arrow_binary_null(self): + if pyarrow is not None: + a = pyarrow.array([b"one", b"two", None, b"four", b"five"]) + assert awkward.arrow.view(a).tolist() == [b"one", b"two", None, b"four", b"five"] + + def test_arrow_chunked_strings(self): + if pyarrow is not None: + a = pyarrow.chunked_array([pyarrow.array(["one", "two", "three", "four", "five"]), pyarrow.array(["six", "seven", "eight"])]) + assert awkward.arrow.view(a).tolist() == ["one", "two", "three", "four", "five", "six", "seven", "eight"] + + def test_arrow_nested_strings(self): + if pyarrow is not None: + a = pyarrow.array([["one", "two", "three"], [], ["four", "five"]]) + assert awkward.arrow.view(a).tolist() == [["one", "two", "three"], [], ["four", "five"]] + + def test_arrow_nested_strings_null(self): + if pyarrow is not None: + a = pyarrow.array([["one", "two", None], [], ["four", "five"]]) + assert awkward.arrow.view(a).tolist() == [["one", "two", None], [], ["four", "five"]] + + def test_arrow_null_nested_strings_null(self): + if pyarrow is not None: + a = pyarrow.array([["one", "two", None], [], None, ["four", "five"]]) + assert awkward.arrow.view(a).tolist() == [["one", "two", None], [], None, ["four", "five"]] + + def test_arrow_union_sparse(self): + if pyarrow is not None: + a = pyarrow.UnionArray.from_sparse(pyarrow.array([0, 1, 0, 0, 1], type=pyarrow.int8()), [pyarrow.array([0.0, 1.1, 2.2, 3.3, 4.4]), pyarrow.array([True, True, False, True, False])]) + assert awkward.arrow.view(a).tolist() == [0.0, True, 2.2, 3.3, False] + + def test_arrow_union_sparse_null(self): + if pyarrow is not None: + a = pyarrow.UnionArray.from_sparse(pyarrow.array([0, 1, 0, 0, 1], type=pyarrow.int8()), [pyarrow.array([0.0, 1.1, None, 3.3, 4.4]), pyarrow.array([True, True, False, True, False])]) + assert awkward.arrow.view(a).tolist() == [0.0, True, None, 3.3, False] + + def test_arrow_union_sparse_null_null(self): + if pyarrow is not None: + a = pyarrow.UnionArray.from_sparse(pyarrow.array([0, 1, 0, 0, 1], type=pyarrow.int8()), [pyarrow.array([0.0, 1.1, None, 3.3, 4.4]), pyarrow.array([True, None, False, True, False])]) + assert awkward.arrow.view(a).tolist() == [0.0, None, None, 3.3, False] + + def test_arrow_union_dense(self): + if pyarrow is not None: + a = pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, 2.2, 3.3]), pyarrow.array([True, True, False])]) + assert awkward.arrow.view(a).tolist() == [0.0, True, 1.1, 2.2, 3.3, True, False] + + def test_arrow_union_dense_null(self): + if pyarrow is not None: + a = pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, None, 3.3]), pyarrow.array([True, True, False])]) + assert awkward.arrow.view(a).tolist() == [0.0, True, 1.1, None, 3.3, True, False] + + def test_arrow_union_dense_null_null(self): + if pyarrow is not None: + a = pyarrow.UnionArray.from_dense(pyarrow.array([0, 1, 0, 0, 0, 1, 1], type=pyarrow.int8()), pyarrow.array([0, 0, 1, 2, 3, 1, 2], type=pyarrow.int32()), [pyarrow.array([0.0, 1.1, None, 3.3]), pyarrow.array([True, None, False])]) + assert awkward.arrow.view(a).tolist() == [0.0, True, 1.1, None, 3.3, None, False] + + def test_arrow_dictarray(self): + if pyarrow is not None: + a = pyarrow.DictionaryArray.from_arrays(pyarrow.array([0, 0, 2, 2, 1, 0, 2, 1, 1]), pyarrow.array(["one", "two", "three"])) + assert awkward.arrow.view(a).tolist() == ["one", "one", "three", "three", "two", "one", "three", "two", "two"] + + def test_arrow_dictarray_null(self): + if pyarrow is not None: + a = pyarrow.DictionaryArray.from_arrays(pyarrow.array([0, 0, 2, None, 1, None, 2, 1, 1]), pyarrow.array(["one", "two", "three"])) + assert awkward.arrow.view(a).tolist() == ["one", "one", "three", None, "two", None, "three", "two", "two"] + + def test_arrow_null_dictarray(self): + if pyarrow is not None: + a = pyarrow.DictionaryArray.from_arrays(pyarrow.array([0, 0, 2, 2, 1, 0, 2, 1, 1]), pyarrow.array(["one", None, "three"])) + assert awkward.arrow.view(a).tolist() == ["one", "one", "three", "three", None, "one", "three", None, None] + + def test_arrow_batch(self): + if pyarrow is not None: + a = pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1.1, 2.2, 3.3, None, 5.5]), + pyarrow.array([[1, 2, 3], [], [4, 5], [None], [6]]), + pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}, {"x": 4, "y": None}, {"x": 5, "y": 5.5}]), + pyarrow.array([{"x": 1, "y": 1.1}, None, None, {"x": 4, "y": None}, {"x": 5, "y": 5.5}]), + pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": None}, {"x": 5, "y": 5.5}], [None], [{"x": 6, "y": 6.6}]])], + ["a", "b", "c", "d", "e"]) + assert awkward.arrow.view(a).tolist() == [{"a": 1.1, "b": [1, 2, 3], "c": {"x": 1, "y": 1.1}, "d": {"x": 1, "y": 1.1}, "e": [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]}, {"a": 2.2, "b": [], "c": {"x": 2, "y": 2.2}, "d": None, "e": []}, {"a": 3.3, "b": [4, 5], "c": {"x": 3, "y": 3.3}, "d": None, "e": [{"x": 4, "y": None}, {"x": 5, "y": 5.5}]}, {"a": None, "b": [None], "c": {"x": 4, "y": None}, "d":{"x": 4, "y": None}, "e": [None]}, {"a": 5.5, "b": [6], "c": {"x": 5, "y": 5.5}, "d": {"x": 5, "y": 5.5}, "e": [{"x": 6, "y": 6.6}]}] + + def test_arrow_table(self): + if pyarrow is not None: + a = pyarrow.Table.from_batches([ + pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1.1, 2.2, 3.3, None, 5.5]), + pyarrow.array([[1, 2, 3], [], [4, 5], [None], [6]]), + pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}, {"x": 4, "y": None}, {"x": 5, "y": 5.5}]), + pyarrow.array([{"x": 1, "y": 1.1}, None, None, {"x": 4, "y": None}, {"x": 5, "y": 5.5}]), + pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": None}, {"x": 5, "y": 5.5}], [None], [{"x": 6, "y": 6.6}]])], + ["a", "b", "c", "d", "e"]), + pyarrow.RecordBatch.from_arrays( + [pyarrow.array([1.1, 2.2, 3.3, None, 5.5]), + pyarrow.array([[1, 2, 3], [], [4, 5], [None], [6]]), + pyarrow.array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}, {"x": 4, "y": None}, {"x": 5, "y": 5.5}]), + pyarrow.array([{"x": 1, "y": 1.1}, None, None, {"x": 4, "y": None}, {"x": 5, "y": 5.5}]), + pyarrow.array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}], [], [{"x": 4, "y": None}, {"x": 5, "y": 5.5}], [None], [{"x": 6, "y": 6.6}]])], + ["a", "b", "c", "d", "e"])]) + assert awkward.arrow.view(a).tolist() == [{"a": 1.1, "b": [1, 2, 3], "c": {"x": 1, "y": 1.1}, "d": {"x": 1, "y": 1.1}, "e": [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]}, {"a": 2.2, "b": [], "c": {"x": 2, "y": 2.2}, "d": None, "e": []}, {"a": 3.3, "b": [4, 5], "c": {"x": 3, "y": 3.3}, "d": None, "e": [{"x": 4, "y": None}, {"x": 5, "y": 5.5}]}, {"a": None, "b": [None], "c": {"x": 4, "y": None}, "d": {"x": 4, "y": None}, "e": [None]}, {"a": 5.5, "b": [6], "c": {"x": 5, "y": 5.5}, "d": {"x": 5, "y": 5.5}, "e": [{"x": 6, "y": 6.6}]}, {"a": 1.1, "b": [1, 2, 3], "c": {"x": 1, "y": 1.1}, "d": {"x": 1, "y": 1.1}, "e": [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}]}, {"a": 2.2, "b": [], "c": {"x": 2, "y": 2.2}, "d": None, "e": []}, {"a": 3.3, "b": [4, 5], "c": {"x": 3, "y": 3.3}, "d": None, "e": [{"x": 4, "y": None}, {"x": 5, "y": 5.5}]}, {"a": None, "b": [None], "c": {"x": 4, "y": None}, "d": {"x": 4, "y": None}, "e": [None]}, {"a": 5.5, "b": [6], "c": {"x": 5, "y": 5.5}, "d": {"x": 5, "y": 5.5}, "e": [{"x": 6, "y": 6.6}]}] + + def test_arrow_nonnullable_table(self): + if pyarrow is not None: + x = pyarrow.array([1, 2, 3]) + y = pyarrow.array([1.1, 2.2, 3.3]) + table = pyarrow.Table.from_arrays([x], ["x"]) + table2 = table.add_column(1, pyarrow.column(pyarrow.field("y", y.type, False), numpy.array([1.1, 2.2, 3.3]))) + assert awkward.arrow.view(table2).tolist() == [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 3, "y": 3.3}] + + # def test_arrow_writeparquet(self): + # if pyarrow is not None: + # a = pyarrow.Table.from_batches([ + # pyarrow.RecordBatch.from_arrays( + # [pyarrow.array([1.1, 2.2, 3.3, None, 5.5]), + # pyarrow.array([[1, 2, 3], [], [None], None, [4, 5, 6]]), + # pyarrow.array([[[1.1, 2.2]], None, [[3.3, None], []], [], [None, [4.4, 5.5]]])], + # ["a", "b", "c"]), + # pyarrow.RecordBatch.from_arrays( + # [pyarrow.array([2.2, 1.1, 3.3, None, 5.5]), + # pyarrow.array([[2, 1, 3], [], [None], None, [4, 5, 6]]), + # pyarrow.array([[[2.2, 1.1]], None, [[3.3, None], []], [], [None, [4.4, 5.5]]])], + # ["a", "b", "c"])]) + # writer = pyarrow.parquet.ParquetWriter("tests/samples/features-0_11_1.parquet", a.schema) + # writer.write_table(a) + # writer.write_table(a) + # writer.close() + + def test_arrow_readparquet(self): + if pyarrow is not None: + file = pyarrow.parquet.ParquetFile("tests/samples/features-0_11_1.parquet") +