Merge branch 'main' into virtual-arrays

scikit-hep · Jan 31, 2025 · 7c59dba · 7c59dba
2 parents 4c36c7d + 3d14f2e
commit 7c59dba
Show file tree

Hide file tree

Showing 7 changed files with 288 additions and 7 deletions.
diff --git a/.github/workflows/deploy-cpp.yml b/.github/workflows/deploy-cpp.yml
@@ -35,8 +35,8 @@ jobs:
       run: ls -l dist/
 
     - name: Generate artifact attestation for sdist and wheel
-      uses: actions/attest-build-provenance@7668571508540a607bdfd90a87a560489fe372eb # v2.1.0
+      uses: actions/attest-build-provenance@520d128f165991a6c774bcb264f323e3d70747f4 # v2.2.0
       with:
         subject-path: "dist/awkward*cpp-*"
 
-    - uses: pypa/gh-action-pypi-publish@v1.12.3
+    - uses: pypa/gh-action-pypi-publish@v1.12.4
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
@@ -88,7 +88,7 @@ jobs:
       run: pipx run twine check dist/*
 
     - name: Generate artifact attestation for sdist and wheel
-      uses: actions/attest-build-provenance@7668571508540a607bdfd90a87a560489fe372eb # v2.1.0
+      uses: actions/attest-build-provenance@520d128f165991a6c774bcb264f323e3d70747f4 # v2.2.0
       with:
         subject-path: "dist/awkward-*"
 
@@ -135,7 +135,7 @@ jobs:
         GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       run: gh attestation verify dist/awkward-*.whl --repo ${{ github.repository }}
 
-    - uses: pypa/gh-action-pypi-publish@v1.12.3
+    - uses: pypa/gh-action-pypi-publish@v1.12.4
 
   publish-headers:
     name: "Publish header-only libraries alongside release"

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -27,14 +27,14 @@ repos:
     additional_dependencies: [pyyaml]
 
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.9.2
+  rev: v0.9.3
   hooks:
   - id: ruff
     args: ["--fix", "--show-fixes"]
   - id: ruff-format
 
 - repo: https://github.com/codespell-project/codespell
-  rev: v2.3.0
+  rev: v2.4.1
   hooks:
   - id: codespell
     args: ["-L", "ue,subjet,parms,fo,numer,thre,nin,nout"]

diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "awkward"
-version = "2.7.3"
+version = "2.7.4"
 description = "Manipulate JSON-like data with NumPy-like idioms."
 license = { text = "BSD-3-Clause" }
 requires-python = ">=3.9"

diff --git a/src/awkward/operations/__init__.py b/src/awkward/operations/__init__.py
@@ -123,3 +123,4 @@
 from awkward.operations.ak_without_parameters import *
 from awkward.operations.ak_zeros_like import *
 from awkward.operations.ak_zip import *
+from awkward.operations.ak_zip_no_broadcast import *
diff --git a/src/awkward/operations/ak_zip_no_broadcast.py b/src/awkward/operations/ak_zip_no_broadcast.py
@@ -0,0 +1,222 @@
+# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from functools import reduce
+
+import awkward as ak
+from awkward._dispatch import high_level_function
+from awkward._layout import HighLevelContext, ensure_same_backend
+from awkward._namedaxis import _get_named_axis, _unify_named_axis
+from awkward._nplikes.numpy_like import NumpyMetadata
+
+# Public names of this module; only these are exposed by a star-import.
+__all__ = ("zip_no_broadcast",)
+
+# Metadata object bound to the conventional `np` name.
+# NOTE(review): nothing in the visible code of this module references `np`;
+# confirm whether the binding is still needed.
+np = NumpyMetadata.instance()
+
+
+@high_level_function()
+def zip_no_broadcast(
+    arrays,
+    *,
+    parameters=None,
+    with_name=None,
+    highlevel=True,
+    behavior=None,
+    attrs=None,
+):
+    """
+    Args:
+        arrays (mapping or sequence of arrays): Each value in this mapping or
+            sequence can be any array-like data that #ak.to_layout recognizes.
+        parameters (None or dict): Parameters for the new
+            #ak.contents.RecordArray node that is created by this operation.
+        with_name (None or str): Assigns a `"__record__"` name to the new
+            #ak.contents.RecordArray node that is created by this operation
+            (overriding `parameters`, if necessary).
+        highlevel (bool): If True, return an #ak.Array; otherwise, return
+            a low-level #ak.contents.Content subclass.
+        behavior (None or dict): Custom #ak.behavior for the output array, if
+            high-level.
+        attrs (None or dict): Custom attributes for the output array, if
+            high-level.
+
+    Combines `arrays` into a single structure as the fields of a collection
+    of records or the slots of a collection of tuples.
+
+    Caution: unlike #ak.zip, this function will _not_ broadcast the arrays
+    together. During typetracing, it assumes that the given arrays already
+    have the same layouts and lengths.
+
+    Only two kinds of input are accepted, and all arrays must be of the same
+    kind:
+
+       * flat arrays (#ak.contents.NumpyArray), which must all have the same
+         length;
+       * single-level lists (#ak.contents.ListOffsetArray) whose contents are
+         #ak.contents.NumpyArray. Outside of typetracing, their offsets are
+         compared and must be identical.
+
+    Any other combination raises a ValueError.
+
+    This operation may be thought of as the opposite of projection in
+    #ak.Array.__getitem__, which extracts fields one at a time, or
+    #ak.unzip, which extracts them all in one call.
+
+    Consider the following arrays, `one` and `two`, which have identical
+    list structure.
+
+        >>> one = ak.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5], [6.6]])
+        >>> two = ak.Array([[1, 2, 3], [], [4, 5], [6]])
+
+    Zipping them together using a dict creates a collection of records with
+    the same nesting structure as `one` and `two`.
+
+        >>> ak.zip_no_broadcast({"x": one, "y": two}).show()
+        [[{x: 1.1, y: 1}, {x: 2.2, y: 2}, {x: 3.3, y: 3}],
+         [],
+         [{x: 4.4, y: 4}, {x: 5.5, y: 5}],
+         [{x: 6.6, y: 6}]]
+
+    Doing so with a list creates tuples, whose fields are not named.
+
+        >>> ak.zip_no_broadcast([one, two]).show()
+        [[(1.1, 1), (2.2, 2), (3.3, 3)],
+         [],
+         [(4.4, 4), (5.5, 5)],
+         [(6.6, 6)]]
+
+    See also #ak.zip and #ak.unzip.
+    """
+    # Dispatch
+    # (the yielded arrays are what the @high_level_function machinery
+    #  inspects; for a mapping only the values are arrays, not the keys)
+    if isinstance(arrays, Mapping):
+        yield arrays.values()
+    else:
+        yield arrays
+
+    # Implementation
+    return _impl(
+        arrays,
+        parameters,
+        with_name,
+        highlevel,
+        behavior,
+        attrs,
+    )
+
+
+def _impl(
+    arrays,
+    parameters,
+    with_name,
+    highlevel,
+    behavior,
+    attrs,
+):
+    with HighLevelContext(behavior=behavior, attrs=attrs) as ctx:
+        if isinstance(arrays, Mapping):
+            layouts = ensure_same_backend(
+                *(
+                    ctx.unwrap(
+                        x,
+                        allow_record=False,
+                        allow_unknown=False,
+                        none_policy="pass-through",
+                        primitive_policy="pass-through",
+                    )
+                    for x in arrays.values()
+                )
+            )
+            fields = list(arrays.keys())
+
+            # propagate named axis from input to output,
+            #   use strategy "unify" (see: awkward._namedaxis)
+            out_named_axis = reduce(
+                _unify_named_axis, map(_get_named_axis, arrays.values())
+            )
+
+        else:
+            layouts = ensure_same_backend(
+                *(
+                    ctx.unwrap(
+                        x,
+                        allow_record=False,
+                        allow_unknown=False,
+                        none_policy="pass-through",
+                        primitive_policy="pass-through",
+                    )
+                    for x in arrays
+                )
+            )
+            fields = None
+
+            # propagate named axis from input to output,
+            #   use strategy "unify" (see: awkward._namedaxis)
+            out_named_axis = reduce(_unify_named_axis, map(_get_named_axis, arrays))
+
+    # determine backend
+    backend = next((b.backend for b in layouts if hasattr(b, "backend")), "cpu")
+
+    if with_name is not None:
+        if parameters is None:
+            parameters = {}
+        else:
+            parameters = dict(parameters)
+        parameters["__record__"] = with_name
+
+    # only allow all NumpyArrays and ListOffsetArrays
+    if all(isinstance(layout, ak.contents.NumpyArray) for layout in layouts):
+        length = _check_equal_lengths(layouts)
+        out = ak.contents.RecordArray(
+            layouts, fields, length=length, parameters=parameters, backend=backend
+        )
+    elif all(isinstance(layout, ak.contents.ListOffsetArray) for layout in layouts):
+        contents = []
+        for layout in layouts:
+            # get the content of the ListOffsetArray
+            if not isinstance(layout.content, ak.contents.NumpyArray):
+                raise ValueError(
+                    "can not (unsafe) zip ListOffsetArrays with non-NumpyArray contents"
+                )
+            contents.append(layout.content)
+
+        if backend.name == "typetracer":
+            # just get from the first one
+            # we're in typetracer mode, so we can't check the offsets (see else branch)
+            offsets = layouts[0].offsets
+        else:
+            # this is at 'runtime' with actual data, that means we can check the offsets,
+            # but only those that have actual data, i.e. no PlaceholderArrays
+            # so first, let's filter out any PlaceholderArrays
+            comparable_offsets = filter(
+                lambda o: not isinstance(o, ak._nplikes.placeholder.PlaceholderArray),
+                (layout.offsets for layout in layouts),
+            )
+            # check that offsets are the same
+            first = next(comparable_offsets)
+            if not all(
+                first.nplike.all(offsets.data == first.data)
+                for offsets in comparable_offsets
+            ):
+                raise ValueError("all ListOffsetArrays must have the same offsets")
+            offsets = first
+
+        length = _check_equal_lengths(contents)
+        out = ak.contents.ListOffsetArray(
+            offsets=offsets,
+            content=ak.contents.RecordArray(
+                contents, fields, length=length, parameters=parameters, backend=backend
+            ),
+        )
+    else:
+        raise ValueError(
+            "all array layouts must be either NumpyArrays or ListOffsetArrays"
+        )
+
+    # Unify named axes propagated through the broadcast
+    wrapped_out = ctx.wrap(out, highlevel=highlevel)
+    return ak.operations.ak_with_named_axis._impl(
+        wrapped_out,
+        named_axis=out_named_axis,
+        highlevel=highlevel,
+        behavior=ctx.behavior,
+        attrs=ctx.attrs,
+    )
+
+
+def _check_equal_lengths(
+    contents: ak.contents.Content,
+) -> int | ak._nplikes.shape.UnknownLength:
+    length = contents[0].length
+    for layout in contents:
+        if layout.length != length:
+            raise ValueError("all arrays must have the same length")
+    return length
diff --git a/tests/test_3390_ak_zip_no_broadcast.py b/tests/test_3390_ak_zip_no_broadcast.py
@@ -0,0 +1,58 @@
+# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE
+# ruff: noqa: E402
+
+from __future__ import annotations
+
+import awkward as ak
+
+
+def test_ak_zip_no_broadcast_NumpyArray_dict():
+    a = ak.Array([1])
+    b = ak.Array([2])
+    c = ak.zip_no_broadcast({"a": a, "b": b})
+    assert ak.to_list(c) == ak.to_list(ak.zip({"a": a, "b": b}))
+
+
+def test_ak_zip_no_broadcast_ListOffsetArray_dict():
+    a = ak.Array([[1], []])
+    b = ak.Array([[2], []])
+    c = ak.zip_no_broadcast({"a": a, "b": b})
+    assert ak.to_list(c) == ak.to_list(ak.zip({"a": a, "b": b}))
+
+
+def test_ak_zip_no_broadcast_NumpyArray_list():
+    a = ak.Array([1])
+    b = ak.Array([2])
+    c = ak.zip_no_broadcast([a, b])
+    assert ak.to_list(c) == ak.to_list(ak.zip([a, b]))
+
+
+def test_ak_zip_no_broadcast_ListOffsetArray_list():
+    a = ak.Array([[1], []])
+    b = ak.Array([[2], []])
+    c = ak.zip_no_broadcast([a, b])
+    assert ak.to_list(c) == ak.to_list(ak.zip([a, b]))
+
+
+def test_typetracer_NumpyArray_non_touching():
+    tracer = ak.Array([1], backend="typetracer")
+
+    tracer, report = ak.typetracer.typetracer_with_report(
+        tracer.layout.form_with_key(), highlevel=True
+    )
+
+    _ = ak.zip_no_broadcast({"foo": tracer, "bar": tracer})
+    assert len(report.shape_touched) == 1
+    assert len(report.data_touched) == 0
+
+
+def test_typetracer_ListOffsetArray_non_touching():
+    tracer = ak.Array([[1], [], [2, 3]], backend="typetracer")
+
+    tracer, report = ak.typetracer.typetracer_with_report(
+        tracer.layout.form_with_key(), highlevel=True
+    )
+
+    _ = ak.zip_no_broadcast({"foo": tracer, "bar": tracer})
+    assert len(report.shape_touched) == 1
+    assert len(report.data_touched) == 0