Skip to content

Commit

Permalink
Merge branch 'main' into virtual-arrays
Browse files Browse the repository at this point in the history
  • Loading branch information
ikrommyd committed Jan 31, 2025
2 parents 4c36c7d + 3d14f2e commit 7c59dba
Show file tree
Hide file tree
Showing 7 changed files with 288 additions and 7 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/deploy-cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ jobs:
run: ls -l dist/

- name: Generate artifact attestation for sdist and wheel
uses: actions/attest-build-provenance@7668571508540a607bdfd90a87a560489fe372eb # v2.1.0
uses: actions/attest-build-provenance@520d128f165991a6c774bcb264f323e3d70747f4 # v2.2.0
with:
subject-path: "dist/awkward*cpp-*"

- uses: pypa/gh-action-pypi-publish@v1.12.3
- uses: pypa/gh-action-pypi-publish@v1.12.4
4 changes: 2 additions & 2 deletions .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ jobs:
run: pipx run twine check dist/*

- name: Generate artifact attestation for sdist and wheel
uses: actions/attest-build-provenance@7668571508540a607bdfd90a87a560489fe372eb # v2.1.0
uses: actions/attest-build-provenance@520d128f165991a6c774bcb264f323e3d70747f4 # v2.2.0
with:
subject-path: "dist/awkward-*"

Expand Down Expand Up @@ -135,7 +135,7 @@ jobs:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: gh attestation verify dist/awkward-*.whl --repo ${{ github.repository }}

- uses: pypa/gh-action-pypi-publish@v1.12.3
- uses: pypa/gh-action-pypi-publish@v1.12.4

publish-headers:
name: "Publish header-only libraries alongside release"
Expand Down
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ repos:
additional_dependencies: [pyyaml]

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.2
rev: v0.9.3
hooks:
- id: ruff
args: ["--fix", "--show-fixes"]
- id: ruff-format

- repo: https://github.com/codespell-project/codespell
rev: v2.3.0
rev: v2.4.1
hooks:
- id: codespell
args: ["-L", "ue,subjet,parms,fo,numer,thre,nin,nout"]
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ build-backend = "hatchling.build"

[project]
name = "awkward"
version = "2.7.3"
version = "2.7.4"
description = "Manipulate JSON-like data with NumPy-like idioms."
license = { text = "BSD-3-Clause" }
requires-python = ">=3.9"
Expand Down
1 change: 1 addition & 0 deletions src/awkward/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,4 @@
from awkward.operations.ak_without_parameters import *
from awkward.operations.ak_zeros_like import *
from awkward.operations.ak_zip import *
from awkward.operations.ak_zip_no_broadcast import *
222 changes: 222 additions & 0 deletions src/awkward/operations/ak_zip_no_broadcast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE

from __future__ import annotations

from collections.abc import Mapping
from functools import reduce

import awkward as ak
from awkward._dispatch import high_level_function
from awkward._layout import HighLevelContext, ensure_same_backend
from awkward._namedaxis import _get_named_axis, _unify_named_axis
from awkward._nplikes.numpy_like import NumpyMetadata

# Names exported by a star-import of this module (awkward/operations/__init__.py
# does `from awkward.operations.ak_zip_no_broadcast import *`).
__all__ = ("zip_no_broadcast",)

# NumpyMetadata instance obtained through its `instance()` accessor.
# NOTE(review): not referenced by any code visible in this module.
np = NumpyMetadata.instance()


@high_level_function()
def zip_no_broadcast(
    arrays,
    *,
    parameters=None,
    with_name=None,
    highlevel=True,
    behavior=None,
    attrs=None,
):
    """
    Args:
        arrays (mapping or sequence of arrays): Each value in this mapping or
            sequence can be any array-like data that #ak.to_layout recognizes.
        parameters (None or dict): Parameters for the new
            #ak.contents.RecordArray node that is created by this operation.
        with_name (None or str): Assigns a `"__record__"` name to the new
            #ak.contents.RecordArray node that is created by this operation
            (overriding `parameters`, if necessary).
        highlevel (bool): If True, return an #ak.Array; otherwise, return
            a low-level #ak.contents.Content subclass.
        behavior (None or dict): Custom #ak.behavior for the output array, if
            high-level.
        attrs (None or dict): Custom attributes for the output array, if
            high-level.

    Combines `arrays` into a single structure as the fields of a collection
    of records (when `arrays` is a mapping) or the slots of a collection of
    tuples (when it is a sequence).

    Caution: unlike #ak.zip, this function does _not_ broadcast the arrays
    against each other. Only two shapes of input are accepted, and a
    `ValueError` is raised for anything else:

       * every array is a flat #ak.contents.NumpyArray, or
       * every array is an #ak.contents.ListOffsetArray whose content is an
         #ak.contents.NumpyArray.

    The arrays must have equal lengths and, in the list case, identical
    offsets. On the typetracer backend the offsets cannot be compared, so the
    offsets of the first array are used and the inputs are assumed to already
    agree.

    This operation may be thought of as the opposite of projection in
    #ak.Array.__getitem__, which extracts fields one at a time, or
    #ak.unzip, which extracts them all in one call.

    Consider the following arrays, `one` and `two`.

        >>> one = ak.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5], [6.6]])
        >>> two = ak.Array([[1, 2, 3], [], [4, 5], [6]])

    Zipping them together using a dict creates a collection of records with
    the same nesting structure as `one` and `two`.

        >>> ak.zip_no_broadcast({"x": one, "y": two}).show()
        [[{x: 1.1, y: 1}, {x: 2.2, y: 2}, {x: 3.3, y: 3}],
         [],
         [{x: 4.4, y: 4}, {x: 5.5, y: 5}],
         [{x: 6.6, y: 6}]]

    Doing so with a list creates tuples, whose fields are not named.

        >>> ak.zip_no_broadcast([one, two]).show()
        [[(1.1, 1), (2.2, 2), (3.3, 3)],
         [],
         [(4.4, 4), (5.5, 5)],
         [(6.6, 6)]]

    See also #ak.zip and #ak.unzip.
    """
    # Dispatch: hand the array-like values (not the field names) to the
    # high-level dispatcher.
    yield arrays.values() if isinstance(arrays, Mapping) else arrays

    # Implementation
    return _impl(
        arrays=arrays,
        parameters=parameters,
        with_name=with_name,
        highlevel=highlevel,
        behavior=behavior,
        attrs=attrs,
    )


def _impl(
    arrays,
    parameters,
    with_name,
    highlevel,
    behavior,
    attrs,
):
    """
    Build the zipped array for `zip_no_broadcast` without broadcasting.

    Args:
        arrays (mapping or iterable of arrays): The arrays to zip. A mapping
            produces records whose field names are its keys; any other
            iterable produces tuples.
        parameters (None or dict): Parameters for the new RecordArray.
        with_name (None or str): If not None, stored as the `"__record__"`
            parameter (on a copy of `parameters`).
        highlevel (bool): Forwarded to the context's `wrap` and to
            `ak_with_named_axis._impl`.
        behavior (None or dict): Forwarded to the HighLevelContext.
        attrs (None or dict): Forwarded to the HighLevelContext.

    Raises:
        ValueError: if the layouts are not all NumpyArrays or all
            ListOffsetArrays with NumpyArray contents, if lengths differ, or
            if (outside the typetracer backend) the offsets differ.
    """
    # The inputs are walked twice below (once for the layouts, once for the
    # named axes). Materialize them first so a one-shot iterable such as a
    # generator is not already exhausted on the second pass.
    if isinstance(arrays, Mapping):
        fields = list(arrays.keys())
        values = list(arrays.values())
    else:
        fields = None
        values = list(arrays)

    with HighLevelContext(behavior=behavior, attrs=attrs) as ctx:
        layouts = ensure_same_backend(
            *(
                ctx.unwrap(
                    x,
                    allow_record=False,
                    allow_unknown=False,
                    none_policy="pass-through",
                    primitive_policy="pass-through",
                )
                for x in values
            )
        )

        # propagate named axis from input to output,
        #   use strategy "unify" (see: awkward._namedaxis)
        out_named_axis = reduce(_unify_named_axis, map(_get_named_axis, values))

    # determine backend
    # NOTE(review): the "cpu" fallback is a plain string, not a backend object.
    # It is only used when no layout has a `backend`, in which case the type
    # checks below raise before it is consulted.
    backend = next((b.backend for b in layouts if hasattr(b, "backend")), "cpu")

    if with_name is not None:
        # copy so the caller's `parameters` dict is never mutated
        parameters = {} if parameters is None else dict(parameters)
        parameters["__record__"] = with_name

    # only allow all NumpyArrays and ListOffsetArrays
    if all(isinstance(layout, ak.contents.NumpyArray) for layout in layouts):
        length = _check_equal_lengths(layouts)
        out = ak.contents.RecordArray(
            layouts, fields, length=length, parameters=parameters, backend=backend
        )
    elif all(isinstance(layout, ak.contents.ListOffsetArray) for layout in layouts):
        contents = []
        for layout in layouts:
            # get the content of the ListOffsetArray
            if not isinstance(layout.content, ak.contents.NumpyArray):
                raise ValueError(
                    "can not (unsafe) zip ListOffsetArrays with non-NumpyArray contents"
                )
            contents.append(layout.content)

        if backend.name == "typetracer":
            # just get from the first one
            # we're in typetracer mode, so we can't check the offsets
            offsets = layouts[0].offsets
        else:
            # this is at 'runtime' with actual data, so the offsets can be checked
            offsets = _common_offsets(layouts)

        length = _check_equal_lengths(contents)
        out = ak.contents.ListOffsetArray(
            offsets=offsets,
            content=ak.contents.RecordArray(
                contents, fields, length=length, parameters=parameters, backend=backend
            ),
        )
    else:
        raise ValueError(
            "all array layouts must be either NumpyArrays or ListOffsetArrays"
        )

    # Attach the named axes unified from the inputs (no broadcasting happens here)
    wrapped_out = ctx.wrap(out, highlevel=highlevel)
    return ak.operations.ak_with_named_axis._impl(
        wrapped_out,
        named_axis=out_named_axis,
        highlevel=highlevel,
        behavior=ctx.behavior,
        attrs=ctx.attrs,
    )


def _common_offsets(layouts):
    """Return the offsets shared by `layouts`, raising ValueError if they differ."""
    placeholder_type = ak._nplikes.placeholder.PlaceholderArray

    # Only offsets backed by actual data can be compared. `layout.offsets` is a
    # wrapper (its buffer is read through `.data` below), so the placeholder
    # test has to look at the wrapped buffer; the test on the wrapper itself is
    # kept for backward compatibility.
    comparable = [
        offsets
        for offsets in (layout.offsets for layout in layouts)
        if not isinstance(offsets, placeholder_type)
        and not isinstance(offsets.data, placeholder_type)
    ]

    if not comparable:
        # NOTE(review): nothing can be verified without materializing the
        # buffers; fall back to the first offsets, as the typetracer path does.
        return layouts[0].offsets

    first, *others = comparable
    for other in others:
        # Compare shapes first: `==` on buffers of different lengths would
        # either fail with a broadcasting error or, for a length-1 buffer,
        # broadcast and wrongly report equality.
        if other.data.shape != first.data.shape or not first.nplike.all(
            other.data == first.data
        ):
            raise ValueError("all ListOffsetArrays must have the same offsets")
    return first


def _check_equal_lengths(
    contents: list[ak.contents.Content],
) -> int | ak._nplikes.shape.UnknownLength:
    """
    Return the length shared by every layout in `contents`.

    Args:
        contents: A non-empty, indexable collection of objects exposing a
            `length` attribute.

    Returns:
        The `length` of the first element, once every element has been
        compared against it with `!=`.

    Raises:
        ValueError: if `contents` is empty or if any element's `length`
            differs from the first one.
    """
    if len(contents) == 0:
        # Fail with an explicit message instead of an IndexError on contents[0].
        raise ValueError("at least one array is required to determine a common length")

    length = contents[0].length
    if any(layout.length != length for layout in contents):
        raise ValueError("all arrays must have the same length")
    return length
58 changes: 58 additions & 0 deletions tests/test_3390_ak_zip_no_broadcast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE
# ruff: noqa: E402

from __future__ import annotations

import awkward as ak


def test_ak_zip_no_broadcast_NumpyArray_dict():
    """Flat arrays zipped by field name give the same records as ak.zip."""
    columns = {"a": ak.Array([1]), "b": ak.Array([2])}
    zipped = ak.zip_no_broadcast(columns)
    expected = ak.zip(columns)
    assert ak.to_list(zipped) == ak.to_list(expected)


def test_ak_zip_no_broadcast_ListOffsetArray_dict():
    """List-type arrays zipped by field name give the same records as ak.zip."""
    columns = {"a": ak.Array([[1], []]), "b": ak.Array([[2], []])}
    zipped = ak.zip_no_broadcast(columns)
    expected = ak.zip(columns)
    assert ak.to_list(zipped) == ak.to_list(expected)


def test_ak_zip_no_broadcast_NumpyArray_list():
    """Flat arrays zipped positionally give the same tuples as ak.zip."""
    slots = [ak.Array([1]), ak.Array([2])]
    zipped = ak.zip_no_broadcast(slots)
    expected = ak.zip(slots)
    assert ak.to_list(zipped) == ak.to_list(expected)


def test_ak_zip_no_broadcast_ListOffsetArray_list():
    """List-type arrays zipped positionally give the same tuples as ak.zip."""
    slots = [ak.Array([[1], []]), ak.Array([[2], []])]
    zipped = ak.zip_no_broadcast(slots)
    expected = ak.zip(slots)
    assert ak.to_list(zipped) == ak.to_list(expected)


def test_typetracer_NumpyArray_non_touching():
    """Zipping a flat typetracer array reports one shape touch and no data touch."""
    form = ak.Array([1], backend="typetracer").layout.form_with_key()
    tracer, report = ak.typetracer.typetracer_with_report(form, highlevel=True)

    ak.zip_no_broadcast({"foo": tracer, "bar": tracer})

    assert len(report.shape_touched) == 1
    assert len(report.data_touched) == 0


def test_typetracer_ListOffsetArray_non_touching():
    """Zipping a list-type typetracer array reports one shape touch and no data touch."""
    form = ak.Array([[1], [], [2, 3]], backend="typetracer").layout.form_with_key()
    tracer, report = ak.typetracer.typetracer_with_report(form, highlevel=True)

    ak.zip_no_broadcast({"foo": tracer, "bar": tracer})

    assert len(report.shape_touched) == 1
    assert len(report.data_touched) == 0

0 comments on commit 7c59dba

Please sign in to comment.