-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #565 from pfackeldey/pfackeldey/manual_column_optimization
feat: add possibility to manually perform the column projection
- Loading branch information
Showing
7 changed files
with
151 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from dask_awkward.manual.column_optimization import optimize_columns |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
from __future__ import annotations | ||
|
||
from typing import cast | ||
|
||
from dask.highlevelgraph import HighLevelGraph | ||
|
||
from dask_awkward.layers.layers import AwkwardInputLayer | ||
from dask_awkward.lib.core import Array | ||
|
||
|
||
def optimize_columns(array: Array, columns: dict[str, frozenset[str]]) -> Array:
    """
    Manually project the ``AwkwardInputLayer``(s) of ``array`` onto the given columns.

    This is useful for tracing the necessary buffers for a given computation once, and
    then reusing the typetracer reports to touch only the necessary columns for other
    datasets.

    Every layer named in ``columns`` is replaced by the result of its
    ``project_manually(columns=...)`` call, i.e. the columns that are not wanted are
    pruned. This replaces the automatic column optimization, which is why one should be
    careful when using this function combined with ``.compute(optimize_graph=True)``.

    Parameters
    ----------
    array : Array
        The dask-awkward array to be optimized.
    columns : dict[str, frozenset[str]]
        Mapping from the name of an input layer in ``array.dask`` to the columns
        to be touched in that layer.

    Returns
    -------
    Array
        A new Dask-Awkward array with only the specified columns.

    Raises
    ------
    TypeError
        If ``array`` is not a ``dask_awkward.Array``, or if a layer named in
        ``columns`` is not an ``AwkwardInputLayer``.
    KeyError
        If a key of ``columns`` is not the name of a layer in ``array.dask``.
    """
    if not isinstance(array, Array):
        raise TypeError(
            f"Expected `array` to be of type `dask_awkward.Array`, got {type(array)}"
        )

    dsk = array.dask
    # Shallow copies: layers are swapped in these dicts, the graph held by
    # `array` itself is not modified.
    layers = dict(dsk.layers)
    deps = dict(dsk.dependencies)

    for name, cols in columns.items():
        try:
            layer = layers[name]
        except KeyError:
            raise KeyError(
                f"Layer {name!r} not found in the graph of `array`; "
                f"available layers: {list(layers)}"
            ) from None

        # `cast` only informs the static type checker; the `isinstance` check
        # below is the actual runtime guard.
        io_layer = cast(AwkwardInputLayer, layer)
        if not isinstance(io_layer, AwkwardInputLayer):
            raise TypeError(
                f"Expected layer {name} to be of type `dask_awkward.layers.AwkwardInputLayer`, got {type(io_layer)}"
            )
        projected_layer = io_layer.project_manually(columns=cols)

        # Explicitly disable 'project-ability' now, since the projection was
        # just done manually.
        # TODO: is there a better way to do this? It disables the possibility
        # to chain calls of `dak.manual.optimize_columns`.
        projected_layer.is_projectable = False

        layers[name] = projected_layer

    new_dsk = HighLevelGraph(layers, deps)
    return array._rebuild(dsk=new_dsk)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
from __future__ import annotations | ||
|
||
import awkward as ak | ||
import numpy as np | ||
import pytest | ||
|
||
import dask_awkward as dak | ||
|
||
|
||
def test_optimize_columns():
    """Check that manual column projection controls which buffers get read.

    First only the columns reported as necessary for ``array.u4`` are
    projected, then ``u8`` is added by hand; in both cases the resulting
    array still exposes both fields.
    """
    # Reading the remote parquet sample needs these optional packages.
    for optional_dep in ("pyarrow", "requests", "aiohttp"):
        pytest.importorskip(optional_dep)

    def field_data(materialized, field):
        # Buffer behind `field` in the computed array's layout.
        return materialized.layout.content(field).content.data

    array = dak.from_parquet(
        "https://github.com/scikit-hep/awkward/raw/main/tests/samples/nullable-record-primitives-simple.parquet"
    )

    needs = dak.inspect.report_necessary_columns(array.u4)
    only_u4_array = dak.manual.optimize_columns(array, needs)

    assert only_u4_array.fields == ["u4", "u8"]

    materialized_only_u4_array = only_u4_array.compute()

    # u4 is materialized, u8 is not
    assert isinstance(field_data(materialized_only_u4_array, "u4"), np.ndarray)
    assert isinstance(
        field_data(materialized_only_u4_array, "u8"),
        ak._nplikes.placeholder.PlaceholderArray,
    )

    # now again, but we add 'u8' by hand to the columns
    layer_name, wanted_columns = needs.popitem()
    wanted_columns |= {"u8"}
    needs = {layer_name: wanted_columns}

    u4_and_u8_array = dak.manual.optimize_columns(array, needs)

    assert u4_and_u8_array.fields == ["u4", "u8"]

    materialized_u4_and_u8_array = u4_and_u8_array.compute()

    # now u4 and u8 are materialized
    assert isinstance(field_data(materialized_u4_and_u8_array, "u4"), np.ndarray)
    assert isinstance(field_data(materialized_u4_and_u8_array, "u8"), np.ndarray)