From 00946bfbd0df8c6ce9e54bea181208642437b997 Mon Sep 17 00:00:00 2001
From: Jim Bosch
Date: Thu, 30 Nov 2023 14:51:33 -0500
Subject: [PATCH] Prototype partial(?) replacements for relation-tree objects.

The current daf_relation tree in our Query objects is too contaminated
with implementation details to serve us well in the RemoteButler client,
and we could only partially mitigate that by defining a new daf_relation
Engine. The fundamental issue is that daf_relation objects expect to
know exactly which columns they have at any given time, and that's at
odds with user expectations that columns "magically" appear whenever
they're requested (e.g. referenced by a `where` string) and that the
tables that provide them are joined in as needed.

The two new files here *heavily* duplicate stuff in daf_relation, and in
addition to being more vague about what the columns are, they're simpler
for two key reasons:

- They're just data, with no logic for maintaining invariants,
  constructing trees, serialization, or anything else. This will have
  to change as we actually start to use them.

- They fully enumerate the kinds of expressions and operations we care
  about in the butler query system rather than trying to define abstract
  versions of those upstream of daf_butler that could be specialized in
  daf_butler.

I had not appreciated how much of a simplification this could be when
writing daf_relation as a separate package, and if it holds up it may
suggest that the right way to resolve the duplication is to rip a lot of
stuff out of daf_relation.
---
 python/lsst/daf/butler/queries/__init__.py    |  26 ++
 .../butler/queries/abstract_expressions.py    | 212 +++++++++++++
 .../daf/butler/queries/abstract_relations.py  | 282 ++++++++++++++++++
 3 files changed, 520 insertions(+)
 create mode 100644 python/lsst/daf/butler/queries/__init__.py
 create mode 100644 python/lsst/daf/butler/queries/abstract_expressions.py
 create mode 100644 python/lsst/daf/butler/queries/abstract_relations.py

diff --git a/python/lsst/daf/butler/queries/__init__.py b/python/lsst/daf/butler/queries/__init__.py
new file mode 100644
index 0000000000..36d1294e36
--- /dev/null
+++ b/python/lsst/daf/butler/queries/__init__.py
@@ -0,0 +1,26 @@
+# This file is part of daf_butler.
+#
+# Developed for the LSST Data Management System.
+# This product includes software developed by the LSST Project
+# (http://www.lsst.org).
+# See the COPYRIGHT file at the top-level directory of this distribution
+# for details of code ownership.
+#
+# This software is dual licensed under the GNU General Public License and also
+# under a 3-clause BSD license. Recipients may choose which of these licenses
+# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+# respectively. If you choose the GPL option then the following text applies
+# (but note that there is still no warranty even if you opt for BSD instead):
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
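A quick illustration of the "just data" point above (a sketch for reviewers, not part of the patch): a constraint such as instrument = 'HSC' AND detector.purpose = 'SCIENCE' could be spelled out by hand from the classes added in abstract_expressions.py below. It assumes the new module imports as shown and that the existing column tags (DimensionKeyColumnTag, DimensionRecordColumnTag) take the dimension/element and column names as positional arguments.

    from lsst.daf.butler._column_tags import DimensionKeyColumnTag, DimensionRecordColumnTag
    from lsst.daf.butler.queries.abstract_expressions import (
        ColumnLiteral,
        ColumnReference,
        Comparison,
        LogicalAnd,
    )

    # The whole tree is plain frozen dataclasses: no engine, no column
    # bookkeeping, no tree-building helpers.
    predicate = LogicalAnd(
        operands=(
            Comparison(
                a=ColumnReference(DimensionKeyColumnTag("instrument")),
                b=ColumnLiteral("HSC"),
                operator="=",
            ),
            Comparison(
                a=ColumnReference(DimensionRecordColumnTag("detector", "purpose")),
                b=ColumnLiteral("SCIENCE"),
                operator="=",
            ),
        )
    )
    # Nothing here checks whether detector.purpose is actually available in the
    # relation this predicate eventually gets attached to; that resolution is
    # deliberately deferred, which is the point of keeping these objects vague
    # about their columns.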
diff --git a/python/lsst/daf/butler/queries/abstract_expressions.py b/python/lsst/daf/butler/queries/abstract_expressions.py
new file mode 100644
index 0000000000..ee1160329a
--- /dev/null
+++ b/python/lsst/daf/butler/queries/abstract_expressions.py
@@ -0,0 +1,212 @@
+# This file is part of daf_butler.
+#
+# Developed for the LSST Data Management System.
+# This product includes software developed by the LSST Project
+# (http://www.lsst.org).
+# See the COPYRIGHT file at the top-level directory of this distribution
+# for details of code ownership.
+#
+# This software is dual licensed under the GNU General Public License and also
+# under a 3-clause BSD license. Recipients may choose which of these licenses
+# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+# respectively. If you choose the GPL option then the following text applies
+# (but note that there is still no warranty even if you opt for BSD instead):
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+__all__ = (
+    "AbstractExpression",
+    "AbstractOrderExpression",
+    "AbstractPredicate",
+)
+
+
+import dataclasses
+from typing import Literal, TypeAlias, Union, TYPE_CHECKING
+
+import astropy.time
+
+from lsst.sphgeom import Region
+
+from ..dimensions import DataCoordinate
+from .._column_tags import DimensionKeyColumnTag, DimensionRecordColumnTag, DatasetColumnTag
+from .._timespan import Timespan
+
+if TYPE_CHECKING:
+    from .abstract_relations import AbstractRelation
+
+
+LiteralValue: TypeAlias = Union[int, bytes, str, float, astropy.time.Time, Timespan, Region]
+
+
+@dataclasses.dataclass(frozen=True)
+class ColumnLiteral:
+    """A column expression that is a literal Python value."""
+
+    value: LiteralValue
+
+
+@dataclasses.dataclass(frozen=True)
+class ColumnReference:
+    """A column expression that refers to a column obtainable from an abstract
+    relation.
+    """
+
+    column: DimensionKeyColumnTag | DimensionRecordColumnTag | DatasetColumnTag
+
+
+@dataclasses.dataclass(frozen=True)
+class UnaryExpression:
+    """A unary operation on a column expression that returns a non-bool."""
+
+    operand: AbstractExpression
+    operator: Literal["-", "begin_of", "end_of"]
+
+
+@dataclasses.dataclass(frozen=True)
+class BinaryExpression:
+    """A binary operation on column expressions that returns a non-bool."""
+
+    a: AbstractExpression
+    b: AbstractExpression
+    operator: Literal["+", "-", "*", "/", "%"]
+
+
+AbstractExpression: TypeAlias = Union[ColumnLiteral, ColumnReference, UnaryExpression, BinaryExpression]
+
+
+@dataclasses.dataclass(frozen=True)
+class Reversed:
+    """A tag wrapper for `AbstractExpression` that indicates sorting in
+    reverse order.
+    """
+
+    operand: AbstractExpression
+
+
+AbstractOrderExpression: TypeAlias = Union[AbstractExpression, Reversed]
+
+
+@dataclasses.dataclass(frozen=True)
+class LogicalAnd:
+    """A boolean column expression that is `True` only if all of its operands
+    are `True`.
+    """
+
+    operands: tuple[AbstractPredicate, ...]
+
+
+@dataclasses.dataclass(frozen=True)
+class LogicalOr:
+    """A boolean column expression that is `True` if any of its operands are
+    `True`.
+    """
+
+    operands: tuple[AbstractPredicate, ...]
+
+
+@dataclasses.dataclass(frozen=True)
+class LogicalNot:
+    """A boolean column expression that inverts its operand."""
+
+    operand: AbstractPredicate
+
+
+@dataclasses.dataclass(frozen=True)
+class IsNull:
+    """A boolean column expression that tests whether its operand is NULL."""
+
+    operand: AbstractExpression
+
+
+@dataclasses.dataclass(frozen=True)
+class Comparison:
+    """A boolean column expression formed by comparing two non-boolean
+    expressions.
+    """
+
+    a: AbstractExpression
+    b: AbstractExpression
+    operator: Literal["=", "!=", "<", ">", ">=", "<=", "overlaps"]
+
+
+@dataclasses.dataclass(frozen=True)
+class InContainer:
+    """A boolean column expression that tests whether one expression is a
+    member of an explicit sequence of other expressions.
+    """
+
+    member: AbstractExpression
+    container: tuple[AbstractExpression, ...]
+
+
+@dataclasses.dataclass(frozen=True)
+class InRange:
+    """A boolean column expression that tests whether its expression is
+    included in an integer range.
+    """
+
+    member: AbstractExpression
+    range: range
+
+
+@dataclasses.dataclass(frozen=True)
+class InRelation:
+    """A boolean column expression that tests whether its expression is
+    included in a single-column projection of a relation.
+
+    This is primarily intended to be used on dataset ID columns, but it may
+    be useful for other columns as well.
+    """
+
+    member: AbstractExpression
+    column: DimensionKeyColumnTag | DimensionRecordColumnTag | DatasetColumnTag
+    relation: AbstractRelation
+
+
+@dataclasses.dataclass(frozen=True)
+class StringPredicate:
+    """A tag wrapper for boolean column expressions created by parsing a string
+    expression.
+
+    Remembering the original string is useful for error reporting.
+    """
+
+    where: str
+    tree: AbstractPredicate
+
+
+@dataclasses.dataclass(frozen=True)
+class DataCoordinateConstraint:
+    """A boolean column expression defined by interpreting a data ID's
+    key-value pairs as a logical AND of equality constraints.
+    """
+
+    data_coordinate: DataCoordinate
+
+
+AbstractPredicate: TypeAlias = Union[
+    LogicalAnd,
+    LogicalOr,
+    LogicalNot,
+    IsNull,
+    Comparison,
+    InContainer,
+    InRange,
+    InRelation,
+    StringPredicate,
+    DataCoordinateConstraint,
+]
diff --git a/python/lsst/daf/butler/queries/abstract_relations.py b/python/lsst/daf/butler/queries/abstract_relations.py
new file mode 100644
index 0000000000..892ae74691
--- /dev/null
+++ b/python/lsst/daf/butler/queries/abstract_relations.py
@@ -0,0 +1,282 @@
+# This file is part of daf_butler.
+#
+# Developed for the LSST Data Management System.
+# This product includes software developed by the LSST Project
+# (http://www.lsst.org).
+# See the COPYRIGHT file at the top-level directory of this distribution
+# for details of code ownership.
+#
+# This software is dual licensed under the GNU General Public License and also
+# under a 3-clause BSD license. Recipients may choose which of these licenses
+# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+# respectively. If you choose the GPL option then the following text applies
+# (but note that there is still no warranty even if you opt for BSD instead):
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+__all__ = ("AbstractRelation",)
+
+
+import dataclasses
+from types import EllipsisType
+from typing import TypeAlias, Union, TYPE_CHECKING
+
+from ..dimensions import DataIdValue, DimensionGroup
+
+if TYPE_CHECKING:
+    from .abstract_expressions import AbstractPredicate, AbstractOrderExpression
+
+
+class JoinTuple(tuple[str, str]):
+    """A 2-element `tuple` of `str` used to specify a spatial or temporal join.
+
+    This is just a tuple whose elements are always in lexicographical order,
+    ensuring it can be put in a `set` without the original order of those
+    elements mattering.
+    """
+
+    def __new__(cls, a: str, b: str) -> JoinTuple:
+        if a <= b:
+            return super().__new__(cls, (a, b))  # type: ignore
+        else:
+            return super().__new__(cls, (b, a))  # type: ignore
+
+    @property
+    def a(self) -> str:
+        return self[0]
+
+    @property
+    def b(self) -> str:
+        return self[1]
+
+
+@dataclasses.dataclass(frozen=True)
+class DatasetSearch:
+    """An abstract relation that represents a query for datasets."""
+
+    dataset_type: str | EllipsisType
+    """The name of the type of datasets returned by the query.
+
+    ``...`` may be used to select all dataset types with the given
+    ``dimensions``, or all dataset types if ``dimensions`` is `None`.
+    """
+
+    collections: tuple[str, ...]
+    """The collections to search.
+
+    Order matters if this dataset type is later referenced by a `FindFirst`
+    operation. Collection wildcards are always resolved before being included
+    in a dataset search.
+    """
+
+    dimensions: DimensionGroup | None
+    """The dimensions of the dataset type.
+
+    If this is not `None`, the dimensions must match the actual dimensions of
+    the dataset type. If it is `None`, this search may include multiple
+    dataset types with different dimensions, but it will not be usable as an
+    operand in relation operations that require dimensions.
+    """
+
+
+@dataclasses.dataclass(frozen=True)
+class DataCoordinateUpload:
+    """An abstract relation that represents (and holds) user-provided data
+    ID values.
+    """
+
+    dimensions: DimensionGroup
+    """The dimensions of the data IDs."""
+
+    rows: frozenset[tuple[DataIdValue, ...]]
+    """The required values of the data IDs."""
+
+
+@dataclasses.dataclass(frozen=True)
+class DimensionJoin:
+    """An abstract relation that represents a join between dimension-element
+    tables and (optionally) other relations.
+
+    Notes
+    -----
+    Joins on dataset IDs are expected to be expressed as
+    `abstract_expressions.InRelation` predicates in `Selection` operations.
+    That is slightly more powerful (since it can do set differences via
+    `abstract_expressions.LogicalNot`) and it keeps the abstract relation tree
+    simpler if the only join constraints in play are on dimension columns.
+    """
+
+    dimensions: DimensionGroup
+    """The dimensions of the relation."""
+
+    operands: tuple[AbstractRelation, ...] = ()
+    """Relations to include in the join other than dimension-element tables.
+
+    Because dimension-element tables are expected to contain the full set of
+    values for their primary keys that could exist anywhere, they are only
+    actually joined in when resolving this abstract relation if they provide
+    a column or relationship not provided by one of these operands. For
+    example, if one operand is a `DatasetSearch` for a dataset with dimensions
+    ``{instrument, detector}``, and the dimensions here are
+    ``{instrument, physical_filter}``, there is no need to join in the
+    ``detector`` table, but the ``physical_filter`` table will be joined in.
+
+    This may only include abstract relations whose dimensions are not `None`.
+    Relations whose dimensions are *empty* may be included.
+    """
+
+    spatial: frozenset[JoinTuple] = frozenset()
+    """Pairs of dimension element names whose regions on the sky
+    must overlap.
+    """
+
+    temporal: frozenset[JoinTuple] = frozenset()
+    """Pairs of dimension element names and calibration dataset type names
+    whose timespans must overlap.
+    """
+
+
+@dataclasses.dataclass(frozen=True)
+class Selection:
+    """An abstract relation operation that filters out rows based on a
+    boolean expression.
+    """
+
+    operand: AbstractRelation
+    """Upstream relation to operate on."""
+
+    predicate: AbstractPredicate
+    """Boolean expression tree that defines the filter."""
+
+    @property
+    def dimensions(self) -> DimensionGroup | None:
+        """The dimensions of this abstract relation."""
+        return self.operand.dimensions
+
+
+@dataclasses.dataclass(frozen=True)
+class DimensionProjection:
+    """An abstract relation operation that drops dimension columns from its
+    operand.
+
+    Any dataset columns present are always preserved.
+    """
+
+    operand: AbstractRelation
+    """The upstream relation to operate on.
+
+    This must have dimensions that are not `None`.
+    """
+
+    dimensions: DimensionGroup
+    """The dimensions of the new relation.
+
+    This must be a subset of the original relation's dimensions.
+    """
+
+
+@dataclasses.dataclass(frozen=True)
+class OrderedSlice:
+    """An abstract relation operation that sorts and/or integer-slices the rows
+    of its operand.
+    """
+
+    operand: AbstractRelation
+    """The upstream relation to operate on."""
+
+    order_by: tuple[AbstractOrderExpression, ...] = ()
+    """Expressions to sort the rows by."""
+
+    begin: int = 0
+    """Index of the first row to return."""
+
+    end: int | None = None
+    """Index one past the last row to return, or `None` for no bound."""
+
+    @property
+    def dimensions(self) -> DimensionGroup | None:
+        """The dimensions of this abstract relation."""
+        return self.operand.dimensions
+
+
+@dataclasses.dataclass(frozen=True)
+class Chain:
+    """An abstract relation whose rows are the union of the rows of its
+    operands.
+    """
+
+    operands: tuple[AbstractRelation, ...]
+    """The upstream relations to combine.
+
+    Order is not necessarily preserved.
+    """
+
+    dimensions: DimensionGroup | None
+    """The dimensions of all operands as well as the result."""
+
+
+@dataclasses.dataclass(frozen=True)
+class FindFirst:
+    """An abstract relation that finds the first dataset for each data ID
+    in its ordered sequence of collections.
+
+    This operation preserves all dimension columns but drops all dataset
+    columns other than those for its target dataset type.
+    """
+
+    operand: AbstractRelation
+    """The upstream relation to operate on.
+
+    This may have more than one `DatasetSearch` joined into it (at any level),
+    as long as there is exactly one `DatasetSearch` for the ``dataset_type``
+    of this operation.
+    """
+
+    dataset_type: str
+    """The type of the datasets being searched for."""
+
+    @property
+    def dimensions(self) -> DimensionGroup | None:
+        """The dimensions of this abstract relation."""
+        return self.operand.dimensions
+
+
+@dataclasses.dataclass(frozen=True)
+class Materialization:
+    """An abstract relation that represents evaluating the upstream relation
+    and saving its rows somewhere (e.g. a temporary table or Parquet file).
+    """
+
+    operand: AbstractRelation
+    """The upstream relation to evaluate."""
+
+    @property
+    def dimensions(self) -> DimensionGroup | None:
+        """The dimensions of this abstract relation."""
+        return self.operand.dimensions
+
+
+AbstractRelation: TypeAlias = Union[
+    DatasetSearch,
+    DataCoordinateUpload,
+    DimensionJoin,
+    Selection,
+    DimensionProjection,
+    OrderedSlice,
+    Chain,
+    FindFirst,
+    Materialization,
+]
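One place the full enumeration pays off: because AbstractRelation (like AbstractPredicate) is a closed union of frozen dataclasses, downstream code can walk a tree with structural pattern matching instead of a visitor hierarchy. The helper below is a hypothetical consumer, not part of this patch; it collects the dataset type names searched anywhere in a relation tree, assuming the module lands at lsst.daf.butler.queries.abstract_relations, and it does not recurse into relations nested inside InRelation predicates.

    from lsst.daf.butler.queries.abstract_relations import (
        AbstractRelation,
        Chain,
        DataCoordinateUpload,
        DatasetSearch,
        DimensionJoin,
        DimensionProjection,
        FindFirst,
        Materialization,
        OrderedSlice,
        Selection,
    )


    def dataset_types_in(tree: AbstractRelation) -> set[str]:
        """Return the names of all dataset types searched for anywhere in a tree."""
        match tree:
            case DatasetSearch(dataset_type=name):
                # ``...`` means "all dataset types", which this summary cannot expand.
                return {name} if isinstance(name, str) else set()
            case DataCoordinateUpload():
                return set()
            case DimensionJoin(operands=operands) | Chain(operands=operands):
                result: set[str] = set()
                for operand in operands:
                    result |= dataset_types_in(operand)
                return result
            case (
                Selection(operand=operand)
                | DimensionProjection(operand=operand)
                | OrderedSlice(operand=operand)
                | FindFirst(operand=operand)
                | Materialization(operand=operand)
            ):
                # Unary operations: recurse into the single upstream relation.
                return dataset_types_in(operand)
        raise AssertionError("the cases above cover every AbstractRelation member")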