Skip to content

Commit

Permalink
WIP: new query system.
Browse files Browse the repository at this point in the history
  • Loading branch information
TallJimbo committed Jan 31, 2024
1 parent 27dec8f commit d4ad113
Show file tree
Hide file tree
Showing 21 changed files with 5,961 additions and 0 deletions.
5 changes: 5 additions & 0 deletions python/lsst/daf/butler/_query_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,11 @@ def dataset_type(self) -> DatasetType:
"""
raise NotImplementedError()

@property
def dimensions(self) -> DimensionGroup:
"""The dimensions of the dataset type returned by this query."""
return self.dataset_type.dimensions.as_group()

@property
@abstractmethod
def data_ids(self) -> DataCoordinateQueryResults:
Expand Down
32 changes: 32 additions & 0 deletions python/lsst/daf/butler/queries/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from ._base import *
from ._data_coordinate_query_results import *
from ._dataset_query_results import *
from ._dimension_record_query_results import *
from ._query import *
195 changes: 195 additions & 0 deletions python/lsst/daf/butler/queries/_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("QueryBase", "HomogeneousQueryBase", "CountableQueryBase", "QueryResultsBase")

from abc import ABC, abstractmethod
from collections.abc import Iterable, Set
from typing import Any, Self

from ..dimensions import DimensionGroup
from .convert_args import convert_order_by_args
from .driver import QueryDriver
from .expression_factory import ExpressionProxy
from .tree import OrderExpression, QueryTree


class QueryBase(ABC):
@abstractmethod
def any(self, *, execute: bool = True, exact: bool = True) -> bool:
"""Test whether the query would return any rows.
Parameters
----------
execute : `bool`, optional
If `True`, execute at least a ``LIMIT 1`` query if it cannot be
determined prior to execution that the query would return no rows.
exact : `bool`, optional
If `True`, run the full query and perform post-query filtering if
needed, until at least one result row is found. If `False`, the
returned result does not account for post-query filtering, and
hence may be `True` even when all result rows would be filtered
out.
Returns
-------
any : `bool`
`True` if the query would (or might, depending on arguments) yield
result rows. `False` if it definitely would not.
"""
raise NotImplementedError()

@abstractmethod
def explain_no_results(self, execute: bool = True) -> Iterable[str]:
"""Return human-readable messages that may help explain why the query
yields no results.
Parameters
----------
execute : `bool`, optional
If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
of aspects of the tree to more precisely determine where rows were
filtered out.
Returns
-------
messages : `~collections.abc.Iterable` [ `str` ]
String messages that describe reasons the query might not yield any
results.
"""
raise NotImplementedError()


class HomogeneousQueryBase(QueryBase):
def __init__(self, driver: QueryDriver, tree: QueryTree):
self._driver = driver
self._tree = tree

@property
def dimensions(self) -> DimensionGroup:
"""All dimensions included in the query's columns."""
return self._tree.dimensions

def any(self, *, execute: bool = True, exact: bool = True) -> bool:
# Docstring inherited.
return self._driver.any(self._tree, execute=execute, exact=exact)

def explain_no_results(self, execute: bool = True) -> Iterable[str]:
# Docstring inherited.
return self._driver.explain_no_results(self._tree, execute=execute)


class CountableQueryBase(QueryBase):
@abstractmethod
def count(self, *, exact: bool = True, discard: bool = False) -> int:
"""Count the number of rows this query would return.
Parameters
----------
exact : `bool`, optional
If `True`, run the full query and perform post-query filtering if
needed to account for that filtering in the count. If `False`, the
result may be an upper bound.
discard : `bool`, optional
If `True`, compute the exact count even if it would require running
the full query and then throwing away the result rows after
counting them. If `False`, this is an error, as the user would
usually be better off executing the query first to fetch its rows
into a new query (or passing ``exact=False``). Ignored if
``exact=False``.
Returns
-------
count : `int`
The number of rows the query would return, or an upper bound if
``exact=False``.
"""
raise NotImplementedError()


class QueryResultsBase(HomogeneousQueryBase, CountableQueryBase):
def order_by(self, *args: str | OrderExpression | ExpressionProxy) -> Self:
"""Return a new query that yields ordered results.
Parameters
----------
*args : `str`
Names of the columns/dimensions to use for ordering. Column name
can be prefixed with minus (``-``) to use descending ordering.
Returns
-------
result : `QueryResultsBase`
An ordered version of this query results object.
Notes
-----
If this method is called multiple times, the new sort terms replace
the old ones.
"""
return self._copy(
self._tree, order_by=convert_order_by_args(self.dimensions, self._get_datasets(), *args)
)

def limit(self, limit: int | None = None, offset: int = 0) -> Self:
"""Return a new query that slices its result rows positionally.
Parameters
----------
limit : `int` or `None`, optional
Upper limit on the number of returned records.
offset : `int`, optional
The number of records to skip before returning at most ``limit``
records.
Returns
-------
result : `QueryResultsBase`
A sliced version of this query results object.
Notes
-----
If this method is called multiple times, the new slice parameters
replace the old ones. Slicing always occurs after sorting, even if
`limit` is called before `order_by`.
"""
return self._copy(self._tree, limit=limit, offset=offset)

@abstractmethod
def _get_datasets(self) -> Set[str]:
"""Return all dataset types included in the query's result rows."""
raise NotImplementedError()

@abstractmethod
def _copy(self, tree: QueryTree, **kwargs: Any) -> Self:
"""Return a modified copy of ``self``.
Modifications should be validated, not assumed to be correct.
"""
raise NotImplementedError()
142 changes: 142 additions & 0 deletions python/lsst/daf/butler/queries/_data_coordinate_query_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DataCoordinateQueryResults",)

from collections.abc import Iterable, Iterator
from typing import TYPE_CHECKING, Any

from ..dimensions import DataCoordinate, DimensionGroup
from ._base import QueryResultsBase
from .driver import QueryDriver
from .tree import InvalidQueryTreeError, QueryTree

if TYPE_CHECKING:
from .result_specs import DataCoordinateResultSpec


class DataCoordinateQueryResults(QueryResultsBase):
"""A method-chaining builder for butler queries that return data IDs.
Parameters
----------
driver : `QueryDriver`
Implementation object that knows how to actually execute queries.
tree : `QueryTree`
Description of the query as a tree of joins and column expressions.
The instance returned directly by the `Butler._query` entry point
should be constructed via `make_unit_query_tree`.
spec : `DataCoordinateResultSpec`
Specification of the query result rows, including output columns,
ordering, and slicing.
Notes
-----
This refines the `DataCoordinateQueryResults` ABC defined in
`lsst.daf.butler._query_results`, but the intent is to replace that ABC
with this concrete class, rather than inherit from it.
"""

def __init__(self, driver: QueryDriver, tree: QueryTree, spec: DataCoordinateResultSpec):
spec.validate_tree(tree)
super().__init__(driver, tree)
self._spec = spec

def __iter__(self) -> Iterator[DataCoordinate]:
page = self._driver.execute(self._spec, self._tree)
yield from page.rows
while page.next_key is not None:
page = self._driver.fetch_next_page(self._spec, page.next_key)
yield from page.rows

@property
def has_dimension_records(self) -> bool:
"""Whether all data IDs in this iterable contain dimension records."""
return self._spec.include_dimension_records

def with_dimension_records(self) -> DataCoordinateQueryResults:
"""Return a results object for which `has_dimension_records` is
`True`.
"""
if self.has_dimension_records:
return self
return self._copy(tree=self._tree, include_dimension_records=True)

def subset(
self,
dimensions: DimensionGroup | Iterable[str] | None = None,
) -> DataCoordinateQueryResults:
"""Return a results object containing a subset of the dimensions of
this one.
Parameters
----------
dimensions : `DimensionGroup` or \
`~collections.abc.Iterable` [ `str`], optional
Dimensions to include in the new results object. If `None`,
``self.dimensions`` is used.
Returns
-------
results : `DataCoordinateQueryResults`
A results object corresponding to the given criteria. May be
``self`` if it already qualifies.
Raises
------
InvalidQueryTreeError
Raised when ``dimensions`` is not a subset of the dimensions in
this result.
"""
if dimensions is None:
dimensions = self.dimensions
else:
dimensions = self._driver.universe.conform(dimensions)
if not dimensions <= self.dimensions:
raise InvalidQueryTreeError(
f"New dimensions {dimensions} are not a subset of the current "
f"dimensions {self.dimensions}."
)
return self._copy(tree=self._tree, dimensions=dimensions)

def count(self, *, exact: bool = True, discard: bool = False) -> int:
# Docstring inherited.
return self._driver.count(
self._tree,
self._spec.get_result_columns(),
find_first_dataset=None,
exact=exact,
discard=discard,
)

def _copy(self, tree: QueryTree, **kwargs: Any) -> DataCoordinateQueryResults:
return DataCoordinateQueryResults(self._driver, tree, spec=self._spec.model_copy(update=kwargs))

def _get_datasets(self) -> frozenset[str]:
return frozenset()
Loading

0 comments on commit d4ad113

Please sign in to comment.