Rewrite and disable Butler._query interfaces.
This includes:

- replacing the Query and *QueryResults ABCs with concrete classes
  that delegate to another QueryDriver ABC (a minimal sketch of this
  pattern follows this list);

- substantially reworking the public Query and *QueryResults
  interfaces, mostly to minimize the number of different ways to do
  various things (and hence limit complexity);

- adding a large suite of Pydantic models that can describe complex
  under-construction queries, allowing us to send them over the wire
  in RemoteButler.
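
A minimal sketch of the delegation pattern named above, with
simplified, hypothetical names (the real interfaces live in
lsst.daf.butler.queries and are considerably richer):

    # Hypothetical sketch only: the public Query class is concrete and
    # accumulates a serializable (Pydantic) description of the query,
    # which it hands to an abstract QueryDriver to execute.
    from abc import ABC, abstractmethod
    from collections.abc import Iterator
    from typing import Any

    import pydantic


    class QueryTree(pydantic.BaseModel):
        """Wire-friendly description of an under-construction query,
        so RemoteButler can ship it to a server."""

        dimensions: list[str] = []
        where: str = ""


    class QueryDriver(ABC):
        @abstractmethod
        def execute(self, tree: QueryTree) -> Iterator[Any]:
            """Run the described query and yield result rows."""


    class Query:
        """Concrete public class; builder methods return new objects."""

        def __init__(self, driver: QueryDriver, tree: QueryTree | None = None):
            self._driver = driver
            self._tree = tree if tree is not None else QueryTree()

        def where(self, expression: str) -> "Query":
            return Query(
                self._driver,
                self._tree.model_copy(update={"where": expression}),
            )

        def __iter__(self) -> Iterator[Any]:
            return self._driver.execute(self._tree)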

Because QueryDriver doesn't have any concrete implementations yet,
this change means Butler._query no longer works at all (previously it
delegated to the old registry.queries system).  A QueryDriver
implementation for DirectButler has been largely implemented on
another branch and will be added later.

For now, the only tests are those that rely on a mocked QueryDriver
(or don't require one at all); a sketch of that mocking approach
follows this list.  These are in two files:

- test_query_interfaces.py tests the public interface objects,
  including the semi-public Pydantic models;

- test_query_utilities.py tests some utility classes (ColumnSet and
  OverlapsVisitor) that are expected to be used by all driver
  implementations to establish some behavioral invariants.
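
As a purely hypothetical illustration of the mocked-driver approach
(the module path and signatures are assumptions, not guaranteed by
this commit):

    # Hypothetical test sketch; unittest.mock.create_autospec builds a
    # stub satisfying the QueryDriver ABC without any database.
    import unittest.mock

    from lsst.daf.butler.queries.driver import QueryDriver  # assumed path

    driver = unittest.mock.create_autospec(QueryDriver, instance=True)
    # Interface objects constructed over `driver` can then be exercised,
    # and the query descriptions they would send can be asserted on.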

There is already substantial duplication with code in
lsst.daf.butler.registry.queries, and that will get worse when a
direct-SQL driver class is added.  Eventually the plan is to retire
almost all of lsst.daf.butler.registry.queries (except the
string-expression parser, which we'll move later), making the public
registry query interfaces delegate to lsst.daf.butler.queries instead,
but that will require both getting the latter fully functional and
RFC'ing the removal of some features we have no intention of
supporting in the new system.
TallJimbo committed Feb 26, 2024
1 parent 8672db3 commit 97d3e88
Showing 31 changed files with 8,840 additions and 1,608 deletions.
12 changes: 10 additions & 2 deletions python/lsst/daf/butler/__init__.py
@@ -60,8 +60,6 @@
 from ._named import *
 from ._quantum import *
 from ._quantum_backed import *
-from ._query import *
-from ._query_results import *
 from ._storage_class import *
 from ._storage_class_delegate import *
 from ._timespan import *
@@ -80,6 +78,16 @@
 # Only lift 'Progress' from 'progess'; the module is imported as-is above
 from .progress import Progress
 
+# Only import the main public symbols from queries
+from .queries import (
+    ChainedDatasetQueryResults,
+    DataCoordinateQueryResults,
+    DatasetQueryResults,
+    DimensionRecordQueryResults,
+    Query,
+    SingleTypeDatasetQueryResults,
+)
+
 # Do not import or lift symbols from 'server' or 'server_models'.
 # Import the registry subpackage directly for other symbols.
 from .registry import (
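After this change, the lifted query symbols are importable from the
package root; for example (a trivial sketch):

    # The public query interface objects now come from the top-level
    # package rather than lsst.daf.butler.registry.
    from lsst.daf.butler import (
        DataCoordinateQueryResults,
        DimensionRecordQueryResults,
        Query,
    )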
109 changes: 72 additions & 37 deletions python/lsst/daf/butler/_butler.py
@@ -32,16 +32,19 @@
 from abc import abstractmethod
 from collections.abc import Collection, Iterable, Mapping, Sequence
 from contextlib import AbstractContextManager
+from types import EllipsisType
 from typing import TYPE_CHECKING, Any, TextIO
 
 from lsst.resources import ResourcePath, ResourcePathExpression
 from lsst.utils import doImportType
+from lsst.utils.iteration import ensure_iterable
 from lsst.utils.logging import getLogger
 
 from ._butler_config import ButlerConfig, ButlerType
 from ._butler_instance_options import ButlerInstanceOptions
 from ._butler_repo_index import ButlerRepoIndex
 from ._config import Config, ConfigSubset
+from ._exceptions import EmptyQueryResultError
 from ._limited_butler import LimitedButler
 from .datastore import Datastore
 from .dimensions import DimensionConfig
@@ -54,12 +57,12 @@
 from ._dataset_type import DatasetType
 from ._deferredDatasetHandle import DeferredDatasetHandle
 from ._file_dataset import FileDataset
-from ._query import Query
 from ._storage_class import StorageClass
 from ._timespan import Timespan
 from .datastore import DatasetRefURIs
 from .dimensions import DataCoordinate, DataId, DimensionGroup, DimensionRecord
-from .registry import CollectionArgType, Registry
+from .queries import Query
+from .registry import Registry
 from .transfers import RepoExportContext
 
 _LOG = getLogger(__name__)
@@ -1428,15 +1431,14 @@ def _query(self) -> AbstractContextManager[Query]:
"""
raise NotImplementedError()

@abstractmethod
def _query_data_ids(
self,
dimensions: DimensionGroup | Iterable[str] | str,
*,
data_id: DataId | None = None,
where: str = "",
bind: Mapping[str, Any] | None = None,
expanded: bool = False,
with_dimension_records: bool = False,
order_by: Iterable[str] | str | None = None,
limit: int | None = None,
offset: int = 0,
@@ -1466,7 +1468,7 @@ def _query_data_ids(
             Values of collection type can be expanded in some cases; see
             :ref:`daf_butler_dimension_expressions_identifiers` for more
             information.
-        expanded : `bool`, optional
+        with_dimension_records : `bool`, optional
             If `True` (default is `False`) then returned data IDs will have
             dimension records.
         order_by : `~collections.abc.Iterable` [`str`] or `str`, optional
@@ -1511,19 +1513,32 @@
             Raised when the arguments are incompatible, e.g. ``offset`` is
             specified, but ``limit`` is not.
         """
-        raise NotImplementedError()
+        if data_id is None:
+            data_id = DataCoordinate.make_empty(self.dimensions)
+        with self._query() as query:
+            result = (
+                query.where(data_id, where, bind=bind, **kwargs)
+                .data_ids(dimensions)
+                .order_by(*ensure_iterable(order_by))
+                .limit(limit, offset)
+            )
+            if with_dimension_records:
+                result = result.with_dimension_records()
+            data_ids = list(result)
+            if explain and not data_ids:
+                raise EmptyQueryResultError(list(result.explain_no_results()))
+            return data_ids
 
-    @abstractmethod
     def _query_datasets(
         self,
-        dataset_type: Any,
-        collections: CollectionArgType | None = None,
+        dataset_type: str | Iterable[str] | DatasetType | Iterable[DatasetType] | EllipsisType,
+        collections: str | Iterable[str] | None = None,
         *,
         find_first: bool = True,
         data_id: DataId | None = None,
         where: str = "",
         bind: Mapping[str, Any] | None = None,
-        expanded: bool = False,
+        with_dimension_records: bool = False,
         explain: bool = True,
         **kwargs: Any,
     ) -> list[DatasetRef]:
@@ -1533,17 +1548,13 @@ def _query_datasets(
         ----------
         dataset_type : dataset type expression
             An expression that fully or partially identifies the dataset types
-            to be queried. Allowed types include `DatasetType`, `str`,
-            `re.Pattern`, and iterables thereof. The special value ``...`` can
-            be used to query all dataset types. See
-            :ref:`daf_butler_dataset_type_expressions` for more information.
+            to be queried. Allowed types include `DatasetType`, `str`, and
+            iterables thereof. The special value ``...`` can be used to query
+            all dataset types. See :ref:`daf_butler_dataset_type_expressions`
+            for more information.
         collections : collection expression, optional
-            An expression that identifies the collections to search, such as a
-            `str` (for full matches or partial matches via globs), `re.Pattern`
-            (for partial matches), or iterable thereof. ``...`` can be used to
-            search all collections (actually just all `~CollectionType.RUN`
-            collections, because this will still find all datasets).
-            If not provided, the default collections are used. See
+            A collection name or iterable of collection names to search. If not
+            provided, the default collections are used. See
             :ref:`daf_butler_collection_expressions` for more information.
         find_first : `bool`, optional
             If `True` (default), for each result data ID, only yield one
@@ -1552,20 +1563,20 @@
             order of ``collections`` passed in). If `True`, ``collections``
             must not contain regular expressions and may not be ``...``.
         data_id : `dict` or `DataCoordinate`, optional
-            A data ID whose key-value pairs are used as equality constraints
-            in the query.
+            A data ID whose key-value pairs are used as equality constraints in
+            the query.
         where : `str`, optional
-            A string expression similar to a SQL WHERE clause. May involve
-            any column of a dimension table or (as a shortcut for the primary
-            key column of a dimension table) dimension name. See
+            A string expression similar to a SQL WHERE clause. May involve any
+            column of a dimension table or (as a shortcut for the primary key
+            column of a dimension table) dimension name. See
             :ref:`daf_butler_dimension_expressions` for more information.
         bind : `~collections.abc.Mapping`, optional
             Mapping containing literal values that should be injected into the
-            ``where`` expression, keyed by the identifiers they replace.
-            Values of collection type can be expanded in some cases; see
+            ``where`` expression, keyed by the identifiers they replace. Values
+            of collection type can be expanded in some cases; see
             :ref:`daf_butler_dimension_expressions_identifiers` for more
             information.
-        expanded : `bool`, optional
+        with_dimension_records : `bool`, optional
             If `True` (default is `False`) then returned data IDs will have
             dimension records.
         explain : `bool`, optional
@@ -1586,8 +1597,8 @@
             IDs are guaranteed to include values for all implied dimensions
             (i.e. `DataCoordinate.hasFull` will return `True`), but will not
             include dimension records (`DataCoordinate.hasRecords` will be
-            `False`) unless `~.queries.DatasetQueryResults.expanded` is
-            called on the result object (which returns a new one).
+            `False`) unless `~.queries.DatasetQueryResults.expanded` is called
+            on the result object (which returns a new one).
 
         Raises
         ------
@@ -1609,14 +1620,26 @@
 
         Notes
         -----
-        When multiple dataset types are queried in a single call, the
-        results of this operation are equivalent to querying for each dataset
-        type separately in turn, and no information about the relationships
-        between datasets of different types is included.
+        When multiple dataset types are queried in a single call, the results
+        of this operation are equivalent to querying for each dataset type
+        separately in turn, and no information about the relationships between
+        datasets of different types is included.
         """
-        raise NotImplementedError()
+        if data_id is None:
+            data_id = DataCoordinate.make_empty(self.dimensions)
+        with self._query() as query:
+            result = query.where(data_id, where, bind=bind, **kwargs).datasets(
+                dataset_type,
+                collections=collections,
+                find_first=find_first,
+            )
+            if with_dimension_records:
+                result = result.with_dimension_records()
+            refs = list(result)
+            if explain and not refs:
+                raise EmptyQueryResultError(list(result.explain_no_results()))
+            return refs
 
-    @abstractmethod
     def _query_dimension_records(
         self,
         element: str,
@@ -1691,7 +1714,19 @@ def _query_dimension_records(
             when ``collections`` is `None` and default butler collections are
             not defined.
         """
-        raise NotImplementedError()
+        if data_id is None:
+            data_id = DataCoordinate.make_empty(self.dimensions)
+        with self._query() as query:
+            result = (
+                query.where(data_id, where, bind=bind, **kwargs)
+                .dimension_records(element)
+                .order_by(*ensure_iterable(order_by))
+                .limit(limit, offset)
+            )
+            dimension_records = list(result)
+            if explain and not dimension_records:
+                raise EmptyQueryResultError(list(result.explain_no_results()))
+            return dimension_records
 
     @abstractmethod
     def _clone(
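
Taken together, the three shims above pin down the shape of the new
fluent interface. A usage sketch distilled from that code, usable once
a QueryDriver implementation lands (repository path, data ID values,
dataset type, and collection names are placeholders; exact signatures
may differ):

    # Sketch of the fluent API as exercised by the _query_* shims above.
    from lsst.daf.butler import Butler

    butler = Butler.from_config("repo")  # placeholder repository
    with butler._query() as query:
        constrained = query.where({"instrument": "HSC"}, "visit > 100")
        # Data IDs, ordered and limited, with dimension records attached.
        data_ids = list(
            constrained.data_ids(["visit", "detector"])
            .order_by("visit")
            .limit(10, 0)
            .with_dimension_records()
        )
        # Datasets, searched with find-first semantics over collections.
        refs = list(
            constrained.datasets(
                "calexp", collections=["HSC/runs/example"], find_first=True
            )
        )
        # Dimension records for a single dimension element.
        records = list(constrained.dimension_records("visit"))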