Skip to content

Commit

Permalink
Merge pull request #1042 from lsst/tickets/DM-45489
Browse files Browse the repository at this point in the history
DM-45489: Add more test coverage for RemoteButler queries
  • Loading branch information
dhirving authored Jul 31, 2024
2 parents 88f4f2d + 66e3efc commit c108883
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 46 deletions.
3 changes: 3 additions & 0 deletions python/lsst/daf/butler/queries/convert_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from collections.abc import Mapping, Set
from typing import Any

from .._exceptions import InvalidQueryError
from ..dimensions import DataCoordinate, DataId, DimensionGroup
from ._expression_strings import convert_expression_string_to_predicate
from ._identifiers import IdentifierContext, interpret_identifier
Expand Down Expand Up @@ -147,6 +148,8 @@ def convert_order_by_args(
if arg.startswith("-"):
reverse = True
arg = arg[1:]
if len(arg) == 0:
raise InvalidQueryError("Empty dimension name in ORDER BY")
arg = interpret_identifier(context, arg)
if reverse:
arg = Reversed(operand=arg)
Expand Down
92 changes: 67 additions & 25 deletions python/lsst/daf/butler/registry/tests/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -1378,12 +1378,20 @@ def testSkyMapDimensions(self):
self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 4, 6, 7))
self.assertCountEqual({dataId["band"] for dataId in rows}, ("i",))

# Specifying non-existing skymap is an exception
with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
rows = registry.queryDataIds(
def do_query():
return registry.queryDataIds(
dimensions, datasets=[calexpType, mergeType], collections=run, where="skymap = 'Mars'"
).toSet()

if self.supportsQueryGovernorValidation:
# Specifying non-existing skymap is an exception
with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
do_query()
else:
# New query system returns zero rows instead of raising an
# exception.
self.assertEqual(len(do_query()), 0)

def testSpatialJoin(self):
"""Test queries that involve spatial overlap joins."""
registry = self.makeRegistry()
Expand Down Expand Up @@ -2663,11 +2671,12 @@ def testSkipCalibs(self):
)
with self.assertRaises(exception_type):
list(registry.queryDatasets("bias", collections=coll_list, findFirst=True))
# explicit list will raise if there are temporal dimensions
with self.assertRaises(NotImplementedError):
registry.queryDataIds(
["instrument", "detector", "exposure"], datasets="bias", collections=coll_list
).count()
if not self.supportsCalibrationCollectionInFindFirst:
# explicit list will raise if there are temporal dimensions
with self.assertRaises(NotImplementedError):
registry.queryDataIds(
["instrument", "detector", "exposure"], datasets="bias", collections=coll_list
).count()

# chain will skip
datasets = list(registry.queryDatasets("bias", collections=chain))
Expand Down Expand Up @@ -3008,7 +3017,7 @@ def testQueryResultSummaries(self):
]
with self.assertRaises(MissingDatasetTypeError):
# Dataset type name doesn't match any existing dataset types.
registry.queryDataIds(["detector"], datasets=["nonexistent"], collections=...)
list(registry.queryDataIds(["detector"], datasets=["nonexistent"], collections=...))
with self.assertRaises(MissingDatasetTypeError):
# Dataset type name doesn't match any existing dataset types.
registry.queryDimensionRecords("detector", datasets=["nonexistent"], collections=...).any()
Expand Down Expand Up @@ -3138,12 +3147,20 @@ def testQueryDataIdsExpressionError(self):
registry = self.makeRegistry()
self.loadData(registry, "base.yaml")
bind = {"time": astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai")}
with self.assertRaisesRegex(LookupError, r"No dimension element with name 'foo' in 'foo\.bar'\."):
registry.queryDataIds(["detector"], where="foo.bar = 12")
# The diagnostics raised are slightly different between the old query
# system (ValueError, first error string) and the new query system
# (InvalidQueryError, second error string).
with self.assertRaisesRegex(
LookupError, "Dimension element name cannot be inferred in this context."
(LookupError, InvalidQueryError),
r"(No dimension element with name 'foo' in 'foo\.bar'\.)|(Unrecognized identifier 'foo.bar')",
):
registry.queryDataIds(["detector"], where="timespan.end < time", bind=bind)
list(registry.queryDataIds(["detector"], where="foo.bar = 12"))
with self.assertRaisesRegex(
(LookupError, InvalidQueryError),
"(Dimension element name cannot be inferred in this context.)"
"|(Unrecognized identifier 'timespan')",
):
list(registry.queryDataIds(["detector"], where="timespan.end < time", bind=bind))

def testQueryDataIdsOrderBy(self):
"""Test order_by and limit on result returned by queryDataIds()."""
Expand Down Expand Up @@ -3229,42 +3246,67 @@ def do_query(dimensions=("visit", "tract"), datasets=None, collections=None):
with query.materialize():
pass

# errors in a name
# Test exceptions for errors in a name.
# Many of these raise slightly different diagnostics in the old query
# system (ValueError, first error string) than the new query system
# (InvalidQueryError, second error string).
for order_by in ("", "-"):
with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"):
with self.assertRaisesRegex((ValueError, InvalidQueryError), "Empty dimension name in ORDER BY"):
list(do_query().order_by(order_by))

for order_by in ("undimension.name", "-undimension.name"):
with self.assertRaisesRegex(ValueError, "Unknown dimension element 'undimension'"):
with self.assertRaisesRegex(
(ValueError, InvalidQueryError),
"(Unknown dimension element 'undimension')|(Unrecognized identifier 'undimension.name')",
):
list(do_query().order_by(order_by))

for order_by in ("attract", "-attract"):
with self.assertRaisesRegex(ValueError, "Metadata 'attract' cannot be found in any dimension"):
with self.assertRaisesRegex(
(ValueError, InvalidQueryError),
"(Metadata 'attract' cannot be found in any dimension)|(Unrecognized identifier 'attract')",
):
list(do_query().order_by(order_by))

with self.assertRaisesRegex(ValueError, "Metadata 'exposure_time' exists in more than one dimension"):
with self.assertRaisesRegex(
(ValueError, InvalidQueryError),
"(Metadata 'exposure_time' exists in more than one dimension)"
"|(Ambiguous identifier 'exposure_time' matches multiple fields)",
):
list(do_query(("exposure", "visit")).order_by("exposure_time"))

with self.assertRaisesRegex(
ValueError,
r"Timespan exists in more than one dimension element \(day_obs, exposure, visit\); "
r"qualify timespan with specific dimension name\.",
(ValueError, InvalidQueryError),
r"(Timespan exists in more than one dimension element \(day_obs, exposure, visit\); "
r"qualify timespan with specific dimension name\.)|"
r"(Ambiguous identifier 'timespan' matches multiple fields)",
):
list(do_query(("exposure", "visit")).order_by("timespan.begin"))

with self.assertRaisesRegex(
ValueError, "Cannot find any temporal dimension element for 'timespan.begin'"
(ValueError, InvalidQueryError),
"(Cannot find any temporal dimension element for 'timespan.begin')"
"|(Unrecognized identifier 'timespan')",
):
list(do_query("tract").order_by("timespan.begin"))

with self.assertRaisesRegex(ValueError, "Cannot use 'timespan.begin' with non-temporal element"):
with self.assertRaisesRegex(
(ValueError, InvalidQueryError),
"(Cannot use 'timespan.begin' with non-temporal element)"
"|(Unrecognized field 'timespan' for tract)",
):
list(do_query("tract").order_by("tract.timespan.begin"))

with self.assertRaisesRegex(ValueError, "Field 'name' does not exist in 'tract'."):
with self.assertRaisesRegex(
(ValueError, InvalidQueryError),
"(Field 'name' does not exist in 'tract')" "|(Unrecognized field 'name' for tract.)",
):
list(do_query("tract").order_by("tract.name"))

with self.assertRaisesRegex(
ValueError, r"Unknown dimension element 'timestamp'; perhaps you meant 'timespan.begin'\?"
(ValueError, InvalidQueryError),
r"(Unknown dimension element 'timestamp'; perhaps you meant 'timespan.begin'\?)"
r"|(Unrecognized identifier 'timestamp.begin')",
):
list(do_query("visit").order_by("timestamp.begin"))

Expand Down
4 changes: 4 additions & 0 deletions python/lsst/daf/butler/remote_butler/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
Registry,
RegistryDefaults,
)
from ..registry._exceptions import ArgumentError
from ..registry.queries import (
ChainedDatasetQueryResults,
DataCoordinateQueryResults,
Expand Down Expand Up @@ -448,6 +449,9 @@ def queryDataIds(
check: bool = True,
**kwargs: Any,
) -> DataCoordinateQueryResults:
if collections is not None and datasets is None:
raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")

dimensions = self.dimensions.conform(dimensions)
args = self._convert_common_query_arguments(
dataId=dataId, where=where, bind=bind, kwargs=kwargs, datasets=datasets, collections=collections
Expand Down
52 changes: 31 additions & 21 deletions python/lsst/daf/butler/tests/hybrid_butler_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from __future__ import annotations

import contextlib
from collections.abc import Iterable, Iterator, Mapping, Sequence
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
from typing import Any, cast

from .._dataset_association import DatasetAssociation
Expand Down Expand Up @@ -312,17 +312,6 @@ def queryDataIds(
check: bool = True,
**kwargs: Any,
) -> DataCoordinateQueryResults:
direct = self._direct.queryDataIds(
dimensions,
dataId=dataId,
datasets=datasets,
collections=collections,
where=where,
bind=bind,
check=check,
**kwargs,
)

remote = self._remote.queryDataIds(
dimensions,
dataId=dataId,
Expand All @@ -334,8 +323,25 @@ def queryDataIds(
**kwargs,
)

# Defer creation of the DirectButler version until we really need the
# object for handling an unimplemented method. This avoids masking of
# missing exception handling in the RemoteButler side -- otherwise
# exceptions from DirectButler would cause tests to pass.
def create_direct_result() -> DataCoordinateQueryResults:
return self._direct.queryDataIds(
dimensions,
dataId=dataId,
datasets=datasets,
collections=collections,
where=where,
bind=bind,
check=check,
**kwargs,
)

return cast(
DataCoordinateQueryResults, _HybridDataCoordinateQueryResults(direct=direct, remote=remote)
DataCoordinateQueryResults,
_HybridDataCoordinateQueryResults(direct=create_direct_result, remote=remote),
)

def queryDimensionRecords(
Expand Down Expand Up @@ -388,7 +394,9 @@ class _HybridDataCoordinateQueryResults:
provide a few methods that aren't implemented yet.
"""

def __init__(self, *, direct: DataCoordinateQueryResults, remote: DataCoordinateQueryResults) -> None:
def __init__(
self, *, direct: Callable[[], DataCoordinateQueryResults], remote: DataCoordinateQueryResults
) -> None:
self._direct = direct
self._remote = remote

Expand All @@ -401,20 +409,20 @@ def __iter__(self) -> Iterator[DataCoordinate]:

def order_by(self, *args: str) -> _HybridDataCoordinateQueryResults:
return _HybridDataCoordinateQueryResults(
direct=self._direct.order_by(*args), remote=self._remote.order_by(*args)
direct=lambda: self._direct().order_by(*args), remote=self._remote.order_by(*args)
)

def limit(self, limit: int, offset: int | None = 0) -> _HybridDataCoordinateQueryResults:
return _HybridDataCoordinateQueryResults(
direct=self._direct.limit(limit, offset), remote=self._remote.limit(limit, offset)
direct=lambda: self._direct().limit(limit, offset), remote=self._remote.limit(limit, offset)
)

def materialize(self) -> contextlib.AbstractContextManager[DataCoordinateQueryResults]:
return self._direct.materialize()
return self._direct().materialize()

def expanded(self) -> _HybridDataCoordinateQueryResults:
return _HybridDataCoordinateQueryResults(
remote=self._remote.expanded(), direct=self._direct.expanded()
remote=self._remote.expanded(), direct=lambda: self._direct().expanded()
)

def subset(
Expand All @@ -424,7 +432,7 @@ def subset(
unique: bool = False,
) -> _HybridDataCoordinateQueryResults:
return _HybridDataCoordinateQueryResults(
direct=self._direct.subset(dimensions, unique=unique),
direct=lambda: self._direct().subset(dimensions, unique=unique),
remote=self._remote.subset(dimensions, unique=unique),
)

Expand All @@ -436,7 +444,9 @@ def findDatasets(
findFirst: bool = True,
components: bool = False,
) -> ParentDatasetQueryResults:
return self._direct.findDatasets(datasetType, collections, findFirst=findFirst, components=components)
return self._direct().findDatasets(
datasetType, collections, findFirst=findFirst, components=components
)

def findRelatedDatasets(
self,
Expand All @@ -446,6 +456,6 @@ def findRelatedDatasets(
findFirst: bool = True,
dimensions: DimensionGroup | Iterable[str] | None = None,
) -> Iterable[tuple[DataCoordinate, DatasetRef]]:
return self._direct.findRelatedDatasets(
return self._direct().findRelatedDatasets(
datasetType, collections, findFirst=findFirst, dimensions=dimensions
)

0 comments on commit c108883

Please sign in to comment.