Skip to content

Commit

Permalink
Merge pull request #1042 from lsst/tickets/DM-45489
Browse files Browse the repository at this point in the history
DM-45489: Add more test coverage for RemoteButler queries
  • Loading branch information
dhirving authored Jul 31, 2024
2 parents 88f4f2d + 66e3efc commit c108883
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 46 deletions.
3 changes: 3 additions & 0 deletions python/lsst/daf/butler/queries/convert_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from collections.abc import Mapping, Set
from typing import Any

from .._exceptions import InvalidQueryError
from ..dimensions import DataCoordinate, DataId, DimensionGroup
from ._expression_strings import convert_expression_string_to_predicate
from ._identifiers import IdentifierContext, interpret_identifier
Expand Down Expand Up @@ -147,6 +148,8 @@ def convert_order_by_args(
if arg.startswith("-"):
reverse = True
arg = arg[1:]
if len(arg) == 0:
raise InvalidQueryError("Empty dimension name in ORDER BY")
arg = interpret_identifier(context, arg)
if reverse:
arg = Reversed(operand=arg)
Expand Down
92 changes: 67 additions & 25 deletions python/lsst/daf/butler/registry/tests/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -1378,12 +1378,20 @@ def testSkyMapDimensions(self):
self.assertCountEqual({dataId["patch"] for dataId in rows}, (2, 4, 6, 7))
self.assertCountEqual({dataId["band"] for dataId in rows}, ("i",))

# Specifying non-existing skymap is an exception
with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
rows = registry.queryDataIds(
def do_query():
return registry.queryDataIds(
dimensions, datasets=[calexpType, mergeType], collections=run, where="skymap = 'Mars'"
).toSet()

if self.supportsQueryGovernorValidation:
# Specifying non-existing skymap is an exception
with self.assertRaisesRegex(DataIdValueError, "Unknown values specified for governor dimension"):
do_query()
else:
# New query system returns zero rows instead of raising an
# exception.
self.assertEqual(len(do_query()), 0)

def testSpatialJoin(self):
"""Test queries that involve spatial overlap joins."""
registry = self.makeRegistry()
Expand Down Expand Up @@ -2663,11 +2671,12 @@ def testSkipCalibs(self):
)
with self.assertRaises(exception_type):
list(registry.queryDatasets("bias", collections=coll_list, findFirst=True))
# explicit list will raise if there are temporal dimensions
with self.assertRaises(NotImplementedError):
registry.queryDataIds(
["instrument", "detector", "exposure"], datasets="bias", collections=coll_list
).count()
if not self.supportsCalibrationCollectionInFindFirst:
# explicit list will raise if there are temporal dimensions
with self.assertRaises(NotImplementedError):
registry.queryDataIds(
["instrument", "detector", "exposure"], datasets="bias", collections=coll_list
).count()

# chain will skip
datasets = list(registry.queryDatasets("bias", collections=chain))
Expand Down Expand Up @@ -3008,7 +3017,7 @@ def testQueryResultSummaries(self):
]
with self.assertRaises(MissingDatasetTypeError):
# Dataset type name doesn't match any existing dataset types.
registry.queryDataIds(["detector"], datasets=["nonexistent"], collections=...)
list(registry.queryDataIds(["detector"], datasets=["nonexistent"], collections=...))
with self.assertRaises(MissingDatasetTypeError):
# Dataset type name doesn't match any existing dataset types.
registry.queryDimensionRecords("detector", datasets=["nonexistent"], collections=...).any()
Expand Down Expand Up @@ -3138,12 +3147,20 @@ def testQueryDataIdsExpressionError(self):
registry = self.makeRegistry()
self.loadData(registry, "base.yaml")
bind = {"time": astropy.time.Time("2020-01-01T01:00:00", format="isot", scale="tai")}
with self.assertRaisesRegex(LookupError, r"No dimension element with name 'foo' in 'foo\.bar'\."):
registry.queryDataIds(["detector"], where="foo.bar = 12")
# The diagnostics raised are slightly different between the old query
# system (ValueError, first error string) and the new query system
# (InvalidQueryError, second error string).
with self.assertRaisesRegex(
LookupError, "Dimension element name cannot be inferred in this context."
(LookupError, InvalidQueryError),
r"(No dimension element with name 'foo' in 'foo\.bar'\.)|(Unrecognized identifier 'foo.bar')",
):
registry.queryDataIds(["detector"], where="timespan.end < time", bind=bind)
list(registry.queryDataIds(["detector"], where="foo.bar = 12"))
with self.assertRaisesRegex(
(LookupError, InvalidQueryError),
"(Dimension element name cannot be inferred in this context.)"
"|(Unrecognized identifier 'timespan')",
):
list(registry.queryDataIds(["detector"], where="timespan.end < time", bind=bind))

def testQueryDataIdsOrderBy(self):
"""Test order_by and limit on result returned by queryDataIds()."""
Expand Down Expand Up @@ -3229,42 +3246,67 @@ def do_query(dimensions=("visit", "tract"), datasets=None, collections=None):
with query.materialize():
pass

# errors in a name
# Test exceptions for errors in a name.
# Many of these raise slightly different diagnostics in the old query
# system (ValueError, first error string) than the new query system
# (InvalidQueryError, second error string).
for order_by in ("", "-"):
with self.assertRaisesRegex(ValueError, "Empty dimension name in ORDER BY"):
with self.assertRaisesRegex((ValueError, InvalidQueryError), "Empty dimension name in ORDER BY"):
list(do_query().order_by(order_by))

for order_by in ("undimension.name", "-undimension.name"):
with self.assertRaisesRegex(ValueError, "Unknown dimension element 'undimension'"):
with self.assertRaisesRegex(
(ValueError, InvalidQueryError),
"(Unknown dimension element 'undimension')|(Unrecognized identifier 'undimension.name')",
):
list(do_query().order_by(order_by))

for order_by in ("attract", "-attract"):
with self.assertRaisesRegex(ValueError, "Metadata 'attract' cannot be found in any dimension"):
with self.assertRaisesRegex(
(ValueError, InvalidQueryError),
"(Metadata 'attract' cannot be found in any dimension)|(Unrecognized identifier 'attract')",
):
list(do_query().order_by(order_by))

with self.assertRaisesRegex(ValueError, "Metadata 'exposure_time' exists in more than one dimension"):
with self.assertRaisesRegex(
(ValueError, InvalidQueryError),
"(Metadata 'exposure_time' exists in more than one dimension)"
"|(Ambiguous identifier 'exposure_time' matches multiple fields)",
):
list(do_query(("exposure", "visit")).order_by("exposure_time"))

with self.assertRaisesRegex(
ValueError,
r"Timespan exists in more than one dimension element \(day_obs, exposure, visit\); "
r"qualify timespan with specific dimension name\.",
(ValueError, InvalidQueryError),
r"(Timespan exists in more than one dimension element \(day_obs, exposure, visit\); "
r"qualify timespan with specific dimension name\.)|"
r"(Ambiguous identifier 'timespan' matches multiple fields)",
):
list(do_query(("exposure", "visit")).order_by("timespan.begin"))

with self.assertRaisesRegex(
ValueError, "Cannot find any temporal dimension element for 'timespan.begin'"
(ValueError, InvalidQueryError),
"(Cannot find any temporal dimension element for 'timespan.begin')"
"|(Unrecognized identifier 'timespan')",
):
list(do_query("tract").order_by("timespan.begin"))

with self.assertRaisesRegex(ValueError, "Cannot use 'timespan.begin' with non-temporal element"):
with self.assertRaisesRegex(
(ValueError, InvalidQueryError),
"(Cannot use 'timespan.begin' with non-temporal element)"
"|(Unrecognized field 'timespan' for tract)",
):
list(do_query("tract").order_by("tract.timespan.begin"))

with self.assertRaisesRegex(ValueError, "Field 'name' does not exist in 'tract'."):
with self.assertRaisesRegex(
(ValueError, InvalidQueryError),
"(Field 'name' does not exist in 'tract')" "|(Unrecognized field 'name' for tract.)",
):
list(do_query("tract").order_by("tract.name"))

with self.assertRaisesRegex(
ValueError, r"Unknown dimension element 'timestamp'; perhaps you meant 'timespan.begin'\?"
(ValueError, InvalidQueryError),
r"(Unknown dimension element 'timestamp'; perhaps you meant 'timespan.begin'\?)"
r"|(Unrecognized identifier 'timestamp.begin')",
):
list(do_query("visit").order_by("timestamp.begin"))

Expand Down
4 changes: 4 additions & 0 deletions python/lsst/daf/butler/remote_butler/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
Registry,
RegistryDefaults,
)
from ..registry._exceptions import ArgumentError
from ..registry.queries import (
ChainedDatasetQueryResults,
DataCoordinateQueryResults,
Expand Down Expand Up @@ -448,6 +449,9 @@ def queryDataIds(
check: bool = True,
**kwargs: Any,
) -> DataCoordinateQueryResults:
if collections is not None and datasets is None:
raise ArgumentError(f"Cannot pass 'collections' (='{collections}') without 'datasets'.")

dimensions = self.dimensions.conform(dimensions)
args = self._convert_common_query_arguments(
dataId=dataId, where=where, bind=bind, kwargs=kwargs, datasets=datasets, collections=collections
Expand Down
52 changes: 31 additions & 21 deletions python/lsst/daf/butler/tests/hybrid_butler_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from __future__ import annotations

import contextlib
from collections.abc import Iterable, Iterator, Mapping, Sequence
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
from typing import Any, cast

from .._dataset_association import DatasetAssociation
Expand Down Expand Up @@ -312,17 +312,6 @@ def queryDataIds(
check: bool = True,
**kwargs: Any,
) -> DataCoordinateQueryResults:
direct = self._direct.queryDataIds(
dimensions,
dataId=dataId,
datasets=datasets,
collections=collections,
where=where,
bind=bind,
check=check,
**kwargs,
)

remote = self._remote.queryDataIds(
dimensions,
dataId=dataId,
Expand All @@ -334,8 +323,25 @@ def queryDataIds(
**kwargs,
)

# Defer creation of the DirectButler version until we really need the
# object for handling an unimplemented method. This avoids masking of
# missing exception handling in the RemoteButler side -- otherwise
# exceptions from DirectButler would cause tests to pass.
def create_direct_result() -> DataCoordinateQueryResults:
return self._direct.queryDataIds(
dimensions,
dataId=dataId,
datasets=datasets,
collections=collections,
where=where,
bind=bind,
check=check,
**kwargs,
)

return cast(
DataCoordinateQueryResults, _HybridDataCoordinateQueryResults(direct=direct, remote=remote)
DataCoordinateQueryResults,
_HybridDataCoordinateQueryResults(direct=create_direct_result, remote=remote),
)

def queryDimensionRecords(
Expand Down Expand Up @@ -388,7 +394,9 @@ class _HybridDataCoordinateQueryResults:
provide a few methods that aren't implemented yet.
"""

def __init__(self, *, direct: DataCoordinateQueryResults, remote: DataCoordinateQueryResults) -> None:
def __init__(
self, *, direct: Callable[[], DataCoordinateQueryResults], remote: DataCoordinateQueryResults
) -> None:
self._direct = direct
self._remote = remote

Expand All @@ -401,20 +409,20 @@ def __iter__(self) -> Iterator[DataCoordinate]:

def order_by(self, *args: str) -> _HybridDataCoordinateQueryResults:
return _HybridDataCoordinateQueryResults(
direct=self._direct.order_by(*args), remote=self._remote.order_by(*args)
direct=lambda: self._direct().order_by(*args), remote=self._remote.order_by(*args)
)

def limit(self, limit: int, offset: int | None = 0) -> _HybridDataCoordinateQueryResults:
return _HybridDataCoordinateQueryResults(
direct=self._direct.limit(limit, offset), remote=self._remote.limit(limit, offset)
direct=lambda: self._direct().limit(limit, offset), remote=self._remote.limit(limit, offset)
)

def materialize(self) -> contextlib.AbstractContextManager[DataCoordinateQueryResults]:
return self._direct.materialize()
return self._direct().materialize()

def expanded(self) -> _HybridDataCoordinateQueryResults:
return _HybridDataCoordinateQueryResults(
remote=self._remote.expanded(), direct=self._direct.expanded()
remote=self._remote.expanded(), direct=lambda: self._direct().expanded()
)

def subset(
Expand All @@ -424,7 +432,7 @@ def subset(
unique: bool = False,
) -> _HybridDataCoordinateQueryResults:
return _HybridDataCoordinateQueryResults(
direct=self._direct.subset(dimensions, unique=unique),
direct=lambda: self._direct().subset(dimensions, unique=unique),
remote=self._remote.subset(dimensions, unique=unique),
)

Expand All @@ -436,7 +444,9 @@ def findDatasets(
findFirst: bool = True,
components: bool = False,
) -> ParentDatasetQueryResults:
return self._direct.findDatasets(datasetType, collections, findFirst=findFirst, components=components)
return self._direct().findDatasets(
datasetType, collections, findFirst=findFirst, components=components
)

def findRelatedDatasets(
self,
Expand All @@ -446,6 +456,6 @@ def findRelatedDatasets(
findFirst: bool = True,
dimensions: DimensionGroup | Iterable[str] | None = None,
) -> Iterable[tuple[DataCoordinate, DatasetRef]]:
return self._direct.findRelatedDatasets(
return self._direct().findRelatedDatasets(
datasetType, collections, findFirst=findFirst, dimensions=dimensions
)

0 comments on commit c108883

Please sign in to comment.