From 0176c38ba82a876c2e8737a74a282bec76a3578d Mon Sep 17 00:00:00 2001
From: "David H. Irving"
Date: Mon, 26 Feb 2024 10:35:24 -0700
Subject: [PATCH 1/5] Handle more collection inputs in RemoteButler

RemoteButler now passes glob patterns through to the server when they
are given as a collections parameter, instead of silently discarding
them. It now also handles "..." to mean "all collections". Explicit
exceptions are now raised for regular expressions, which will not be
supported in the future.
---
 .../butler/remote_butler/_collection_args.py  | 57 +++++++++++++++++++
 .../butler/remote_butler/_remote_butler.py    | 24 ++++----
 .../daf/butler/remote_butler/server_models.py |  2 +
 3 files changed, 70 insertions(+), 13 deletions(-)
 create mode 100644 python/lsst/daf/butler/remote_butler/_collection_args.py

diff --git a/python/lsst/daf/butler/remote_butler/_collection_args.py b/python/lsst/daf/butler/remote_butler/_collection_args.py
new file mode 100644
index 0000000000..364a174f8a
--- /dev/null
+++ b/python/lsst/daf/butler/remote_butler/_collection_args.py
@@ -0,0 +1,57 @@
+# This file is part of daf_butler.
+#
+# Developed for the LSST Data Management System.
+# This product includes software developed by the LSST Project
+# (http://www.lsst.org).
+# See the COPYRIGHT file at the top-level directory of this distribution
+# for details of code ownership.
+#
+# This software is dual licensed under the GNU General Public License and also
+# under a 3-clause BSD license. Recipients may choose which of these licenses
+# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
+# respectively. If you choose the GPL option then the following text applies
+# (but note that there is still no warranty even if you opt for BSD instead):
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import annotations
+
+import re
+from typing import cast
+
+from ..registry import CollectionArgType
+from ..registry.wildcards import CollectionWildcard
+from .server_models import CollectionList
+
+
+def convert_collection_arg_to_glob_string_list(arg: CollectionArgType) -> CollectionList:
+    if arg is ...:
+        return CollectionList(["*"])
+    elif isinstance(arg, str):
+        return CollectionList([arg])
+    elif isinstance(arg, re.Pattern):
+        raise TypeError("RemoteButler does not support regular expressions as search patterns")
+    elif isinstance(arg, CollectionWildcard):
+        # In theory this could work, but CollectionWildcard eagerly converts
+        # any glob strings to regexes. So it would need some rework to
+        # preserve the original string inputs. CollectionWildcard has likely
+        # never been used in the wild by an end-user so this probably isn't
+        # worth figuring out.
+ raise TypeError("RemoteButler does not accept CollectionWildcard instances as search patterns") + else: + search = list(arg) + for item in search: + if not isinstance(item, str): + raise TypeError("RemoteButler only accepts strings and lists of strings as search patterns") + return CollectionList(cast(list[str], search)) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index ed8b8afe29..aef303fc9d 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -60,9 +60,15 @@ from .._utilities.locked_object import LockedObject from ..datastore import DatasetRefURIs from ..dimensions import DataCoordinate, DataIdValue, DimensionConfig, DimensionUniverse, SerializedDataId -from ..registry import MissingDatasetTypeError, NoDefaultCollectionError, Registry, RegistryDefaults -from ..registry.wildcards import CollectionWildcard +from ..registry import ( + CollectionArgType, + MissingDatasetTypeError, + NoDefaultCollectionError, + Registry, + RegistryDefaults, +) from ._authentication import get_authentication_headers +from ._collection_args import convert_collection_arg_to_glob_string_list from .server_models import ( CLIENT_REQUEST_ID_HEADER_NAME, CollectionList, @@ -78,15 +84,11 @@ from .._query import Query from .._timespan import Timespan from ..dimensions import DataId, DimensionGroup, DimensionRecord - from ..registry import CollectionArgType from ..transfers import RepoExportContext _AnyPydanticModel = TypeVar("_AnyPydanticModel", bound=BaseModel) """Generic type variable that accepts any Pydantic model class.""" -_InputCollectionList = str | Sequence[str] | None -"""The possible types of the ``collections`` parameter of most Butler methods. -""" _SERIALIZED_DATA_ID_TYPE_ADAPTER = TypeAdapter(SerializedDataId) @@ -299,7 +301,7 @@ def _get_file_info( self, datasetRefOrType: DatasetRef | DatasetType | str, dataId: DataId | None, - collections: _InputCollectionList, + collections: CollectionArgType, kwargs: dict[str, DataIdValue], ) -> GetFileResponseModel: """Send a request to the server for the file URLs and metadata @@ -713,7 +715,7 @@ def _parse_model(self, response: httpx.Response, model: type[_AnyPydanticModel]) """Deserialize a Pydantic model from the body of an HTTP response.""" return model.model_validate_json(response.content) - def _normalize_collections(self, collections: _InputCollectionList) -> CollectionList: + def _normalize_collections(self, collections: CollectionArgType | None) -> CollectionList: """Convert the ``collections`` parameter in the format used by Butler methods to a standardized format for the REST API. """ @@ -723,11 +725,7 @@ def _normalize_collections(self, collections: _InputCollectionList) -> Collectio "No collections provided, and no defaults from butler construction." ) collections = self.collections - # Temporary hack. Assume strings for collections. In future - # want to construct CollectionWildcard and filter it through collection - # cache to generate list of collection names. 
- wildcards = CollectionWildcard.from_expression(collections) - return CollectionList(list(wildcards.strings)) + return convert_collection_arg_to_glob_string_list(collections) def _normalize_dataset_type_name(self, datasetTypeOrName: DatasetType | str) -> DatasetTypeName: """Convert DatasetType parameters in the format used by Butler methods diff --git a/python/lsst/daf/butler/remote_butler/server_models.py b/python/lsst/daf/butler/remote_butler/server_models.py index 07474543a1..6d1c431a8b 100644 --- a/python/lsst/daf/butler/remote_butler/server_models.py +++ b/python/lsst/daf/butler/remote_butler/server_models.py @@ -44,6 +44,8 @@ CLIENT_REQUEST_ID_HEADER_NAME = "X-Butler-Client-Request-Id" CollectionList = NewType("CollectionList", list[str]) +"""A list of search patterns for collection names. May use glob +syntax to specify wildcards.""" DatasetTypeName = NewType("DatasetTypeName", str) From f862cc6e0f856d068dadf42e56248b427575eee5 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Mon, 26 Feb 2024 11:27:55 -0700 Subject: [PATCH 2/5] Allow returning None from RemoteButler.find_dataset Fix an issue where RemoteButler would throw validation errors instead of allowing 'None' to be returned from find_dataset. --- .../butler/remote_butler/_remote_butler.py | 13 ++++++++----- .../server/handlers/_external.py | 19 +++++++------------ .../daf/butler/remote_butler/server_models.py | 11 +++++++++-- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index aef303fc9d..dd18352140 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -73,7 +73,8 @@ CLIENT_REQUEST_ID_HEADER_NAME, CollectionList, DatasetTypeName, - FindDatasetModel, + FindDatasetRequestModel, + FindDatasetResponseModel, GetFileByDataIdRequestModel, GetFileResponseModel, ) @@ -417,7 +418,7 @@ def find_dataset( dataset_type = self._normalize_dataset_type_name(dataset_type) - query = FindDatasetModel( + query = FindDatasetRequestModel( data_id=self._simplify_dataId(data_id, kwargs), collections=self._normalize_collections(collections), dimension_records=dimension_records, @@ -427,9 +428,11 @@ def find_dataset( path = f"find_dataset/{dataset_type}" response = self._post(path, query) - ref = DatasetRef.from_simple( - self._parse_model(response, SerializedDatasetRef), universe=self.dimensions - ) + model = self._parse_model(response, FindDatasetResponseModel) + if model.dataset_ref is None: + return None + + ref = DatasetRef.from_simple(model.dataset_ref, universe=self.dimensions) if storage_class is not None: ref = ref.overrideStorageClass(storage_class) return ref diff --git a/python/lsst/daf/butler/remote_butler/server/handlers/_external.py b/python/lsst/daf/butler/remote_butler/server/handlers/_external.py index c745ee2c46..6f1153dbed 100644 --- a/python/lsst/daf/butler/remote_butler/server/handlers/_external.py +++ b/python/lsst/daf/butler/remote_butler/server/handlers/_external.py @@ -33,7 +33,8 @@ from fastapi import APIRouter, Depends from lsst.daf.butler import Butler, DatasetRef, SerializedDatasetRef, SerializedDatasetType from lsst.daf.butler.remote_butler.server_models import ( - FindDatasetModel, + FindDatasetRequestModel, + FindDatasetResponseModel, GetFileByDataIdRequestModel, GetFileResponseModel, ) @@ -122,32 +123,26 @@ def get_dataset( # Not yet supported: TimeSpan is not yet a pydantic model. 
-# collections parameter assumes client-side has resolved regexes. @external_router.post( "/v1/find_dataset/{dataset_type}", summary="Retrieve this dataset definition from collection, dataset type, and dataId", - response_model=SerializedDatasetRef, - response_model_exclude_unset=True, - response_model_exclude_defaults=True, - response_model_exclude_none=True, ) def find_dataset( dataset_type: str, - query: FindDatasetModel, + query: FindDatasetRequestModel, factory: Factory = Depends(factory_dependency), -) -> SerializedDatasetRef | None: - collection_query = query.collections if query.collections else None - +) -> FindDatasetResponseModel: butler = factory.create_butler() ref = butler.find_dataset( dataset_type, query.data_id, - collections=collection_query, + collections=query.collections, timespan=None, dimension_records=query.dimension_records, datastore_records=query.datastore_records, ) - return ref.to_simple() if ref else None + serialized_ref = ref.to_simple() if ref else None + return FindDatasetResponseModel(dataset_ref=serialized_ref) @external_router.get( diff --git a/python/lsst/daf/butler/remote_butler/server_models.py b/python/lsst/daf/butler/remote_butler/server_models.py index 6d1c431a8b..53dff78165 100644 --- a/python/lsst/daf/butler/remote_butler/server_models.py +++ b/python/lsst/daf/butler/remote_butler/server_models.py @@ -31,7 +31,8 @@ "CLIENT_REQUEST_ID_HEADER_NAME", "CollectionList", "DatasetTypeName", - "FindDatasetModel", + "FindDatasetRequestModel", + "FindDatasetResponseModel", "GetFileResponseModel", ] @@ -49,7 +50,7 @@ DatasetTypeName = NewType("DatasetTypeName", str) -class FindDatasetModel(pydantic.BaseModel): +class FindDatasetRequestModel(pydantic.BaseModel): """Request model for find_dataset.""" data_id: SerializedDataId @@ -58,6 +59,12 @@ class FindDatasetModel(pydantic.BaseModel): datastore_records: bool = False +class FindDatasetResponseModel(pydantic.BaseModel): + """Response model for find_dataset.""" + + dataset_ref: SerializedDatasetRef | None + + class GetFileByDataIdRequestModel(pydantic.BaseModel): """Request model for ``get_file_by_data_id``.""" From ec1976216635d91e02035653fec5ed7468b4c0a6 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Mon, 26 Feb 2024 12:07:19 -0700 Subject: [PATCH 3/5] Handle storage classes in RemoteButler.find_dataset Handle propagation of storage classes from the input DatasetType to the output DatasetRef in RemoteButler.find_dataset, to match DirectButler behavior. 
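
As an illustrative sketch of the intended semantics (not part of the
change itself; the dataset type, collection, and storage class names,
and the data ID values, are all hypothetical):

    # A storage class override on the input DatasetType propagates to
    # the returned ref, matching DirectButler:
    dataset_type = butler.get_dataset_type("my_dataset").overrideStorageClass("ImageF")
    ref = butler.find_dataset(dataset_type, collections="my/run", instrument="MyCam", detector=1)
    # ref.datasetType.storageClass_name == "ImageF" (when a match is found)

    # An explicit storage_class argument takes precedence over any
    # override carried by the input DatasetType:
    ref = butler.find_dataset(dataset_type, storage_class="NumpyArray",
                              collections="my/run", instrument="MyCam", detector=1)
    # ref.datasetType.storageClass_name == "NumpyArray"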
--- .../butler/datastores/fileDatastoreClient.py | 18 +----- .../butler/remote_butler/_remote_butler.py | 56 +++++++++++++------ 2 files changed, 39 insertions(+), 35 deletions(-) diff --git a/python/lsst/daf/butler/datastores/fileDatastoreClient.py b/python/lsst/daf/butler/datastores/fileDatastoreClient.py index 5fcc8e4780..f821fe01d3 100644 --- a/python/lsst/daf/butler/datastores/fileDatastoreClient.py +++ b/python/lsst/daf/butler/datastores/fileDatastoreClient.py @@ -3,7 +3,7 @@ from typing import Any, Literal import pydantic -from lsst.daf.butler import DatasetRef, Location, StorageClass +from lsst.daf.butler import DatasetRef, Location from lsst.daf.butler.datastore.cache_manager import DatastoreDisabledCacheManager from lsst.daf.butler.datastore.stored_file_info import SerializedStoredFileInfo, StoredFileInfo from lsst.daf.butler.datastores.file_datastore.get import ( @@ -47,8 +47,6 @@ def get_dataset_as_python_object( payload: FileDatastoreGetPayload, *, parameters: Mapping[str, Any] | None, - storageClass: StorageClass | str | None, - component: str | None, ) -> Any: """Retrieve an artifact from storage and return it as a Python object. @@ -62,12 +60,6 @@ def get_dataset_as_python_object( parameters : `Mapping`[`str`, `typing.Any`] `StorageClass` and `Formatter` parameters to be used when converting the artifact to a Python object. - storageClass : `StorageClass` | `str` | `None` - Overrides the `StorageClass` to be used when converting the artifact to - a Python object. If `None`, uses the `StorageClass` specified by - ``payload``. - component : `str` | `None` - Selects which component of the artifact to retrieve. Returns ------- @@ -79,14 +71,6 @@ def get_dataset_as_python_object( for file_info in payload.file_info ] - # If we have both a component override and a storage class override, the - # component override has to be applied first. DatasetRef cares because it - # is checking compatibility of the storage class with its DatasetType. - if component is not None: - ref = ref.makeComponentRef(component) - if storageClass is not None: - ref = ref.overrideStorageClass(storageClass) - datastore_file_info = generate_datastore_get_information( fileLocations, ref=ref, diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index dd18352140..aa1870322b 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -274,28 +274,20 @@ def get( # Docstring inherited. model = self._get_file_info(datasetRefOrType, dataId, collections, kwargs) - # If the caller provided a DatasetRef or DatasetType, they may have - # overridden the storage class on it. We need to respect this, if they - # haven't asked to re-override it. - explicitDatasetType = _extract_dataset_type(datasetRefOrType) - if explicitDatasetType is not None: - if storageClass is None: - storageClass = explicitDatasetType.storageClass - + ref = DatasetRef.from_simple(model.dataset_ref, universe=self.dimensions) # If the caller provided a DatasetRef, they may have overridden the # component on it. We need to explicitly handle this because we did # not send the DatasetType to the server in this case. 
- componentOverride = None if isinstance(datasetRefOrType, DatasetRef): componentOverride = datasetRefOrType.datasetType.component() + if componentOverride: + ref = ref.makeComponentRef(componentOverride) + ref = _apply_storage_class_override(ref, datasetRefOrType, storageClass) - ref = DatasetRef.from_simple(model.dataset_ref, universe=self.dimensions) return get_dataset_as_python_object( ref, _to_file_payload(model), parameters=parameters, - storageClass=storageClass, - component=componentOverride, ) def _get_file_info( @@ -416,8 +408,6 @@ def find_dataset( if timespan: raise ValueError("Timespan can not yet be used in butler client/server.") - dataset_type = self._normalize_dataset_type_name(dataset_type) - query = FindDatasetRequestModel( data_id=self._simplify_dataId(data_id, kwargs), collections=self._normalize_collections(collections), @@ -425,7 +415,8 @@ def find_dataset( datastore_records=datastore_records, ) - path = f"find_dataset/{dataset_type}" + dataset_type_name = self._normalize_dataset_type_name(dataset_type) + path = f"find_dataset/{dataset_type_name}" response = self._post(path, query) model = self._parse_model(response, FindDatasetResponseModel) @@ -433,9 +424,7 @@ def find_dataset( return None ref = DatasetRef.from_simple(model.dataset_ref, universe=self.dimensions) - if storage_class is not None: - ref = ref.overrideStorageClass(storage_class) - return ref + return _apply_storage_class_override(ref, dataset_type, storage_class) def retrieveArtifacts( self, @@ -777,6 +766,37 @@ def _extract_dataset_type(datasetRefOrType: DatasetRef | DatasetType | str) -> D return None +def _apply_storage_class_override( + ref: DatasetRef, + original_dataset_ref_or_type: DatasetRef | DatasetType | str, + explicit_storage_class: StorageClass | str | None, +) -> DatasetRef: + """Return a DatasetRef with its storage class overridden to match the + StorageClass supplied by the user as input to one of the search functions. + + Parameters + ---------- + ref : `DatasetRef` + The ref to which we will apply the StorageClass override. + original_dataset_ref_or_type : `DatasetRef` | `DatasetType` | `str` + The ref or type that was input to the search, which may contain a + storage class override. + explicit_storage_class : `StorageClass` | `str` | `None` + A storage class that the user explicitly requested as an override. + """ + if explicit_storage_class is not None: + return ref.overrideStorageClass(explicit_storage_class) + + # If the caller provided a DatasetRef or DatasetType, they may have + # overridden the storage class on it, and we need to propagate that to the + # output. + dataset_type = _extract_dataset_type(original_dataset_ref_or_type) + if dataset_type is not None: + return ref.overrideStorageClass(dataset_type.storageClass) + + return ref + + def _to_file_payload(get_file_response: GetFileResponseModel) -> FileDatastoreGetPayload: if get_file_response.artifact is None: ref = get_file_response.dataset_ref From ce12d41859f9a62be8d2ac145133f7cb62e626c0 Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Mon, 26 Feb 2024 14:14:28 -0700 Subject: [PATCH 4/5] Support timespan in RemoteButler.find_dataset Add support for the timespan parameter to RemoteButler.find_dataset. 
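
For illustration, the wire format round-trips through
Timespan.to_simple/from_simple as a two-element [begin, end] list of
nanoseconds since the epoch (the specific times below are arbitrary):

    import astropy.time

    from lsst.daf.butler import Timespan

    begin = astropy.time.Time("2024-02-26T00:00:00", scale="tai")
    end = astropy.time.Time("2024-02-27T00:00:00", scale="tai")
    timespan = Timespan(begin, end)
    simple = timespan.to_simple()  # [begin_nsec, end_nsec]
    assert Timespan.from_simple(simple) == timespan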
--- python/lsst/daf/butler/_timespan.py | 17 +++++++++++++---- .../daf/butler/remote_butler/_remote_butler.py | 3 +-- .../remote_butler/server/handlers/_external.py | 5 +++-- .../daf/butler/remote_butler/server_models.py | 3 ++- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/python/lsst/daf/butler/_timespan.py b/python/lsst/daf/butler/_timespan.py index 095c50a0ab..1d5518b898 100644 --- a/python/lsst/daf/butler/_timespan.py +++ b/python/lsst/daf/butler/_timespan.py @@ -26,16 +26,20 @@ # along with this program. If not, see . from __future__ import annotations -__all__ = ("Timespan",) +__all__ = ( + "SerializedTimespan", + "Timespan", +) import enum import warnings from collections.abc import Generator -from typing import TYPE_CHECKING, Any, ClassVar, Union +from typing import TYPE_CHECKING, Annotated, Any, ClassVar, Union import astropy.time import astropy.utils.exceptions import yaml +from pydantic import Field # As of astropy 4.2, the erfa interface is shipped independently and # ErfaWarning is no longer an AstropyWarning @@ -69,6 +73,11 @@ class _SpecialTimespanBound(enum.Enum): TimespanBound = Union[astropy.time.Time, _SpecialTimespanBound, None] +SerializedTimespan = Annotated[list[int], Field(min_length=2, max_length=2)] +"""JSON-serializable representation of the Timespan class, as a list of two +integers ``[begin, end]`` in nanoseconds since the epoch. +""" + class Timespan: """A half-open time interval with nanosecond precision. @@ -483,7 +492,7 @@ def difference(self, other: Timespan) -> Generator[Timespan, None, None]: if intersection._nsec[1] < self._nsec[1]: yield Timespan(None, None, _nsec=(intersection._nsec[1], self._nsec[1])) - def to_simple(self, minimal: bool = False) -> list[int]: + def to_simple(self, minimal: bool = False) -> SerializedTimespan: """Return simple python type form suitable for serialization. 
         Parameters
@@ -502,7 +511,7 @@
     @classmethod
     def from_simple(
         cls,
-        simple: list[int] | None,
+        simple: SerializedTimespan | None,
         universe: DimensionUniverse | None = None,
         registry: Registry | None = None,
     ) -> Timespan | None:
diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py
index aa1870322b..1352e41dbb 100644
--- a/python/lsst/daf/butler/remote_butler/_remote_butler.py
+++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py
@@ -405,12 +405,11 @@
     ) -> DatasetRef | None:
         if datastore_records:
             raise ValueError("Datastore records can not yet be returned in client/server butler.")
-        if timespan:
-            raise ValueError("Timespan can not yet be used in butler client/server.")

         query = FindDatasetRequestModel(
             data_id=self._simplify_dataId(data_id, kwargs),
             collections=self._normalize_collections(collections),
+            timespan=timespan.to_simple() if timespan is not None else None,
             dimension_records=dimension_records,
             datastore_records=datastore_records,
         )
diff --git a/python/lsst/daf/butler/remote_butler/server/handlers/_external.py b/python/lsst/daf/butler/remote_butler/server/handlers/_external.py
index 6f1153dbed..2c86cf0cf2 100644
--- a/python/lsst/daf/butler/remote_butler/server/handlers/_external.py
+++ b/python/lsst/daf/butler/remote_butler/server/handlers/_external.py
@@ -31,7 +31,7 @@
 from typing import Any

 from fastapi import APIRouter, Depends
-from lsst.daf.butler import Butler, DatasetRef, SerializedDatasetRef, SerializedDatasetType
+from lsst.daf.butler import Butler, DatasetRef, SerializedDatasetRef, SerializedDatasetType, Timespan
 from lsst.daf.butler.remote_butler.server_models import (
     FindDatasetRequestModel,
     FindDatasetResponseModel,
@@ -133,11 +133,12 @@ def find_dataset(
     factory: Factory = Depends(factory_dependency),
 ) -> FindDatasetResponseModel:
     butler = factory.create_butler()
+    timespan = Timespan.from_simple(query.timespan) if query.timespan is not None else None
     ref = butler.find_dataset(
         dataset_type,
         query.data_id,
         collections=query.collections,
-        timespan=None,
+        timespan=timespan,
         dimension_records=query.dimension_records,
         datastore_records=query.datastore_records,
     )
diff --git a/python/lsst/daf/butler/remote_butler/server_models.py b/python/lsst/daf/butler/remote_butler/server_models.py
index 53dff78165..c32b223c0f 100644
--- a/python/lsst/daf/butler/remote_butler/server_models.py
+++ b/python/lsst/daf/butler/remote_butler/server_models.py
@@ -39,7 +39,7 @@
 from typing import NewType

 import pydantic
-from lsst.daf.butler import SerializedDataId, SerializedDatasetRef
+from lsst.daf.butler import SerializedDataId, SerializedDatasetRef, SerializedTimespan
 from lsst.daf.butler.datastores.fileDatastoreClient import FileDatastoreGetPayload
@@ -55,6 +55,7 @@ class FindDatasetRequestModel(pydantic.BaseModel):

     data_id: SerializedDataId
     collections: CollectionList
+    timespan: SerializedTimespan | None
     dimension_records: bool = False
     datastore_records: bool = False

From b7a08837a4908979b5bd03a2b475b845a009bebc Mon Sep 17 00:00:00 2001
From: "David H. Irving"
Date: Mon, 26 Feb 2024 12:31:54 -0700
Subject: [PATCH 5/5] Implement RemoteButler.registry.findDataset

Add a method in RemoteButler's registry shim for findDataset, and run
registry tests against it.
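
For illustration, the component check builds on the existing
DatasetType.splitDatasetTypeName helper (the dataset type and
collection names below are hypothetical):

    from lsst.daf.butler import DatasetType

    parent, component = DatasetType.splitDatasetTypeName("my_dataset.wcs")
    # parent == "my_dataset", component == "wcs"

    # The shim therefore rejects component dataset types up front:
    butler.registry.findDataset("my_dataset.wcs", collections="my/run")
    # raises DatasetTypeError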
---
 .../butler/remote_butler/_collection_args.py  | 19 ++++++++
 .../daf/butler/remote_butler/_registry.py     | 48 +++++++++++++++++--
 .../server/handlers/_external.py              |  1 -
 .../butler/tests/hybrid_butler_registry.py    |  2 +-
 4 files changed, 63 insertions(+), 7 deletions(-)

diff --git a/python/lsst/daf/butler/remote_butler/_collection_args.py b/python/lsst/daf/butler/remote_butler/_collection_args.py
index 364a174f8a..37588731be 100644
--- a/python/lsst/daf/butler/remote_butler/_collection_args.py
+++ b/python/lsst/daf/butler/remote_butler/_collection_args.py
@@ -36,6 +36,25 @@

 def convert_collection_arg_to_glob_string_list(arg: CollectionArgType) -> CollectionList:
+    """Convert the collections argument used by many Butler/registry methods
+    to a format suitable for sending to the Butler server.
+
+    Parameters
+    ----------
+    arg : `CollectionArgType`
+        Collection search pattern in many possible formats.
+
+    Returns
+    -------
+    glob_list : `CollectionList`
+        Collection search patterns normalized to a list of glob strings.
+
+    Raises
+    ------
+    TypeError
+        If the search pattern provided by the user cannot be converted to a
+        glob string.
+    """
     if arg is ...:
diff --git a/python/lsst/daf/butler/remote_butler/_registry.py b/python/lsst/daf/butler/remote_butler/_registry.py
index 3e8d507f6d..7c87af5e12 100644
--- a/python/lsst/daf/butler/remote_butler/_registry.py
+++ b/python/lsst/daf/butler/remote_butler/_registry.py
@@ -47,9 +47,17 @@
     DimensionRecord,
     DimensionUniverse,
 )
-from ..registry import CollectionArgType, CollectionSummary, CollectionType, Registry, RegistryDefaults
+from ..registry import (
+    CollectionArgType,
+    CollectionSummary,
+    CollectionType,
+    DatasetTypeError,
+    Registry,
+    RegistryDefaults,
+)
 from ..registry.queries import DataCoordinateQueryResults, DatasetQueryResults, DimensionRecordQueryResults
 from ..remote_butler import RemoteButler
+from ._collection_args import convert_collection_arg_to_glob_string_list


 class RemoteButlerRegistry(Registry):
@@ -146,10 +154,29 @@ def findDataset(
         datastore_records: bool = False,
         **kwargs: Any,
     ) -> DatasetRef | None:
-        # There is an implementation of find_dataset on RemoteButler, but the
-        # definition of the collections parameter is incompatible and timespans
-        # aren't supported yet.
-        raise NotImplementedError()
+        # Components are supported by Butler.find_dataset, but are required to
+        # raise an exception in registry.findDataset (see
+        # RegistryTests.testComponentLookups). Apparently the registry version
+        # used to support components, but at some point a decision was made to
+        # draw a hard architectural boundary between registry and datastore so
+        # that registry is no longer allowed to know about components.
+        if _is_component_dataset_type(datasetType):
+            raise DatasetTypeError(
+                "Component dataset types are not supported;"
+                " use DatasetRef or DatasetType methods to obtain components from parents instead."
+ ) + + if collections is not None: + collections = convert_collection_arg_to_glob_string_list(collections) + + return self._butler.find_dataset( + datasetType, + dataId, + collections=collections, + timespan=timespan, + datastore_records=datastore_records, + **kwargs, + ) def insertDatasets( self, @@ -310,3 +337,14 @@ def storageClasses(self) -> StorageClassFactory: @storageClasses.setter def storageClasses(self, value: StorageClassFactory) -> None: raise NotImplementedError() + + +def _is_component_dataset_type(dataset_type: DatasetType | str) -> bool: + """Return true if the given dataset type refers to a component.""" + if isinstance(dataset_type, DatasetType): + return dataset_type.component() is not None + elif isinstance(dataset_type, str): + parent, component = DatasetType.splitDatasetTypeName(dataset_type) + return component is not None + else: + raise TypeError(f"Expected DatasetType or str, got {dataset_type!r}") diff --git a/python/lsst/daf/butler/remote_butler/server/handlers/_external.py b/python/lsst/daf/butler/remote_butler/server/handlers/_external.py index 2c86cf0cf2..5b3ccf8714 100644 --- a/python/lsst/daf/butler/remote_butler/server/handlers/_external.py +++ b/python/lsst/daf/butler/remote_butler/server/handlers/_external.py @@ -122,7 +122,6 @@ def get_dataset( return ref -# Not yet supported: TimeSpan is not yet a pydantic model. @external_router.post( "/v1/find_dataset/{dataset_type}", summary="Retrieve this dataset definition from collection, dataset type, and dataId", diff --git a/python/lsst/daf/butler/tests/hybrid_butler_registry.py b/python/lsst/daf/butler/tests/hybrid_butler_registry.py index 0826f184b2..6dae1a67ca 100644 --- a/python/lsst/daf/butler/tests/hybrid_butler_registry.py +++ b/python/lsst/daf/butler/tests/hybrid_butler_registry.py @@ -153,7 +153,7 @@ def findDataset( datastore_records: bool = False, **kwargs: Any, ) -> DatasetRef | None: - return self._direct.findDataset( + return self._remote.findDataset( datasetType, dataId, collections=collections,