From eb89aee10d4b048695ed359b5385dd9796049938 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 26 Oct 2023 15:13:00 -0700 Subject: [PATCH 01/13] Add DirectButler.get_dataset_type API This replaces butler.registry.getDatasetType --- python/lsst/daf/butler/__init__.py | 2 +- python/lsst/daf/butler/_butler.py | 26 +++++++++++++++++++ python/lsst/daf/butler/direct_butler.py | 15 ++++++----- python/lsst/daf/butler/script/ingest_files.py | 2 +- tests/test_butler.py | 6 ++--- tests/test_testRepo.py | 4 +-- 6 files changed, 42 insertions(+), 13 deletions(-) diff --git a/python/lsst/daf/butler/__init__.py b/python/lsst/daf/butler/__init__.py index c346ca7b87..c6ced36970 100644 --- a/python/lsst/daf/butler/__init__.py +++ b/python/lsst/daf/butler/__init__.py @@ -79,7 +79,7 @@ # Do not import or lift symbols from 'server' or 'server_models'. # Import the registry subpackage directly for other symbols. -from .registry import CollectionSearch, CollectionType, Registry, RegistryConfig +from .registry import CollectionSearch, CollectionType, MissingDatasetTypeError, Registry, RegistryConfig from .transfers import RepoExportContext, YamlRepoExportBackend, YamlRepoImportBackend from .version import * diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index f9f32fbbf3..fbfa19d770 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -772,6 +772,32 @@ def getURI( """ raise NotImplementedError() + @abstractmethod + def get_dataset_type(self, name: str) -> DatasetType: + """Get the `DatasetType`. + + Parameters + ---------- + name : `str` + Name of the type. + + Returns + ------- + type : `DatasetType` + The `DatasetType` associated with the given name. + + Raises + ------ + lsst.daf.butler.MissingDatasetTypeError + Raised if the requested dataset type has not been registered. + + Notes + ----- + This method handles component dataset types automatically, though most + other operations do not. + """ + raise NotImplementedError() + @abstractmethod def retrieveArtifacts( self, diff --git a/python/lsst/daf/butler/direct_butler.py b/python/lsst/daf/butler/direct_butler.py index 6e7ad20cb4..1b73561b8e 100644 --- a/python/lsst/daf/butler/direct_butler.py +++ b/python/lsst/daf/butler/direct_butler.py @@ -228,7 +228,7 @@ def __init__( def _retrieve_dataset_type(self, name: str) -> DatasetType | None: """Return DatasetType defined in registry given dataset type name.""" try: - return self._registry.getDatasetType(name) + return self.get_dataset_type(name) except MissingDatasetTypeError: return None @@ -369,11 +369,11 @@ def _standardizeArgs( if isinstance(datasetRefOrType, DatasetType): externalDatasetType = datasetRefOrType else: - internalDatasetType = self._registry.getDatasetType(datasetRefOrType) + internalDatasetType = self.get_dataset_type(datasetRefOrType) # Check that they are self-consistent if externalDatasetType is not None: - internalDatasetType = self._registry.getDatasetType(externalDatasetType.name) + internalDatasetType = self.get_dataset_type(externalDatasetType.name) if externalDatasetType != internalDatasetType: # We can allow differences if they are compatible, depending # on whether this is a get or a put. 
A get requires that @@ -1318,6 +1318,9 @@ def getURI( ) return primary + def get_dataset_type(self, name: str) -> DatasetType: + return self._registry.getDatasetType(name) + def retrieveArtifacts( self, refs: Iterable[DatasetRef], @@ -1877,7 +1880,7 @@ def transfer_from( newly_registered_dataset_types.add(datasetType) else: # If the dataset type is missing, let it fail immediately. - target_dataset_type = self._registry.getDatasetType(datasetType.name) + target_dataset_type = self.get_dataset_type(datasetType.name) if target_dataset_type != datasetType: raise ConflictingDefinitionError( "Source butler dataset type differs from definition" @@ -1994,7 +1997,7 @@ def validateConfiguration( ) -> None: # Docstring inherited. if datasetTypeNames: - datasetTypes = [self._registry.getDatasetType(name) for name in datasetTypeNames] + datasetTypes = [self.get_dataset_type(name) for name in datasetTypeNames] else: datasetTypes = list(self._registry.queryDatasetTypes()) @@ -2064,7 +2067,7 @@ def validateConfiguration( pass else: try: - self._registry.getDatasetType(key.name) + self.get_dataset_type(key.name) except KeyError: if logFailures: _LOG.critical( diff --git a/python/lsst/daf/butler/script/ingest_files.py b/python/lsst/daf/butler/script/ingest_files.py index e4e645229b..aa3f2b1aac 100644 --- a/python/lsst/daf/butler/script/ingest_files.py +++ b/python/lsst/daf/butler/script/ingest_files.py @@ -107,7 +107,7 @@ def ingest_files( # Create the butler with the relevant run attached. butler = Butler.from_config(repo, run=run) - datasetType = butler.registry.getDatasetType(dataset_type) + datasetType = butler.get_dataset_type(dataset_type) # Convert the k=v strings into a dataId dict. universe = butler.dimensions diff --git a/tests/test_butler.py b/tests/test_butler.py index e13d393b52..5b68ebfce5 100644 --- a/tests/test_butler.py +++ b/tests/test_butler.py @@ -822,7 +822,7 @@ def testPytypePutCoercion(self) -> None: # Check that the put still works if a DatasetType is given with # a definition matching this python type. - registry_type = butler.registry.getDatasetType(datasetTypeName) + registry_type = butler.get_dataset_type(datasetTypeName) this_type = DatasetType(datasetTypeName, registry_type.dimensions, "StructuredDataDictJson") metric2_ref = butler.put(test_dict, this_type, dataId=dataId, visit=425) self.assertEqual(metric2_ref.datasetType, registry_type) @@ -1708,7 +1708,7 @@ def testPytypeCoercion(self) -> None: metric = butler.get(datasetTypeName, dataId=dataId) self.assertEqual(get_full_type_name(metric), "lsst.daf.butler.tests.MetricsExample") - datasetType_ori = butler.registry.getDatasetType(datasetTypeName) + datasetType_ori = butler.get_dataset_type(datasetTypeName) self.assertEqual(datasetType_ori.storageClass.name, "StructuredDataNoComponents") # Now need to hack the registry dataset type definition. 
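
As a sketch of the migration these test updates exercise (the dataset type name is illustrative; any registered name works the same way):

    # before: reach through the registry
    dataset_type = butler.registry.getDatasetType("flat")
    # after: the Butler-level API added by this patch
    dataset_type = butler.get_dataset_type("flat")
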
@@ -1725,7 +1725,7 @@ def testPytypeCoercion(self) -> None: # Force reset of dataset type cache butler.registry.refresh() - datasetType_new = butler.registry.getDatasetType(datasetTypeName) + datasetType_new = butler.get_dataset_type(datasetTypeName) self.assertEqual(datasetType_new.name, datasetType_ori.name) self.assertEqual(datasetType_new.storageClass.name, "StructuredDataNoComponentsModel") diff --git a/tests/test_testRepo.py b/tests/test_testRepo.py index 71f40e7e6f..50c42283b7 100644 --- a/tests/test_testRepo.py +++ b/tests/test_testRepo.py @@ -176,8 +176,8 @@ def testAddDatasetType(self): # Testing the DatasetType objects is not practical, because all tests # need a DimensionUniverse. So just check that we have the dataset # types we expect. - self.butler.registry.getDatasetType("DataType1") - self.butler.registry.getDatasetType("DataType2") + self.butler.get_dataset_type("DataType1") + self.butler.get_dataset_type("DataType2") with self.assertRaises(ValueError): addDatasetType(self.butler, "DataType3", {"4thDimension"}, "NumpyArray") From 73472aa6f363479af8a1bf7638538e919d551d02 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 26 Oct 2023 15:39:42 -0700 Subject: [PATCH 02/13] Add get_dataset_type to RemoteButler --- .../butler/remote_butler/_remote_butler.py | 14 +++++++++----- .../butler/remote_butler/server/_server.py | 19 ++++++++++++++++++- tests/test_server.py | 9 ++++++++- 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index a9a0273618..bc0aa2d9f0 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -41,7 +41,7 @@ from .._config import Config from .._dataset_existence import DatasetExistence from .._dataset_ref import DatasetIdGenEnum, DatasetRef -from .._dataset_type import DatasetType +from .._dataset_type import DatasetType, SerializedDatasetType from .._deferredDatasetHandle import DeferredDatasetHandle from .._file_dataset import FileDataset from .._limited_butler import LimitedButler @@ -100,10 +100,6 @@ def dimensions(self) -> DimensionUniverse: self._dimensions = DimensionUniverse(config) return self._dimensions - def getDatasetType(self, name: str) -> DatasetType: - # Docstring inherited. - raise NotImplementedError() - def transaction(self) -> AbstractContextManager[None]: """Will always raise NotImplementedError. Transactions are not supported by RemoteButler. @@ -179,6 +175,14 @@ def getURI( # Docstring inherited. raise NotImplementedError() + def get_dataset_type(self, name: str) -> DatasetType: + # In future implementation this should directly access the cache + # and only go to the server if the dataset type is not known. 
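+        # For now this is always a round trip: fetch the serialized dataset
+        # type from the server and reconstruct it in our dimension universe.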
+ path = f"dataset_type/{name}" + response = self._client.get(self._get_url(path)) + response.raise_for_status() + return DatasetType.from_simple(SerializedDatasetType(**response.json()), universe=self.dimensions) + def retrieveArtifacts( self, refs: Iterable[DatasetRef], diff --git a/python/lsst/daf/butler/remote_butler/server/_server.py b/python/lsst/daf/butler/remote_butler/server/_server.py index 3be9348223..51bf01a4e6 100644 --- a/python/lsst/daf/butler/remote_butler/server/_server.py +++ b/python/lsst/daf/butler/remote_butler/server/_server.py @@ -35,7 +35,7 @@ from fastapi import Depends, FastAPI from fastapi.middleware.gzip import GZipMiddleware -from lsst.daf.butler import Butler +from lsst.daf.butler import Butler, SerializedDatasetType from ._factory import Factory @@ -61,3 +61,20 @@ def get_dimension_universe(factory: Factory = Depends(factory_dependency)) -> di """Allow remote client to get dimensions definition.""" butler = factory.create_butler() return butler.dimensions.dimensionConfig.toDict() + + +@app.get( + "/butler/v1/dataset_type/{dataset_type_name}", + summary="Retrieve this dataset type definition.", + response_model=SerializedDatasetType, + response_model_exclude_unset=True, + response_model_exclude_defaults=True, + response_model_exclude_none=True, +) +def get_dataset_type( + dataset_type_name: str, factory: Factory = Depends(factory_dependency) +) -> SerializedDatasetType: + """Return the dataset type.""" + butler = factory.create_butler() + datasetType = butler.get_dataset_type(dataset_type_name) + return datasetType.to_simple() diff --git a/tests/test_server.py b/tests/test_server.py index 401e0126dd..394ceae728 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -66,7 +66,7 @@ def setUpClass(cls): cls.root = makeTestTempDir(TESTDIR) cls.repo = MetricTestRepo(root=cls.root, configFile=os.path.join(TESTDIR, "config/basic/butler.yaml")) # Override the server's Butler initialization to point at our test repo - server_butler = Butler.from_config(cls.root) + server_butler = Butler.from_config(cls.root, writeable=True) def create_factory_dependency(): return Factory(butler=server_butler) @@ -77,6 +77,9 @@ def create_factory_dependency(): cls.client = TestClient(app) cls.butler = _make_remote_butler(cls.client) + # Populate the test server. 
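+        # The TestClient dispatches the RemoteButler's HTTP calls directly to
+        # this app instance, so data imported here is immediately visible to
+        # cls.butler.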
+ server_butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "base.yaml")) + @classmethod def tearDownClass(cls): del app.dependency_overrides[factory_dependency] @@ -91,6 +94,10 @@ def test_remote_butler(self): universe = self.butler.dimensions self.assertEqual(universe.namespace, "daf_butler") + def test_get_dataset_type(self): + bias_type = self.butler.get_dataset_type("bias") + self.assertEqual(bias_type.name, "bias") + if __name__ == "__main__": unittest.main() From 46c9cee7c62e22c4e8076852c96a2dde101928d8 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 26 Oct 2023 16:32:12 -0700 Subject: [PATCH 03/13] Add DirectButler.find_dataset --- python/lsst/daf/butler/__init__.py | 11 ++- python/lsst/daf/butler/_butler.py | 80 ++++++++++++++++++- python/lsst/daf/butler/direct_butler.py | 22 ++++- python/lsst/daf/butler/registry/_registry.py | 8 +- .../butler/remote_butler/_remote_butler.py | 78 +++++++++++++++++- .../butler/remote_butler/server/__init__.py | 1 + .../butler/remote_butler/server/_server.py | 53 +++++++++++- .../remote_butler/server/_server_models.py | 11 +++ tests/test_butler.py | 8 +- tests/test_server.py | 29 ++++++- tests/test_simpleButler.py | 6 +- 11 files changed, 289 insertions(+), 18 deletions(-) diff --git a/python/lsst/daf/butler/__init__.py b/python/lsst/daf/butler/__init__.py index c6ced36970..e40d97d505 100644 --- a/python/lsst/daf/butler/__init__.py +++ b/python/lsst/daf/butler/__init__.py @@ -79,7 +79,16 @@ # Do not import or lift symbols from 'server' or 'server_models'. # Import the registry subpackage directly for other symbols. -from .registry import CollectionSearch, CollectionType, MissingDatasetTypeError, Registry, RegistryConfig +from .registry import ( + CollectionArgType, + CollectionSearch, + CollectionType, + MissingCollectionError, + MissingDatasetTypeError, + NoDefaultCollectionError, + Registry, + RegistryConfig, +) from .transfers import RepoExportContext, YamlRepoExportBackend, YamlRepoImportBackend from .version import * diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index fbfa19d770..5ee27b8764 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -48,9 +48,10 @@ from ._file_dataset import FileDataset from ._limited_butler import LimitedButler from ._storage_class import StorageClass +from ._timespan import Timespan from .datastore import DatasetRefURIs, Datastore from .dimensions import DataId, DimensionConfig -from .registry import Registry, RegistryConfig, _RegistryFactory +from .registry import CollectionArgType, Registry, RegistryConfig, _RegistryFactory from .repo_relocation import BUTLER_ROOT_TAG from .transfers import RepoExportContext @@ -798,6 +799,83 @@ def get_dataset_type(self, name: str) -> DatasetType: """ raise NotImplementedError() + @abstractmethod + def find_dataset( + self, + datasetType: DatasetType | str, + dataId: DataId | None = None, + *, + collections: CollectionArgType | None = None, + timespan: Timespan | None = None, + datastore_records: bool = False, + **kwargs: Any, + ) -> DatasetRef | None: + """Find a dataset given its `DatasetType` and data ID. + + This can be used to obtain a `DatasetRef` that permits the dataset to + be read from a `Datastore`. If the dataset is a component and can not + be found using the provided dataset type, a dataset ref for the parent + will be returned instead but with the correct dataset type. + + Parameters + ---------- + datasetType : `DatasetType` or `str` + A `DatasetType` or the name of one. 
If this is a `DatasetType` + instance, its storage class will be respected and propagated to + the output, even if it differs from the dataset type definition + in the registry, as long as the storage classes are convertible. + dataId : `dict` or `DataCoordinate`, optional + A `dict`-like object containing the `Dimension` links that identify + the dataset within a collection. + collections : collection expression, optional + An expression that fully or partially identifies the collections to + search for the dataset; see + :ref:`daf_butler_collection_expressions` for more information. + Defaults to ``self.defaults.collections``. + timespan : `Timespan`, optional + A timespan that the validity range of the dataset must overlap. + If not provided, any `~CollectionType.CALIBRATION` collections + matched by the ``collections`` argument will not be searched. + **kwargs + Additional keyword arguments passed to + `DataCoordinate.standardize` to convert ``dataId`` to a true + `DataCoordinate` or augment an existing one. + + Returns + ------- + ref : `DatasetRef` + A reference to the dataset, or `None` if no matching Dataset + was found. + + Raises + ------ + lsst.daf.butler.NoDefaultCollectionError + Raised if ``collections`` is `None` and + ``self.collections`` is `None`. + LookupError + Raised if one or more data ID keys are missing. + lsst.daf.butler.registry.MissingDatasetTypeError + Raised if the dataset type does not exist. + lsst.daf.butler.MissingCollectionError + Raised if any of ``collections`` does not exist in the registry. + + Notes + ----- + This method simply returns `None` and does not raise an exception even + when the set of collections searched is intrinsically incompatible with + the dataset type, e.g. if ``datasetType.isCalibration() is False``, but + only `~CollectionType.CALIBRATION` collections are being searched. + This may make it harder to debug some lookup failures, but the behavior + is intentional; we consider it more important that failed searches are + reported consistently, regardless of the reason, and that adding + additional collections that do not contain a match to the search path + never changes the behavior. + + This method handles component dataset types automatically, though most + other registry operations do not. + """ + raise NotImplementedError() + @abstractmethod def retrieveArtifacts( self, diff --git a/python/lsst/daf/butler/direct_butler.py b/python/lsst/daf/butler/direct_butler.py index 1b73561b8e..701433af88 100644 --- a/python/lsst/daf/butler/direct_butler.py +++ b/python/lsst/daf/butler/direct_butler.py @@ -76,6 +76,7 @@ ) from .progress import Progress from .registry import ( + CollectionArgType, CollectionType, ConflictingDefinitionError, DataIdError, @@ -846,7 +847,7 @@ def _findDatasetRef( ) # Always lookup the DatasetRef, even if one is given, to ensure it is # present in the current collection. 
-        ref = self._registry.findDataset(
+        ref = self.find_dataset(
             datasetType,
             dataId,
             collections=collections,
@@ -1321,6 +1322,25 @@ def getURI(
     def get_dataset_type(self, name: str) -> DatasetType:
         return self._registry.getDatasetType(name)
 
+    def find_dataset(
+        self,
+        datasetType: DatasetType | str,
+        dataId: DataId | None = None,
+        *,
+        collections: CollectionArgType | None = None,
+        timespan: Timespan | None = None,
+        datastore_records: bool = False,
+        **kwargs: Any,
+    ) -> DatasetRef | None:
+        return self._registry.findDataset(
+            datasetType,
+            dataId,
+            collections=collections,
+            timespan=timespan,
+            datastore_records=datastore_records,
+            **kwargs,
+        )
+
     def retrieveArtifacts(
         self,
         refs: Iterable[DatasetRef],
diff --git a/python/lsst/daf/butler/registry/_registry.py b/python/lsst/daf/butler/registry/_registry.py
index 398f2479a4..2f0cb3231d 100644
--- a/python/lsst/daf/butler/registry/_registry.py
+++ b/python/lsst/daf/butler/registry/_registry.py
@@ -27,7 +27,7 @@
 
 from __future__ import annotations
 
-__all__ = ("Registry",)
+__all__ = ("Registry", "CollectionArgType")
 
 import contextlib
 import logging
@@ -35,7 +35,7 @@
 from abc import ABC, abstractmethod
 from collections.abc import Iterable, Iterator, Mapping, Sequence
 from types import EllipsisType
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, TypeAlias
 
 from .._dataset_association import DatasetAssociation
 from .._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
@@ -64,7 +64,9 @@
 _LOG = logging.getLogger(__name__)
 
 # Type alias for `collections` arguments.
-CollectionArgType = str | re.Pattern | Iterable[str | re.Pattern] | EllipsisType | CollectionWildcard
+CollectionArgType: TypeAlias = (
+    str | re.Pattern | Iterable[str | re.Pattern] | EllipsisType | CollectionWildcard
+)
 
 
 class Registry(ABC):
diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py
index bc0aa2d9f0..e5941b8fa3 100644
--- a/python/lsst/daf/butler/remote_butler/_remote_butler.py
+++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py
@@ -40,17 +40,20 @@
 from .._butler_config import ButlerConfig
 from .._config import Config
 from .._dataset_existence import DatasetExistence
-from .._dataset_ref import DatasetIdGenEnum, DatasetRef
+from .._dataset_ref import DatasetIdGenEnum, DatasetRef, SerializedDatasetRef
 from .._dataset_type import DatasetType, SerializedDatasetType
 from .._deferredDatasetHandle import DeferredDatasetHandle
 from .._file_dataset import FileDataset
 from .._limited_butler import LimitedButler
 from .._storage_class import StorageClass
+from .._timespan import Timespan
 from ..datastore import DatasetRefURIs
-from ..dimensions import DataId, DimensionConfig, DimensionUniverse
-from ..registry import Registry, RegistryDefaults
+from ..dimensions import DataCoordinate, DataId, DimensionConfig, DimensionUniverse, SerializedDataCoordinate
+from ..registry import CollectionArgType, NoDefaultCollectionError, Registry, RegistryDefaults
+from ..registry.wildcards import CollectionWildcard
 from ..transfers import RepoExportContext
 from ._config import RemoteButlerConfigModel
+from .server import FindDatasetModel
 
 
 class RemoteButler(Butler):
@@ -100,6 +103,39 @@ def dimensions(self) -> DimensionUniverse:
             self._dimensions = DimensionUniverse(config)
         return self._dimensions
 
+    def _simplify_dataId(
+        self, dataId: DataId | None, **kwargs: dict[str, int | str]
+    ) -> SerializedDataCoordinate | None:
+        """Take a generic Data ID and convert it to a serializable form.
+
+        Parameters
+        ----------
+        dataId : `dict`, `None`, `DataCoordinate`
+            The data ID to serialize.
+        **kwargs : `dict`
+            Additional values that should be included if this is not
+            a `DataCoordinate`.
+
+        Returns
+        -------
+        data_id : `SerializedDataCoordinate` or `None`
+            A serializable form.
+        """
+        if dataId is None and not kwargs:
+            return None
+        if isinstance(dataId, DataCoordinate):
+            return dataId.to_simple()
+
+        if dataId is None:
+            data_id = kwargs
+        elif kwargs:
+            # Change variable because DataId is immutable and mypy complains.
+            data_id = dict(dataId)
+            data_id.update(kwargs)
+        else:
+            # A plain mapping with no overrides; copy it so it can be
+            # treated uniformly as a dict below.
+            data_id = dict(dataId)
+
+        # Assume we can treat it as a dict.
+        return SerializedDataCoordinate(dataId=data_id)
+
     def transaction(self) -> AbstractContextManager[None]:
         """Will always raise NotImplementedError.
         Transactions are not supported by RemoteButler.
@@ -183,6 +219,42 @@ def get_dataset_type(self, name: str) -> DatasetType:
         response.raise_for_status()
         return DatasetType.from_simple(SerializedDatasetType(**response.json()), universe=self.dimensions)
 
+    def find_dataset(
+        self,
+        datasetType: DatasetType | str,
+        dataId: DataId | None = None,
+        *,
+        collections: CollectionArgType | None = None,
+        timespan: Timespan | None = None,
+        datastore_records: bool = False,
+        **kwargs: Any,
+    ) -> DatasetRef | None:
+        if collections is None:
+            if not self.collections:
+                raise NoDefaultCollectionError(
+                    "No collections provided to find_dataset, and no defaults from butler construction."
+                )
+            collections = self.collections
+        # Temporary hack. Assume strings for collections. In future
+        # want to construct CollectionWildcard and filter it through collection
+        # cache to generate list of collection names.
+        wildcards = CollectionWildcard.from_expression(collections)
+
+        if isinstance(datasetType, DatasetType):
+            datasetType = datasetType.name
+
+        query = FindDatasetModel(
+            dataId=self._simplify_dataId(dataId, **kwargs), collections=wildcards.strings
+        )
+
+        path = f"find_dataset/{datasetType}"
+        response = self._client.post(
+            self._get_url(path), json=query.model_dump(mode="json", exclude_unset=True)
+        )
+        response.raise_for_status()
+
+        if response.json() is None:
+            # The server returns null when no matching dataset was found.
+            return None
+        return DatasetRef.from_simple(SerializedDatasetRef(**response.json()), universe=self.dimensions)
+
     def retrieveArtifacts(
         self,
         refs: Iterable[DatasetRef],
diff --git a/python/lsst/daf/butler/remote_butler/server/__init__.py b/python/lsst/daf/butler/remote_butler/server/__init__.py
index d63badaf11..93c9018bc4 100644
--- a/python/lsst/daf/butler/remote_butler/server/__init__.py
+++ b/python/lsst/daf/butler/remote_butler/server/__init__.py
@@ -27,3 +27,4 @@
 
 from ._factory import *
 from ._server import *
+from ._server_models import *
diff --git a/python/lsst/daf/butler/remote_butler/server/_server.py b/python/lsst/daf/butler/remote_butler/server/_server.py
index 51bf01a4e6..b791651c47 100644
--- a/python/lsst/daf/butler/remote_butler/server/_server.py
+++ b/python/lsst/daf/butler/remote_butler/server/_server.py
@@ -35,9 +35,16 @@
 from fastapi import Depends, FastAPI
 from fastapi.middleware.gzip import GZipMiddleware
 
-from lsst.daf.butler import Butler, SerializedDatasetType
+from lsst.daf.butler import (
+    Butler,
+    DataCoordinate,
+    SerializedDataCoordinate,
+    SerializedDatasetRef,
+    SerializedDatasetType,
+)
 
 from ._factory import Factory
+from ._server_models import FindDatasetModel
 
 BUTLER_ROOT = "ci_hsc_gen3/DATA"
 
@@ -56,6 +63,26 @@ def factory_dependency() -> Factory:
     return Factory(butler=_make_global_butler())
 
 
+def unpack_dataId(butler: Butler, data_id:
SerializedDataCoordinate | None) -> DataCoordinate | None: + """Convert the serialized dataId back to full DataCoordinate. + + Parameters + ---------- + butler : `lsst.daf.butler.Butler` + The butler to use for registry and universe. + data_id : `SerializedDataCoordinate` or `None` + The serialized form. + + Returns + ------- + dataId : `DataCoordinate` or `None` + The DataId usable by registry. + """ + if data_id is None: + return None + return DataCoordinate.from_simple(data_id, registry=butler.registry) + + @app.get("/butler/v1/universe", response_model=dict[str, Any]) def get_dimension_universe(factory: Factory = Depends(factory_dependency)) -> dict[str, Any]: """Allow remote client to get dimensions definition.""" @@ -78,3 +105,27 @@ def get_dataset_type( butler = factory.create_butler() datasetType = butler.get_dataset_type(dataset_type_name) return datasetType.to_simple() + + +# Not yet supported: TimeSpan is not yet a pydantic model. +# collections parameter assumes client-side has resolved regexes. +@app.post( + "/butler/v1/find_dataset/{dataset_type}", + summary="Retrieve this dataset definition from collection, dataset type, and dataId", + response_model=SerializedDatasetRef, + response_model_exclude_unset=True, + response_model_exclude_defaults=True, + response_model_exclude_none=True, +) +def find_dataset( + dataset_type: str, + query: FindDatasetModel, + factory: Factory = Depends(factory_dependency), +) -> SerializedDatasetRef | None: + collection_query = query.collections if query.collections else None + + butler = factory.create_butler() + ref = butler.find_dataset( + dataset_type, dataId=unpack_dataId(butler, query.dataId), collections=collection_query + ) + return ref.to_simple() if ref else None diff --git a/python/lsst/daf/butler/remote_butler/server/_server_models.py b/python/lsst/daf/butler/remote_butler/server/_server_models.py index 1c34747e33..686a9ad571 100644 --- a/python/lsst/daf/butler/remote_butler/server/_server_models.py +++ b/python/lsst/daf/butler/remote_butler/server/_server_models.py @@ -26,3 +26,14 @@ # along with this program. If not, see . """Models used for client/server communication.""" + +__all__ = ["FindDatasetModel"] + +from lsst.daf.butler import SerializedDataCoordinate + +from ..._compat import _BaseModelCompat + + +class FindDatasetModel(_BaseModelCompat): + dataId: SerializedDataCoordinate + collections: list[str] diff --git a/tests/test_butler.py b/tests/test_butler.py index 5b68ebfce5..04eaa2a4ba 100644 --- a/tests/test_butler.py +++ b/tests/test_butler.py @@ -442,7 +442,7 @@ def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> Dir ) self.assertEqual(count, stop) - compRef = butler.registry.findDataset(compNameS, dataId, collections=butler.collections) + compRef = butler.find_dataset(compNameS, dataId, collections=butler.collections) assert compRef is not None summary = butler.get(compRef) self.assertEqual(summary, metric.summary) @@ -928,7 +928,7 @@ def testIngest(self) -> None: datasets[0].refs = [ cast( DatasetRef, - butler.registry.findDataset(ref.datasetType, dataId=ref.dataId, collections=ref.run), + butler.find_dataset(ref.datasetType, dataId=ref.dataId, collections=ref.run), ) for ref in datasets[0].refs ] @@ -938,7 +938,7 @@ def testIngest(self) -> None: for ref in dataset.refs: # Create a dict from the dataId to drop the records. 
new_data_id = {str(k): v for k, v in ref.dataId.items()} - new_ref = butler.registry.findDataset(ref.datasetType, new_data_id, collections=ref.run) + new_ref = butler.find_dataset(ref.datasetType, new_data_id, collections=ref.run) assert new_ref is not None self.assertFalse(new_ref.dataId.hasRecords()) refs.append(new_ref) @@ -1115,7 +1115,7 @@ def testTransaction(self) -> None: with self.assertRaises(LookupError, msg=f"Check can't get by {datasetTypeName} and {dataId}"): butler.get(datasetTypeName, dataId) # Also check explicitly if Dataset entry is missing - self.assertIsNone(butler.registry.findDataset(datasetType, dataId, collections=butler.collections)) + self.assertIsNone(butler.find_dataset(datasetType, dataId, collections=butler.collections)) # Direct retrieval should not find the file in the Datastore with self.assertRaises(FileNotFoundError, msg=f"Check {ref} can't be retrieved directly"): butler.get(ref) diff --git a/tests/test_server.py b/tests/test_server.py index 394ceae728..8a6d230bde 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -27,6 +27,7 @@ import os.path import unittest +import uuid try: # Failing to import any of these should disable the tests. @@ -37,7 +38,8 @@ TestClient = None app = None -from lsst.daf.butler import Butler +from lsst.daf.butler import Butler, DataCoordinate, DatasetRef +from lsst.daf.butler.tests import DatastoreMock from lsst.daf.butler.tests.utils import MetricTestRepo, makeTestTempDir, removeTestTempDir TESTDIR = os.path.abspath(os.path.dirname(__file__)) @@ -68,6 +70,9 @@ def setUpClass(cls): # Override the server's Butler initialization to point at our test repo server_butler = Butler.from_config(cls.root, writeable=True) + # Not yet testing butler.get() + DatastoreMock.apply(server_butler) + def create_factory_dependency(): return Factory(butler=server_butler) @@ -79,6 +84,7 @@ def create_factory_dependency(): # Populate the test server. server_butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "base.yaml")) + server_butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "datasets-uuid.yaml")) @classmethod def tearDownClass(cls): @@ -98,6 +104,27 @@ def test_get_dataset_type(self): bias_type = self.butler.get_dataset_type("bias") self.assertEqual(bias_type.name, "bias") + def test_find_dataset(self): + ref = self.butler.find_dataset("bias", collections="imported_g", detector=1, instrument="Cam1") + self.assertIsInstance(ref, DatasetRef) + self.assertEqual(ref.id, uuid.UUID("e15ab039-bc8b-4135-87c5-90902a7c0b22")) + + # Try again with variation of parameters. + ref_new = self.butler.find_dataset( + "bias", + {"detector": 1}, + collections="imported_g", + instrument="Cam1", + ) + self.assertEqual(ref_new, ref) + + ref_new = self.butler.find_dataset( + ref.datasetType, + DataCoordinate.standardize(detector=1, instrument="Cam1", universe=self.butler.dimensions), + collections="imported_g", + ) + self.assertEqual(ref_new, ref) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_simpleButler.py b/tests/test_simpleButler.py index 483d7d03ef..951652b59f 100644 --- a/tests/test_simpleButler.py +++ b/tests/test_simpleButler.py @@ -277,7 +277,7 @@ def testButlerGet(self): # Find the DatasetRef for a flat coll = "imported_g" - flat2g = butler.registry.findDataset( + flat2g = butler.find_dataset( "flat", instrument="Cam1", detector=2, physical_filter="Cam1-G", collections=coll ) @@ -512,7 +512,7 @@ def testRegistryDefaults(self): # input collections. 
butler.registry.defaults = RegistryDefaults(collections=["imported_g"]) # Use findDataset without collections or instrument. - ref = butler.registry.findDataset("flat", detector=2, physical_filter="Cam1-G") + ref = butler.find_dataset("flat", detector=2, physical_filter="Cam1-G") # Do the same with Butler.get; this should ultimately invoke a lot of # the same code, so it's a bit circular, but mostly we're checking that # it works at all. @@ -583,7 +583,7 @@ def testJson(self): # input collections. butler.registry.defaults = RegistryDefaults(collections=["imported_g"]) # Use findDataset without collections or instrument. - ref = butler.registry.findDataset("flat", detector=2, physical_filter="Cam1-G") + ref = butler.find_dataset("flat", detector=2, physical_filter="Cam1-G") # Transform the ref and dataset type to and from JSON # and check that it can be reconstructed properly From 87031862cb05e6dfc884d03124e6f093d0dc63f9 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 27 Oct 2023 09:57:42 -0700 Subject: [PATCH 04/13] Add Butler.get_dataset --- python/lsst/daf/butler/_butler.py | 19 +++++++++++++++++- python/lsst/daf/butler/direct_butler.py | 5 ++++- .../butler/remote_butler/_remote_butler.py | 10 +++++++++- .../butler/remote_butler/server/_server.py | 20 +++++++++++++++++++ tests/test_butler.py | 2 +- tests/test_server.py | 6 ++++++ 6 files changed, 58 insertions(+), 4 deletions(-) diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index 5ee27b8764..1cd7409efa 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -42,7 +42,7 @@ from ._butler_repo_index import ButlerRepoIndex from ._config import Config, ConfigSubset from ._dataset_existence import DatasetExistence -from ._dataset_ref import DatasetIdGenEnum, DatasetRef +from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef from ._dataset_type import DatasetType from ._deferredDatasetHandle import DeferredDatasetHandle from ._file_dataset import FileDataset @@ -799,6 +799,23 @@ def get_dataset_type(self, name: str) -> DatasetType: """ raise NotImplementedError() + @abstractmethod + def get_dataset(self, id: DatasetId) -> DatasetRef | None: + """Retrieve a Dataset entry. + + Parameters + ---------- + id : `DatasetId` + The unique identifier for the dataset. + + Returns + ------- + ref : `DatasetRef` or `None` + A ref to the Dataset, or `None` if no matching Dataset + was found. 
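+
+        Examples
+        --------
+        A minimal round-trip sketch, where ``ref`` is assumed to be a
+        previously resolved `DatasetRef`::
+
+            ref2 = butler.get_dataset(ref.id)
+            assert ref2 == ref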
+ """ + raise NotImplementedError() + @abstractmethod def find_dataset( self, diff --git a/python/lsst/daf/butler/direct_butler.py b/python/lsst/daf/butler/direct_butler.py index 701433af88..e9c979dbe1 100644 --- a/python/lsst/daf/butler/direct_butler.py +++ b/python/lsst/daf/butler/direct_butler.py @@ -55,7 +55,7 @@ from ._butler_config import ButlerConfig from ._config import Config from ._dataset_existence import DatasetExistence -from ._dataset_ref import DatasetIdGenEnum, DatasetRef +from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef from ._dataset_type import DatasetType from ._deferredDatasetHandle import DeferredDatasetHandle from ._exceptions import ValidationError @@ -1322,6 +1322,9 @@ def getURI( def get_dataset_type(self, name: str) -> DatasetType: return self._registry.getDatasetType(name) + def get_dataset(self, id: DatasetId) -> DatasetRef | None: + return self._registry.getDataset(id) + def find_dataset( self, datasetType: DatasetType | str, diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index e5941b8fa3..3bf2bd952a 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -40,7 +40,7 @@ from .._butler_config import ButlerConfig from .._config import Config from .._dataset_existence import DatasetExistence -from .._dataset_ref import DatasetIdGenEnum, DatasetRef, SerializedDatasetRef +from .._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef, SerializedDatasetRef from .._dataset_type import DatasetType, SerializedDatasetType from .._deferredDatasetHandle import DeferredDatasetHandle from .._file_dataset import FileDataset @@ -219,6 +219,14 @@ def get_dataset_type(self, name: str) -> DatasetType: response.raise_for_status() return DatasetType.from_simple(SerializedDatasetType(**response.json()), universe=self.dimensions) + def get_dataset(self, id: DatasetId) -> DatasetRef | None: + path = f"dataset/{id}" + response = self._client.get(self._get_url(path)) + response.raise_for_status() + if response.json() is None: + return None + return DatasetRef.from_simple(SerializedDatasetRef(**response.json()), universe=self.dimensions) + def find_dataset( self, datasetType: DatasetType | str, diff --git a/python/lsst/daf/butler/remote_butler/server/_server.py b/python/lsst/daf/butler/remote_butler/server/_server.py index b791651c47..724d2df121 100644 --- a/python/lsst/daf/butler/remote_butler/server/_server.py +++ b/python/lsst/daf/butler/remote_butler/server/_server.py @@ -30,6 +30,7 @@ __all__ = ("app", "factory_dependency") import logging +import uuid from functools import cache from typing import Any @@ -107,6 +108,25 @@ def get_dataset_type( return datasetType.to_simple() +@app.get( + "/butler/v1/dataset/{id}", + summary="Retrieve this dataset definition.", + response_model=SerializedDatasetRef | None, + response_model_exclude_unset=True, + response_model_exclude_defaults=True, + response_model_exclude_none=True, +) +def get_dataset(id: uuid.UUID, factory: Factory = Depends(factory_dependency)) -> SerializedDatasetRef | None: + """Return a single dataset reference.""" + butler = factory.create_butler() + ref = butler.get_dataset(id) + if ref is not None: + return ref.to_simple() + # This could raise a 404 since id is not found. The standard implementation + # get_dataset method returns without error so follow that example here. + return ref + + # Not yet supported: TimeSpan is not yet a pydantic model. 
# collections parameter assumes client-side has resolved regexes. @app.post( diff --git a/tests/test_butler.py b/tests/test_butler.py index 04eaa2a4ba..e307fb455f 100644 --- a/tests/test_butler.py +++ b/tests/test_butler.py @@ -384,7 +384,7 @@ def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> Dir with self.assertRaises(FileNotFoundError): butler.get(ref) # Registry shouldn't be able to find it by dataset_id anymore. - self.assertIsNone(butler.registry.getDataset(ref.id)) + self.assertIsNone(butler.get_dataset(ref.id)) # Do explicit registry removal since we know they are # empty diff --git a/tests/test_server.py b/tests/test_server.py index 8a6d230bde..93d3226f63 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -125,6 +125,12 @@ def test_find_dataset(self): ) self.assertEqual(ref_new, ref) + ref2 = self.butler.get_dataset(ref.id) + self.assertEqual(ref2, ref) + + # Unknown dataset should not fail. + self.assertIsNone(self.butler.get_dataset(uuid.uuid4())) + if __name__ == "__main__": unittest.main() From 04bd9b5308130e427653c0e5150cfb895d8d6501 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 27 Oct 2023 11:26:44 -0700 Subject: [PATCH 05/13] Add news fragment --- doc/changes/DM-41365.feature.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc/changes/DM-41365.feature.rst diff --git a/doc/changes/DM-41365.feature.rst b/doc/changes/DM-41365.feature.rst new file mode 100644 index 0000000000..c95608d470 --- /dev/null +++ b/doc/changes/DM-41365.feature.rst @@ -0,0 +1 @@ +Added new ``Butler`` APIs migrated from registry: ``Butler.get_dataset_type()``, ``Butler.get_dataset()``, and ``Butler.find_dataset()``. From 2b6ba5a4c6e1b153a676b19c24fc282d5a0b27a7 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 27 Oct 2023 13:44:11 -0700 Subject: [PATCH 06/13] Check whether remote butler thinks it is writeable --- tests/test_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_server.py b/tests/test_server.py index 93d3226f63..3fb388eabb 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -99,6 +99,7 @@ def test_simple(self): def test_remote_butler(self): universe = self.butler.dimensions self.assertEqual(universe.namespace, "daf_butler") + self.assertFalse(self.butler.isWriteable()) def test_get_dataset_type(self): bias_type = self.butler.get_dataset_type("bias") From b22a5f3e7cbd37cb7ad2c36a61939adb161f8a0b Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 31 Oct 2023 11:48:44 -0700 Subject: [PATCH 07/13] Use data_id/dataset_type in find_dataset and simplify collections The underscores are the preferred way to add new APIs now. The collections parameter should not support wildcards so explicitly declare it should be a sequence of str. 
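
As an illustration, a call that previously read

    butler.find_dataset("flat", dataId={"detector": 2}, collections="imported_g")

is now written

    butler.find_dataset("flat", data_id={"detector": 2}, collections="imported_g")

(the dataset type, data ID, and collection names here are the ones already
used in the test suite).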
---
 python/lsst/daf/butler/_butler.py          | 22 +++++++++----------
 python/lsst/daf/butler/direct_butler.py    | 11 +++++-----
 .../butler/remote_butler/_remote_butler.py | 16 +++++++-------
 .../butler/remote_butler/server/_server.py |  2 +-
 .../remote_butler/server/_server_models.py |  2 +-
 tests/test_butler.py                       |  2 +-
 6 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py
index 1cd7409efa..4611e6e556 100644
--- a/python/lsst/daf/butler/_butler.py
+++ b/python/lsst/daf/butler/_butler.py
@@ -51,7 +51,7 @@
 from ._timespan import Timespan
 from .datastore import DatasetRefURIs, Datastore
 from .dimensions import DataId, DimensionConfig
-from .registry import CollectionArgType, Registry, RegistryConfig, _RegistryFactory
+from .registry import Registry, RegistryConfig, _RegistryFactory
 from .repo_relocation import BUTLER_ROOT_TAG
 from .transfers import RepoExportContext
 
@@ -819,10 +819,10 @@ def get_dataset(self, id: DatasetId) -> DatasetRef | None:
     @abstractmethod
     def find_dataset(
         self,
-        datasetType: DatasetType | str,
-        dataId: DataId | None = None,
+        dataset_type: DatasetType | str,
+        data_id: DataId | None = None,
         *,
-        collections: CollectionArgType | None = None,
+        collections: str | Sequence[str] | None = None,
         timespan: Timespan | None = None,
         datastore_records: bool = False,
         **kwargs: Any,
@@ -836,18 +836,16 @@ def find_dataset(
 
         Parameters
         ----------
-        datasetType : `DatasetType` or `str`
+        dataset_type : `DatasetType` or `str`
             A `DatasetType` or the name of one. If this is a `DatasetType`
             instance, its storage class will be respected and propagated to
             the output, even if it differs from the dataset type definition
             in the registry, as long as the storage classes are convertible.
-        dataId : `dict` or `DataCoordinate`, optional
+        data_id : `dict` or `DataCoordinate`, optional
             A `dict`-like object containing the `Dimension` links that identify
             the dataset within a collection.
-        collections : collection expression, optional
-            An expression that fully or partially identifies the collections to
-            search for the dataset; see
-            :ref:`daf_butler_collection_expressions` for more information.
+        collections : `str` or `list` [`str`], optional
+            An ordered list of collections to search for the dataset.
             Defaults to ``self.defaults.collections``.
         timespan : `Timespan`, optional
             A timespan that the validity range of the dataset must overlap.
@@ -871,7 +869,7 @@ def find_dataset(
             ``self.collections`` is `None`.
         LookupError
             Raised if one or more data ID keys are missing.
-        lsst.daf.butler.registry.MissingDatasetTypeError
+        lsst.daf.butler.MissingDatasetTypeError
             Raised if the dataset type does not exist.
         lsst.daf.butler.MissingCollectionError
             Raised if any of ``collections`` does not exist in the registry.
@@ -889,7 +887,7 @@ def find_dataset(
             never changes the behavior.
 
         This method handles component dataset types automatically, though most
-        other registry operations do not.
+        other query operations do not.
""" raise NotImplementedError() diff --git a/python/lsst/daf/butler/direct_butler.py b/python/lsst/daf/butler/direct_butler.py index e9c979dbe1..b38a7a8aaf 100644 --- a/python/lsst/daf/butler/direct_butler.py +++ b/python/lsst/daf/butler/direct_butler.py @@ -76,7 +76,6 @@ ) from .progress import Progress from .registry import ( - CollectionArgType, CollectionType, ConflictingDefinitionError, DataIdError, @@ -1327,17 +1326,17 @@ def get_dataset(self, id: DatasetId) -> DatasetRef | None: def find_dataset( self, - datasetType: DatasetType | str, - dataId: DataId | None = None, + dataset_type: DatasetType | str, + data_id: DataId | None = None, *, - collections: CollectionArgType | None = None, + collections: str | Sequence[str] | None = None, timespan: Timespan | None = None, datastore_records: bool = False, **kwargs: Any, ) -> DatasetRef | None: return self._registry.findDataset( - datasetType, - dataId, + dataset_type, + data_id, collections=collections, timespan=timespan, dataset_records=datastore_records, diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index 3bf2bd952a..6363591f83 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -49,7 +49,7 @@ from .._timespan import Timespan from ..datastore import DatasetRefURIs from ..dimensions import DataCoordinate, DataId, DimensionConfig, DimensionUniverse, SerializedDataCoordinate -from ..registry import CollectionArgType, NoDefaultCollectionError, Registry, RegistryDefaults +from ..registry import NoDefaultCollectionError, Registry, RegistryDefaults from ..registry.wildcards import CollectionWildcard from ..transfers import RepoExportContext from ._config import RemoteButlerConfigModel @@ -229,10 +229,10 @@ def get_dataset(self, id: DatasetId) -> DatasetRef | None: def find_dataset( self, - datasetType: DatasetType | str, - dataId: DataId | None = None, + dataset_type: DatasetType | str, + data_id: DataId | None = None, *, - collections: CollectionArgType | None = None, + collections: str | Sequence[str] | None = None, timespan: Timespan | None = None, datastore_records: bool = False, **kwargs: Any, @@ -248,14 +248,14 @@ def find_dataset( # cache to generate list of collection names. 
         wildcards = CollectionWildcard.from_expression(collections)
 
-        if isinstance(datasetType, DatasetType):
-            datasetType = datasetType.name
+        if isinstance(dataset_type, DatasetType):
+            dataset_type = dataset_type.name
 
         query = FindDatasetModel(
-            dataId=self._simplify_dataId(dataId, **kwargs), collections=wildcards.strings
+            data_id=self._simplify_dataId(data_id, **kwargs), collections=wildcards.strings
         )
 
-        path = f"find_dataset/{datasetType}"
+        path = f"find_dataset/{dataset_type}"
         response = self._client.post(
             self._get_url(path), json=query.model_dump(mode="json", exclude_unset=True)
         )
diff --git a/python/lsst/daf/butler/remote_butler/server/_server.py b/python/lsst/daf/butler/remote_butler/server/_server.py
index 724d2df121..a425d21dea 100644
--- a/python/lsst/daf/butler/remote_butler/server/_server.py
+++ b/python/lsst/daf/butler/remote_butler/server/_server.py
@@ -146,6 +146,6 @@ def find_dataset(
 
     butler = factory.create_butler()
     ref = butler.find_dataset(
-        dataset_type, dataId=unpack_dataId(butler, query.dataId), collections=collection_query
+        dataset_type, data_id=unpack_dataId(butler, query.data_id), collections=collection_query
     )
     return ref.to_simple() if ref else None
diff --git a/python/lsst/daf/butler/remote_butler/server/_server_models.py b/python/lsst/daf/butler/remote_butler/server/_server_models.py
index 686a9ad571..24a20829e6 100644
--- a/python/lsst/daf/butler/remote_butler/server/_server_models.py
+++ b/python/lsst/daf/butler/remote_butler/server/_server_models.py
@@ -35,5 +35,5 @@
 
 class FindDatasetModel(_BaseModelCompat):
-    dataId: SerializedDataCoordinate
+    data_id: SerializedDataCoordinate
     collections: list[str]
diff --git a/tests/test_butler.py b/tests/test_butler.py
index e307fb455f..f34b144d10 100644
--- a/tests/test_butler.py
+++ b/tests/test_butler.py
@@ -928,7 +928,7 @@ def testIngest(self) -> None:
         datasets[0].refs = [
             cast(
                 DatasetRef,
-                butler.find_dataset(ref.datasetType, dataId=ref.dataId, collections=ref.run),
+                butler.find_dataset(ref.datasetType, data_id=ref.dataId, collections=ref.run),
             )
             for ref in datasets[0].refs
         ]

From 02c54d2b65428119a5718f5c94bf6b59be6bc1e2 Mon Sep 17 00:00:00 2001
From: Tim Jenness
Date: Tue, 31 Oct 2023 16:28:10 -0700
Subject: [PATCH 08/13] Add support for day_obs/seq_num to Butler.find_dataset

---
 python/lsst/daf/butler/_butler.py                      | 9 +++++++--
 python/lsst/daf/butler/direct_butler.py                | 8 ++++++++
 python/lsst/daf/butler/remote_butler/server/_server.py | 9 ++++++---
 tests/test_server.py                                   | 9 +++++++++
 tests/test_simpleButler.py                             | 2 +-
 5 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py
index 4611e6e556..d400957a52 100644
--- a/python/lsst/daf/butler/_butler.py
+++ b/python/lsst/daf/butler/_butler.py
@@ -843,7 +843,10 @@ def find_dataset(
             in the registry, as long as the storage classes are convertible.
         data_id : `dict` or `DataCoordinate`, optional
             A `dict`-like object containing the `Dimension` links that identify
-            the dataset within a collection.
+            the dataset within a collection. If it is a `dict`, the dataId
+            can include dimension record values such as ``day_obs`` and
+            ``seq_num`` or ``full_name`` that can be used to derive the
+            primary dimension.
         collections : `str` or `list` [`str`], optional
             An ordered list of collections to search for the dataset.
             Defaults to ``self.defaults.collections``.
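
As a sketch of what the dimension-record support below enables, a detector can
now be identified by its ``full_name`` record value instead of its integer
primary key (values taken from the updated tests in this patch):

    ref = butler.find_dataset(
        "flat",
        instrument="Cam1",
        full_name="Ab",
        physical_filter="Cam1-G",
        collections="imported_g",
    )
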
@@ -854,7 +857,9 @@ def find_dataset( **kwargs Additional keyword arguments passed to `DataCoordinate.standardize` to convert ``dataId`` to a true - `DataCoordinate` or augment an existing one. + `DataCoordinate` or augment an existing one. This can also include + dimension record metadata that can be used to derive a primary + dimension value. Returns ------- diff --git a/python/lsst/daf/butler/direct_butler.py b/python/lsst/daf/butler/direct_butler.py index b38a7a8aaf..25817658a6 100644 --- a/python/lsst/daf/butler/direct_butler.py +++ b/python/lsst/daf/butler/direct_butler.py @@ -1334,6 +1334,14 @@ def find_dataset( datastore_records: bool = False, **kwargs: Any, ) -> DatasetRef | None: + # Handle any parts of the dataID that are not using primary dimension + # keys. + if isinstance(dataset_type, str): + actual_type = self.get_dataset_type(dataset_type) + else: + actual_type = dataset_type + data_id, kwargs = self._rewrite_data_id(data_id, actual_type, **kwargs) + return self._registry.findDataset( dataset_type, data_id, diff --git a/python/lsst/daf/butler/remote_butler/server/_server.py b/python/lsst/daf/butler/remote_butler/server/_server.py index a425d21dea..0e036ab67a 100644 --- a/python/lsst/daf/butler/remote_butler/server/_server.py +++ b/python/lsst/daf/butler/remote_butler/server/_server.py @@ -144,8 +144,11 @@ def find_dataset( ) -> SerializedDatasetRef | None: collection_query = query.collections if query.collections else None + # Get the simple dict from the SerializedDataCoordinate. We do not know + # if it is a well-defined DataCoordinate or needs some massaging first. + # find_dataset will use dimension record queries if necessary. + data_id = query.data_id.dataId + butler = factory.create_butler() - ref = butler.find_dataset( - dataset_type, data_id=unpack_dataId(butler, query.data_id), collections=collection_query - ) + ref = butler.find_dataset(dataset_type, None, collections=collection_query, **data_id) return ref.to_simple() if ref else None diff --git a/tests/test_server.py b/tests/test_server.py index 3fb388eabb..05daac7e06 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -129,6 +129,15 @@ def test_find_dataset(self): ref2 = self.butler.get_dataset(ref.id) self.assertEqual(ref2, ref) + # Use detector name to find it. + ref3 = self.butler.find_dataset( + ref.datasetType, + collections="imported_g", + instrument="Cam1", + full_name="Aa", + ) + self.assertEqual(ref2, ref3) + # Unknown dataset should not fail. 
self.assertIsNone(self.butler.get_dataset(uuid.uuid4())) diff --git a/tests/test_simpleButler.py b/tests/test_simpleButler.py index 951652b59f..e14370fc0a 100644 --- a/tests/test_simpleButler.py +++ b/tests/test_simpleButler.py @@ -278,7 +278,7 @@ def testButlerGet(self): # Find the DatasetRef for a flat coll = "imported_g" flat2g = butler.find_dataset( - "flat", instrument="Cam1", detector=2, physical_filter="Cam1-G", collections=coll + "flat", instrument="Cam1", full_name="Ab", physical_filter="Cam1-G", collections=coll ) # Create a numpy integer to check that works fine From 6c120a94d11cde9b1aa44691ee0b809d6d427025 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Wed, 1 Nov 2023 12:21:23 -0700 Subject: [PATCH 09/13] Add storage class conversion to get_dataset and find_dataset --- python/lsst/daf/butler/_butler.py | 9 +++++++- python/lsst/daf/butler/direct_butler.py | 15 +++++++++--- .../butler/remote_butler/_remote_butler.py | 23 +++++++++++++++---- .../butler/remote_butler/server/_server.py | 10 +++++--- .../remote_butler/server/_server_models.py | 1 + tests/test_server.py | 14 ++++++++++- 6 files changed, 60 insertions(+), 12 deletions(-) diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index d400957a52..20ef6648b5 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -800,13 +800,16 @@ def get_dataset_type(self, name: str) -> DatasetType: raise NotImplementedError() @abstractmethod - def get_dataset(self, id: DatasetId) -> DatasetRef | None: + def get_dataset(self, id: DatasetId, storage_class: str | StorageClass | None) -> DatasetRef | None: """Retrieve a Dataset entry. Parameters ---------- id : `DatasetId` The unique identifier for the dataset. + storage_class : `str` or `StorageClass` or `None` + A storage class to use when creating the returned entry. If given + it must be compatible with the default storage class. Returns ------- @@ -824,6 +827,7 @@ def find_dataset( *, collections: str | Sequence[str] | None = None, timespan: Timespan | None = None, + storage_class: str | StorageClass | None = None, datastore_records: bool = False, **kwargs: Any, ) -> DatasetRef | None: @@ -854,6 +858,9 @@ def find_dataset( A timespan that the validity range of the dataset must overlap. If not provided, any `~CollectionType.CALIBRATION` collections matched by the ``collections`` argument will not be searched. + storage_class : `str` or `StorageClass` or `None` + A storage class to use when creating the returned entry. If given + it must be compatible with the default storage class. 
         **kwargs
             Additional keyword arguments passed to
             `DataCoordinate.standardize` to convert ``dataId`` to a true
diff --git a/python/lsst/daf/butler/direct_butler.py b/python/lsst/daf/butler/direct_butler.py
index 25817658a6..bd85516672 100644
--- a/python/lsst/daf/butler/direct_butler.py
+++ b/python/lsst/daf/butler/direct_butler.py
@@ -1321,8 +1321,13 @@ def getURI(
     def get_dataset_type(self, name: str) -> DatasetType:
         return self._registry.getDatasetType(name)
 
-    def get_dataset(self, id: DatasetId) -> DatasetRef | None:
-        return self._registry.getDataset(id)
+    def get_dataset(
+        self, id: DatasetId, storage_class: str | StorageClass | None = None
+    ) -> DatasetRef | None:
+        ref = self._registry.getDataset(id)
+        if ref is not None and storage_class:
+            ref = ref.overrideStorageClass(storage_class)
+        return ref
 
     def find_dataset(
         self,
@@ -1331,6 +1336,7 @@ def find_dataset(
         *,
         collections: str | Sequence[str] | None = None,
         timespan: Timespan | None = None,
+        storage_class: str | StorageClass | None = None,
         datastore_records: bool = False,
         **kwargs: Any,
     ) -> DatasetRef | None:
@@ -1342,7 +1348,7 @@ def find_dataset(
             actual_type = dataset_type
         data_id, kwargs = self._rewrite_data_id(data_id, actual_type, **kwargs)
 
-        return self._registry.findDataset(
+        ref = self._registry.findDataset(
             dataset_type,
             data_id,
             collections=collections,
             timespan=timespan,
             datastore_records=datastore_records,
             **kwargs,
         )
+        if ref is not None and storage_class is not None:
+            ref = ref.overrideStorageClass(storage_class)
+        return ref
 
     def retrieveArtifacts(
         self,
diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py
index 6363591f83..8ba367bddf 100644
--- a/python/lsst/daf/butler/remote_butler/_remote_butler.py
+++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py
@@ -219,9 +219,18 @@ def get_dataset_type(self, name: str) -> DatasetType:
         response.raise_for_status()
         return DatasetType.from_simple(SerializedDatasetType(**response.json()), universe=self.dimensions)
 
-    def get_dataset(self, id: DatasetId) -> DatasetRef | None:
+    def get_dataset(
+        self, id: DatasetId, storage_class: str | StorageClass | None = None
+    ) -> DatasetRef | None:
         path = f"dataset/{id}"
-        response = self._client.get(self._get_url(path))
+        if isinstance(storage_class, StorageClass):
+            storage_class_name = storage_class.name
+        elif storage_class:
+            storage_class_name = storage_class
+        params: dict[str, str] = {}
+        if storage_class:
+            params["storage_class"] = storage_class_name
+        response = self._client.get(self._get_url(path), params=params)
         response.raise_for_status()
         if response.json() is None:
             return None
@@ -234,6 +243,7 @@ def find_dataset(
         *,
         collections: str | Sequence[str] | None = None,
         timespan: Timespan | None = None,
+        storage_class: str | StorageClass | None = None,
         datastore_records: bool = False,
         **kwargs: Any,
     ) -> DatasetRef | None:
@@ -251,13 +261,18 @@ def find_dataset(
         if isinstance(dataset_type, DatasetType):
             dataset_type = dataset_type.name
 
+        if isinstance(storage_class, StorageClass):
+            storage_class = storage_class.name
+
         query = FindDatasetModel(
-            data_id=self._simplify_dataId(data_id, **kwargs), collections=wildcards.strings
+            data_id=self._simplify_dataId(data_id, **kwargs),
+            collections=wildcards.strings,
+            storage_class=storage_class,
         )
 
         path = f"find_dataset/{dataset_type}"
         response = self._client.post(
-            self._get_url(path), json=query.model_dump(mode="json", exclude_unset=True)
+            self._get_url(path),
json=query.model_dump(mode="json", exclude_unset=True, exclude_defaults=True) ) response.raise_for_status() diff --git a/python/lsst/daf/butler/remote_butler/server/_server.py b/python/lsst/daf/butler/remote_butler/server/_server.py index 0e036ab67a..92798b2628 100644 --- a/python/lsst/daf/butler/remote_butler/server/_server.py +++ b/python/lsst/daf/butler/remote_butler/server/_server.py @@ -116,10 +116,12 @@ def get_dataset_type( response_model_exclude_defaults=True, response_model_exclude_none=True, ) -def get_dataset(id: uuid.UUID, factory: Factory = Depends(factory_dependency)) -> SerializedDatasetRef | None: +def get_dataset( + id: uuid.UUID, storage_class: str | None = None, factory: Factory = Depends(factory_dependency) +) -> SerializedDatasetRef | None: """Return a single dataset reference.""" butler = factory.create_butler() - ref = butler.get_dataset(id) + ref = butler.get_dataset(id, storage_class=storage_class) if ref is not None: return ref.to_simple() # This could raise a 404 since id is not found. The standard implementation @@ -150,5 +152,7 @@ def find_dataset( data_id = query.data_id.dataId butler = factory.create_butler() - ref = butler.find_dataset(dataset_type, None, collections=collection_query, **data_id) + ref = butler.find_dataset( + dataset_type, None, collections=collection_query, storage_class=query.storage_class, **data_id + ) return ref.to_simple() if ref else None diff --git a/python/lsst/daf/butler/remote_butler/server/_server_models.py b/python/lsst/daf/butler/remote_butler/server/_server_models.py index 24a20829e6..d9200976b1 100644 --- a/python/lsst/daf/butler/remote_butler/server/_server_models.py +++ b/python/lsst/daf/butler/remote_butler/server/_server_models.py @@ -37,3 +37,4 @@ class FindDatasetModel(_BaseModelCompat): data_id: SerializedDataCoordinate collections: list[str] + storage_class: str | None diff --git a/tests/test_server.py b/tests/test_server.py index 05daac7e06..4622a5a203 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -38,7 +38,7 @@ TestClient = None app = None -from lsst.daf.butler import Butler, DataCoordinate, DatasetRef +from lsst.daf.butler import Butler, DataCoordinate, DatasetRef, StorageClassFactory from lsst.daf.butler.tests import DatastoreMock from lsst.daf.butler.tests.utils import MetricTestRepo, makeTestTempDir, removeTestTempDir @@ -64,6 +64,8 @@ class ButlerClientServerTestCase(unittest.TestCase): @classmethod def setUpClass(cls): + cls.storageClassFactory = StorageClassFactory() + # First create a butler and populate it. cls.root = makeTestTempDir(TESTDIR) cls.repo = MetricTestRepo(root=cls.root, configFile=os.path.join(TESTDIR, "config/basic/butler.yaml")) @@ -106,6 +108,8 @@ def test_get_dataset_type(self): self.assertEqual(bias_type.name, "bias") def test_find_dataset(self): + storage_class = self.storageClassFactory.getStorageClass("Exposure") + ref = self.butler.find_dataset("bias", collections="imported_g", detector=1, instrument="Cam1") self.assertIsInstance(ref, DatasetRef) self.assertEqual(ref.id, uuid.UUID("e15ab039-bc8b-4135-87c5-90902a7c0b22")) @@ -123,6 +127,7 @@ def test_find_dataset(self): ref.datasetType, DataCoordinate.standardize(detector=1, instrument="Cam1", universe=self.butler.dimensions), collections="imported_g", + storage_class=storage_class, ) self.assertEqual(ref_new, ref) @@ -138,8 +143,15 @@ def test_find_dataset(self): ) self.assertEqual(ref2, ref3) + # The test datasets are all Exposure so storage class conversion + # can not be tested until we fix that. 
For now at least test the
+        # code paths.
+        bias = self.butler.get_dataset(ref.id, storage_class=storage_class)
+        self.assertEqual(bias.datasetType.storageClass, storage_class)
+
         # Unknown dataset should not fail.
         self.assertIsNone(self.butler.get_dataset(uuid.uuid4()))
+        self.assertIsNone(self.butler.get_dataset(uuid.uuid4(), storage_class="NumpyArray"))

 if __name__ == "__main__":

From adf86006c524e4e0cf9653cae7204754ada443e9 Mon Sep 17 00:00:00 2001
From: Tim Jenness
Date: Thu, 2 Nov 2023 09:39:40 -0700
Subject: [PATCH 10/13] Remove lifting of CollectionArgType

---
 python/lsst/daf/butler/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/lsst/daf/butler/__init__.py b/python/lsst/daf/butler/__init__.py
index e40d97d505..8333081ddc 100644
--- a/python/lsst/daf/butler/__init__.py
+++ b/python/lsst/daf/butler/__init__.py
@@ -80,7 +80,6 @@
 # Do not import or lift symbols from 'server' or 'server_models'.
 # Import the registry subpackage directly for other symbols.
 from .registry import (
-    CollectionArgType,
     CollectionSearch,
     CollectionType,
     MissingCollectionError,

From 694614b4125a223abcf9360dc6ea6cef0b07e84c Mon Sep 17 00:00:00 2001
From: Tim Jenness
Date: Thu, 2 Nov 2023 12:02:40 -0700
Subject: [PATCH 11/13] Add missing instrument metadata to test yaml

visit_system is now required but no test was triggering validation
until recently.

---
 tests/data/registry/base.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/data/registry/base.yaml b/tests/data/registry/base.yaml
index 799e385cfa..7b2791ece0 100644
--- a/tests/data/registry/base.yaml
+++ b/tests/data/registry/base.yaml
@@ -13,6 +13,7 @@ data:
   - name: Cam1
     visit_max: 1024
+    visit_system: 1
     exposure_max: 512
     detector_max: 4
     class_name: lsst.obs.base.Instrument

From 1e3d68c2a155215c755c62404ea5fdd1de110740 Mon Sep 17 00:00:00 2001
From: Tim Jenness
Date: Thu, 2 Nov 2023 12:03:20 -0700
Subject: [PATCH 12/13] Add dimension and datastore record retrieval for get_dataset/find_dataset

Datastore records do not work in remote butler because they were never
added to SerializedDatasetRef.

---
 python/lsst/daf/butler/_butler.py              | 17 +++++++++++++-
 python/lsst/daf/butler/direct_butler.py        | 20 +++++++++++++----
 .../butler/remote_butler/_remote_butler.py     | 21 ++++++++++++++++--
 .../butler/remote_butler/server/_server.py     | 22 ++++++++++++++++---
 .../remote_butler/server/_server_models.py     |  2 ++
 tests/test_server.py                           |  8 +++++++
 6 files changed, 80 insertions(+), 10 deletions(-)

diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py
index 20ef6648b5..f83a4ad347 100644
--- a/python/lsst/daf/butler/_butler.py
+++ b/python/lsst/daf/butler/_butler.py
@@ -800,7 +800,13 @@ def get_dataset_type(self, name: str) -> DatasetType:
         raise NotImplementedError()

     @abstractmethod
-    def get_dataset(self, id: DatasetId, storage_class: str | StorageClass | None = None) -> DatasetRef | None:
+    def get_dataset(
+        self,
+        id: DatasetId,
+        storage_class: str | StorageClass | None = None,
+        dimension_records: bool = False,
+        datastore_records: bool = False,
+    ) -> DatasetRef | None:
         """Retrieve a Dataset entry.

         Parameters
         ----------
         id : `DatasetId`
             The unique identifier for the dataset.
         storage_class : `str` or `StorageClass` or `None`
             A storage class to use when creating the returned entry. If given
             it must be compatible with the default storage class.
+        dimension_records : `bool`, optional
+            If `True` the ref will be expanded and contain dimension records.
+        datastore_records : `bool`, optional
+            If `True` the ref will contain associated datastore records.

         Returns
         -------
@@ -828,6 +838,7 @@ def find_dataset(
         collections: str | Sequence[str] | None = None,
         timespan: Timespan | None = None,
         storage_class: str | StorageClass | None = None,
+        dimension_records: bool = False,
         datastore_records: bool = False,
         **kwargs: Any,
     ) -> DatasetRef | None:
@@ -861,6 +872,10 @@ def find_dataset(
         storage_class : `str` or `StorageClass` or `None`
             A storage class to use when creating the returned entry. If given
             it must be compatible with the default storage class.
+        dimension_records : `bool`, optional
+            If `True` the ref will be expanded and contain dimension records.
+        datastore_records : `bool`, optional
+            If `True` the ref will contain associated datastore records.
         **kwargs
             Additional keyword arguments passed to
             `DataCoordinate.standardize` to convert ``dataId`` to a true
diff --git a/python/lsst/daf/butler/direct_butler.py b/python/lsst/daf/butler/direct_butler.py
index bd85516672..6b70ecb1e1 100644
--- a/python/lsst/daf/butler/direct_butler.py
+++ b/python/lsst/daf/butler/direct_butler.py
@@ -1322,11 +1322,20 @@ def get_dataset_type(self, name: str) -> DatasetType:
         return self._registry.getDatasetType(name)

     def get_dataset(
-        self, id: DatasetId, storage_class: str | StorageClass | None = None
+        self,
+        id: DatasetId,
+        storage_class: str | StorageClass | None = None,
+        dimension_records: bool = False,
+        datastore_records: bool = False,
     ) -> DatasetRef | None:
         ref = self._registry.getDataset(id)
-        if ref is not None and storage_class:
-            ref = ref.overrideStorageClass(storage_class)
+        if ref is not None:
+            if dimension_records:
+                ref = ref.expanded(self._registry.expandDataId(ref.dataId, graph=ref.datasetType.dimensions))
+            if storage_class:
+                ref = ref.overrideStorageClass(storage_class)
+            if datastore_records:
+                ref = self._registry.get_datastore_records(ref)
         return ref

     def find_dataset(
@@ -1337,6 +1346,7 @@ def find_dataset(
         collections: str | Sequence[str] | None = None,
         timespan: Timespan | None = None,
         storage_class: str | StorageClass | None = None,
+        dimension_records: bool = False,
         datastore_records: bool = False,
         **kwargs: Any,
     ) -> DatasetRef | None:
@@ -1353,9 +1363,11 @@ def find_dataset(
             data_id,
             collections=collections,
             timespan=timespan,
-            dataset_records=datastore_records,
+            datastore_records=datastore_records,
             **kwargs,
         )
+        if ref is not None and dimension_records:
+            ref = ref.expanded(self._registry.expandDataId(ref.dataId, graph=ref.datasetType.dimensions))
         if ref is not None and storage_class is not None:
             ref = ref.overrideStorageClass(storage_class)
         return ref
diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py
index 8ba367bddf..f66b52e1cb 100644
--- a/python/lsst/daf/butler/remote_butler/_remote_butler.py
+++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py
@@ -220,14 +220,23 @@ def get_dataset_type(self, name: str) -> DatasetType:
         return DatasetType.from_simple(SerializedDatasetType(**response.json()), universe=self.dimensions)

     def get_dataset(
-        self, id: DatasetId, storage_class: str | StorageClass | None = None
+        self,
+        id: DatasetId,
+        storage_class: str | StorageClass | None = None,
+        dimension_records: bool = False,
+        datastore_records: bool = False,
     ) -> DatasetRef | None:
         path = f"dataset/{id}"
         if isinstance(storage_class, StorageClass):
             storage_class_name = storage_class.name
         elif storage_class:
             storage_class_name = storage_class
-        params: dict[str, str] = {}
+        params: dict[str, str | bool] = {
+            "dimension_records": dimension_records,
+            "datastore_records": datastore_records,
+        }
+        if datastore_records:
+            raise ValueError("Datastore records can not yet be returned in client/server butler.")
         if storage_class:
             params["storage_class"] = storage_class_name
         response = self._client.get(self._get_url(path), params=params)
@@ -244,6 +253,7 @@ def find_dataset(
         collections: str | Sequence[str] | None = None,
         timespan: Timespan | None = None,
         storage_class: str | StorageClass | None = None,
+        dimension_records: bool = False,
         datastore_records: bool = False,
         **kwargs: Any,
     ) -> DatasetRef | None:
@@ -258,6 +268,11 @@ def find_dataset(
         # cache to generate list of collection names.
         wildcards = CollectionWildcard.from_expression(collections)

+        if datastore_records:
+            raise ValueError("Datastore records can not yet be returned in client/server butler.")
+        if timespan:
+            raise ValueError("Timespan can not yet be used in butler client/server.")
+
         if isinstance(dataset_type, DatasetType):
             dataset_type = dataset_type.name

@@ -268,6 +283,8 @@ def find_dataset(
             data_id=self._simplify_dataId(data_id, **kwargs),
             collections=wildcards.strings,
             storage_class=storage_class,
+            dimension_records=dimension_records,
+            datastore_records=datastore_records,
         )

         path = f"find_dataset/{dataset_type}"
diff --git a/python/lsst/daf/butler/remote_butler/server/_server.py b/python/lsst/daf/butler/remote_butler/server/_server.py
index 92798b2628..18f2c5e997 100644
--- a/python/lsst/daf/butler/remote_butler/server/_server.py
+++ b/python/lsst/daf/butler/remote_butler/server/_server.py
@@ -117,11 +117,20 @@ def get_dataset_type(
     response_model_exclude_none=True,
 )
 def get_dataset(
-    id: uuid.UUID, storage_class: str | None = None, factory: Factory = Depends(factory_dependency)
+    id: uuid.UUID,
+    storage_class: str | None = None,
+    dimension_records: bool = False,
+    datastore_records: bool = False,
+    factory: Factory = Depends(factory_dependency),
 ) -> SerializedDatasetRef | None:
     """Return a single dataset reference."""
     butler = factory.create_butler()
-    ref = butler.get_dataset(id, storage_class=storage_class)
+    ref = butler.get_dataset(
+        id,
+        storage_class=storage_class,
+        dimension_records=dimension_records,
+        datastore_records=datastore_records,
+    )
     if ref is not None:
         return ref.to_simple()
     # This could raise a 404 since id is not found. The standard implementation
@@ -153,6 +162,13 @@ def find_dataset(
     butler = factory.create_butler()

     ref = butler.find_dataset(
-        dataset_type, None, collections=collection_query, storage_class=query.storage_class, **data_id
+        dataset_type,
+        None,
+        collections=collection_query,
+        storage_class=query.storage_class,
+        timespan=None,
+        dimension_records=query.dimension_records,
+        datastore_records=query.datastore_records,
+        **data_id,
     )
     return ref.to_simple() if ref else None
diff --git a/python/lsst/daf/butler/remote_butler/server/_server_models.py b/python/lsst/daf/butler/remote_butler/server/_server_models.py
index d9200976b1..627d09abb3 100644
--- a/python/lsst/daf/butler/remote_butler/server/_server_models.py
+++ b/python/lsst/daf/butler/remote_butler/server/_server_models.py
@@ -38,3 +38,5 @@ class FindDatasetModel(_BaseModelCompat):
     data_id: SerializedDataCoordinate
     collections: list[str]
     storage_class: str | None
+    dimension_records: bool = False
+    datastore_records: bool = False
diff --git a/tests/test_server.py b/tests/test_server.py
index 4622a5a203..3adb794726 100644
--- a/tests/test_server.py
+++ b/tests/test_server.py
@@ -113,6 +113,7 @@ def test_find_dataset(self):
         ref = self.butler.find_dataset("bias", collections="imported_g", detector=1, instrument="Cam1")
         self.assertIsInstance(ref, DatasetRef)
         self.assertEqual(ref.id, uuid.UUID("e15ab039-bc8b-4135-87c5-90902a7c0b22"))
+        self.assertFalse(ref.dataId.hasRecords())

         # Try again with variation of parameters.
         ref_new = self.butler.find_dataset(
@@ -120,8 +121,10 @@ def test_find_dataset(self):
             {"detector": 1},
             collections="imported_g",
             instrument="Cam1",
+            dimension_records=True,
         )
         self.assertEqual(ref_new, ref)
+        self.assertTrue(ref_new.dataId.hasRecords())

         ref_new = self.butler.find_dataset(
             ref.datasetType,
@@ -143,6 +146,11 @@ def test_find_dataset(self):
         )
         self.assertEqual(ref2, ref3)

+        # Try expanded refs.
+        self.assertFalse(ref.dataId.hasRecords())
+        expanded = self.butler.get_dataset(ref.id, dimension_records=True)
+        self.assertTrue(expanded.dataId.hasRecords())
+
         # The test datasets are all Exposure so storage class conversion
         # can not be tested until we fix that. For now at least test the
         # code paths.

From 17bc74eb5fa735d853b7aac9f91f05f2c0a534e9 Mon Sep 17 00:00:00 2001
From: Tim Jenness
Date: Thu, 2 Nov 2023 14:54:28 -0700
Subject: [PATCH 13/13] Add experimental exception handling in client/server

Demonstrate that MissingDatasetTypeError can be caught in the server and
raised again in the client.
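
For example, a client can then do the following (a sketch only: the
server URI and the unregistered dataset type name below are
placeholders, not part of this change):

    from lsst.daf.butler import Butler, MissingDatasetTypeError

    # Hypothetical RemoteButler repository URI.
    butler = Butler.from_config("https://example.org/api/butler")
    try:
        butler.get_dataset_type("not_a_registered_type")
    except MissingDatasetTypeError as err:
        # The 404 payload produced by the server's exception handler
        # is converted back into the original exception type here.
        print(f"Server reported: {err}")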
---
 .../daf/butler/remote_butler/_remote_butler.py |  6 +++++-
 .../daf/butler/remote_butler/server/_server.py | 15 ++++++++++++++-
 tests/test_server.py                           |  5 ++++-
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py
index f66b52e1cb..20cff12322 100644
--- a/python/lsst/daf/butler/remote_butler/_remote_butler.py
+++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py
@@ -49,7 +49,7 @@
 from .._timespan import Timespan
 from ..datastore import DatasetRefURIs
 from ..dimensions import DataCoordinate, DataId, DimensionConfig, DimensionUniverse, SerializedDataCoordinate
-from ..registry import NoDefaultCollectionError, Registry, RegistryDefaults
+from ..registry import MissingDatasetTypeError, NoDefaultCollectionError, Registry, RegistryDefaults
 from ..registry.wildcards import CollectionWildcard
 from ..transfers import RepoExportContext
 from ._config import RemoteButlerConfigModel
@@ -216,6 +216,10 @@ def get_dataset_type(self, name: str) -> DatasetType:
         # and only go to the server if the dataset type is not known.
         path = f"dataset_type/{name}"
         response = self._client.get(self._get_url(path))
+        if response.status_code != httpx.codes.OK:
+            content = response.json()
+            if content.get("exception") == "MissingDatasetTypeError":
+                raise MissingDatasetTypeError(content["detail"])
         response.raise_for_status()
         return DatasetType.from_simple(SerializedDatasetType(**response.json()), universe=self.dimensions)

diff --git a/python/lsst/daf/butler/remote_butler/server/_server.py b/python/lsst/daf/butler/remote_butler/server/_server.py
index 18f2c5e997..a0c84555bd 100644
--- a/python/lsst/daf/butler/remote_butler/server/_server.py
+++ b/python/lsst/daf/butler/remote_butler/server/_server.py
@@ -34,11 +34,13 @@
 from functools import cache
 from typing import Any

-from fastapi import Depends, FastAPI
+from fastapi import Depends, FastAPI, Request
 from fastapi.middleware.gzip import GZipMiddleware
+from fastapi.responses import JSONResponse
 from lsst.daf.butler import (
     Butler,
     DataCoordinate,
+    MissingDatasetTypeError,
     SerializedDataCoordinate,
     SerializedDatasetRef,
     SerializedDatasetType,
@@ -55,6 +57,17 @@
 app.add_middleware(GZipMiddleware, minimum_size=1000)


+@app.exception_handler(MissingDatasetTypeError)
+def missing_dataset_type_exception_handler(request: Request, exc: MissingDatasetTypeError) -> JSONResponse:
+    # Remove the double quotes around the string form. These confuse
+    # the JSON serialization when single quotes are in the message.
+ message = str(exc).strip('"') + return JSONResponse( + status_code=404, + content={"detail": message, "exception": "MissingDatasetTypeError"}, + ) + + @cache def _make_global_butler() -> Butler: return Butler.from_config(BUTLER_ROOT) diff --git a/tests/test_server.py b/tests/test_server.py index 3adb794726..a7661023fd 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -38,7 +38,7 @@ TestClient = None app = None -from lsst.daf.butler import Butler, DataCoordinate, DatasetRef, StorageClassFactory +from lsst.daf.butler import Butler, DataCoordinate, DatasetRef, MissingDatasetTypeError, StorageClassFactory from lsst.daf.butler.tests import DatastoreMock from lsst.daf.butler.tests.utils import MetricTestRepo, makeTestTempDir, removeTestTempDir @@ -107,6 +107,9 @@ def test_get_dataset_type(self): bias_type = self.butler.get_dataset_type("bias") self.assertEqual(bias_type.name, "bias") + with self.assertRaises(MissingDatasetTypeError): + self.butler.get_dataset_type("not_bias") + def test_find_dataset(self): storage_class = self.storageClassFactory.getStorageClass("Exposure")
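
Taken together, patches 09 and 12 let a caller resolve a ref with a
storage class override and expanded dimension records in a single call.
A minimal sketch (the repository path "my_repo" is a placeholder; the
data ID values follow the test repository used above):

    from lsst.daf.butler import Butler

    butler = Butler.from_config("my_repo")

    # Override the storage class and attach dimension records in one
    # lookup rather than post-processing the returned ref.
    ref = butler.find_dataset(
        "bias",
        collections="imported_g",
        instrument="Cam1",
        detector=1,
        storage_class="Exposure",
        dimension_records=True,
    )
    if ref is not None:
        assert ref.dataId.hasRecords()
        # The same keyword arguments apply when resolving by UUID.
        same = butler.get_dataset(ref.id, dimension_records=True)
        assert same is not None and same.id == ref.id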