Use data_id/dataset_type in find_dataset and simplify collections
Underscore-separated (snake_case) names are the preferred way to add new APIs now.

The collections parameter should not support wildcards, so explicitly
declare it as a sequence of str.
timj committed Oct 31, 2023
1 parent 2b6ba5a commit b22a5f3
Showing 6 changed files with 26 additions and 29 deletions.
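As a quick illustration of the renamed API, here is a minimal sketch of a find_dataset call after this change (the repository path, dataset type name, data ID values, and collection name are hypothetical, not taken from this commit):

from lsst.daf.butler import Butler

# Hypothetical repository; only the keyword names are the point here.
butler = Butler("/path/to/repo")
ref = butler.find_dataset(
    "calexp",
    data_id={"instrument": "HSC", "visit": 903334, "detector": 20},
    collections=["HSC/runs/RC2"],
)
if ref is not None:
    print(ref.dataId, ref.run)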
22 changes: 10 additions & 12 deletions python/lsst/daf/butler/_butler.py
@@ -51,7 +51,7 @@
from ._timespan import Timespan
from .datastore import DatasetRefURIs, Datastore
from .dimensions import DataId, DimensionConfig
- from .registry import CollectionArgType, Registry, RegistryConfig, _RegistryFactory
+ from .registry import Registry, RegistryConfig, _RegistryFactory
from .repo_relocation import BUTLER_ROOT_TAG
from .transfers import RepoExportContext

@@ -819,10 +819,10 @@ def get_dataset(self, id: DatasetId) -> DatasetRef | None:
@abstractmethod
def find_dataset(
self,
- datasetType: DatasetType | str,
- dataId: DataId | None = None,
+ dataset_type: DatasetType | str,
+ data_id: DataId | None = None,
*,
- collections: CollectionArgType | None = None,
+ collections: str | Sequence[str] | None = None,
timespan: Timespan | None = None,
datastore_records: bool = False,
**kwargs: Any,
@@ -836,18 +836,16 @@ def find_dataset(
Parameters
----------
- datasetType : `DatasetType` or `str`
+ dataset_type : `DatasetType` or `str`
A `DatasetType` or the name of one. If this is a `DatasetType`
instance, its storage class will be respected and propagated to
the output, even if it differs from the dataset type definition
in the registry, as long as the storage classes are convertible.
- dataId : `dict` or `DataCoordinate`, optional
+ data_id : `dict` or `DataCoordinate`, optional
A `dict`-like object containing the `Dimension` links that identify
the dataset within a collection.
- collections : collection expression, optional
- An expression that fully or partially identifies the collections to
- search for the dataset; see
- :ref:`daf_butler_collection_expressions` for more information.
+ collections : `str` or `list` [`str`], optional
+ An ordered list of collections to search for the dataset.
Defaults to ``self.defaults.collections``.
timespan : `Timespan`, optional
A timespan that the validity range of the dataset must overlap.
@@ -871,7 +869,7 @@ def find_dataset(
``self.collections`` is `None`.
LookupError
Raised if one or more data ID keys are missing.
- lsst.daf.butler.registry.MissingDatasetTypeError
+ lsst.daf.butler.MissingDatasetTypeError
Raised if the dataset type does not exist.
lsst.daf.butler.MissingCollectionError
Raised if any of ``collections`` does not exist in the registry.
@@ -889,7 +887,7 @@ def find_dataset(
never changes the behavior.
This method handles component dataset types automatically, though most
- other registry operations do not.
+ other query operations do not.
"""
raise NotImplementedError()

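To ground the timespan parameter documented above, here is a hedged sketch of a calibration lookup (dataset type, data ID, collection, and time are illustrative; it assumes the lsst.daf.butler.Timespan and astropy.time.Time APIs and the butler instance from the earlier sketch):

from astropy.time import Time
from lsst.daf.butler import Timespan

# Hypothetical: find the flat whose validity range overlaps the observation time
# in a CALIBRATION collection.
obs_time = Time("2023-10-31T05:00:00", scale="tai")
ref = butler.find_dataset(
    "flat",
    data_id={"instrument": "HSC", "detector": 20, "physical_filter": "HSC-R"},
    collections=["HSC/calib"],
    timespan=Timespan(obs_time, obs_time),
)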
11 changes: 5 additions & 6 deletions python/lsst/daf/butler/direct_butler.py
@@ -76,7 +76,6 @@
)
from .progress import Progress
from .registry import (
- CollectionArgType,
CollectionType,
ConflictingDefinitionError,
DataIdError,
@@ -1327,17 +1326,17 @@ def get_dataset(self, id: DatasetId) -> DatasetRef | None:

def find_dataset(
self,
- datasetType: DatasetType | str,
- dataId: DataId | None = None,
+ dataset_type: DatasetType | str,
+ data_id: DataId | None = None,
*,
- collections: CollectionArgType | None = None,
+ collections: str | Sequence[str] | None = None,
timespan: Timespan | None = None,
datastore_records: bool = False,
**kwargs: Any,
) -> DatasetRef | None:
return self._registry.findDataset(
- datasetType,
- dataId,
+ dataset_type,
+ data_id,
collections=collections,
timespan=timespan,
dataset_records=datastore_records,
16 changes: 8 additions & 8 deletions python/lsst/daf/butler/remote_butler/_remote_butler.py
@@ -49,7 +49,7 @@
from .._timespan import Timespan
from ..datastore import DatasetRefURIs
from ..dimensions import DataCoordinate, DataId, DimensionConfig, DimensionUniverse, SerializedDataCoordinate
- from ..registry import CollectionArgType, NoDefaultCollectionError, Registry, RegistryDefaults
+ from ..registry import NoDefaultCollectionError, Registry, RegistryDefaults
from ..registry.wildcards import CollectionWildcard
from ..transfers import RepoExportContext
from ._config import RemoteButlerConfigModel
Expand Down Expand Up @@ -229,10 +229,10 @@ def get_dataset(self, id: DatasetId) -> DatasetRef | None:

def find_dataset(
self,
- datasetType: DatasetType | str,
- dataId: DataId | None = None,
+ dataset_type: DatasetType | str,
+ data_id: DataId | None = None,
*,
- collections: CollectionArgType | None = None,
+ collections: str | Sequence[str] | None = None,
timespan: Timespan | None = None,
datastore_records: bool = False,
**kwargs: Any,
@@ -248,14 +248,14 @@
# cache to generate list of collection names.
wildcards = CollectionWildcard.from_expression(collections)

- if isinstance(datasetType, DatasetType):
-     datasetType = datasetType.name
+ if isinstance(dataset_type, DatasetType):
+     dataset_type = dataset_type.name

query = FindDatasetModel(
- dataId=self._simplify_dataId(dataId, **kwargs), collections=wildcards.strings
+ data_id=self._simplify_dataId(data_id, **kwargs), collections=wildcards.strings
)

path = f"find_dataset/{datasetType}"
path = f"find_dataset/{dataset_type}"
response = self._client.post(
self._get_url(path), json=query.model_dump(mode="json", exclude_unset=True)
)
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/remote_butler/server/_server.py
@@ -146,6 +146,6 @@ def find_dataset(

butler = factory.create_butler()
ref = butler.find_dataset(
- dataset_type, dataId=unpack_dataId(butler, query.dataId), collections=collection_query
+ dataset_type, data_id=unpack_dataId(butler, query.data_id), collections=collection_query
)
return ref.to_simple() if ref else None
@@ -35,5 +35,5 @@


class FindDatasetModel(_BaseModelCompat):
- dataId: SerializedDataCoordinate
+ data_id: SerializedDataCoordinate
collections: list[str]
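For orientation, a simplified, self-contained stand-in for this request model, assuming _BaseModelCompat behaves like a Pydantic v2 BaseModel (as the model_dump(mode="json", exclude_unset=True) call in _remote_butler.py suggests); data_id is reduced to a plain dict purely to show the JSON shape:

from pydantic import BaseModel

class FindDatasetQuery(BaseModel):  # hypothetical stand-in, not the real model
    data_id: dict
    collections: list[str]

query = FindDatasetQuery(
    data_id={"instrument": "HSC", "visit": 903334, "detector": 20},
    collections=["HSC/runs/RC2"],
)
# Body POSTed to find_dataset/<dataset_type>:
print(query.model_dump(mode="json", exclude_unset=True))
# {'data_id': {'instrument': 'HSC', 'visit': 903334, 'detector': 20}, 'collections': ['HSC/runs/RC2']}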
2 changes: 1 addition & 1 deletion tests/test_butler.py
@@ -928,7 +928,7 @@ def testIngest(self) -> None:
datasets[0].refs = [
cast(
DatasetRef,
- butler.find_dataset(ref.datasetType, dataId=ref.dataId, collections=ref.run),
+ butler.find_dataset(ref.datasetType, data_id=ref.dataId, collections=ref.run),
)
for ref in datasets[0].refs
]
