lsst · timj · Dec 6, 2024 · Nov 1, 2024 · Nov 8, 2024 · Nov 26, 2024
diff --git a/doc/changes/DM-47325.feature.rst b/doc/changes/DM-47325.feature.rst
@@ -0,0 +1,5 @@
+Added two new APIs for handling Butler dataset URIs.
+``Butler.parse_dataset_uri`` parses a URI and returns the butler repository label and associated UUID.
+``Butler.get_dataset_from_uri`` will parse a URI and attempt to retrieve the ``DatasetRef``.
+URIs should be in the form of IVOA identifiers as described in `DMTN-302 <https://dmtn-302.lsst.io>`_.
+Deprecated ``butler://`` URIs are still supported but should not be used in new systems.
diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py
@@ -29,6 +29,9 @@
 
 __all__ = ["Butler"]
 
+import dataclasses
+import urllib.parse
+import uuid
 from abc import abstractmethod
 from collections.abc import Collection, Iterable, Iterator, Mapping, Sequence
 from contextlib import AbstractContextManager
@@ -60,6 +63,7 @@
     from ._dataset_type import DatasetType
     from ._deferredDatasetHandle import DeferredDatasetHandle
     from ._file_dataset import FileDataset
+    from ._labeled_butler_factory import LabeledButlerFactoryProtocol
     from ._storage_class import StorageClass
     from ._timespan import Timespan
     from .datastore import DatasetRefURIs
@@ -71,6 +75,19 @@
 _LOG = getLogger(__name__)
 
 
+@dataclasses.dataclass
+class ParsedButlerDatasetURI:
+    label: str
+    dataset_id: uuid.UUID
+    uri: str
+
+
+@dataclasses.dataclass
+class SpecificButlerDataset:
+    butler: Butler
+    dataset: DatasetRef | None
+
+
 class Butler(LimitedButler):  # numpydoc ignore=PR02
     """Interface for data butler and factory for Butler instances.
 
@@ -526,6 +543,105 @@ def get_known_repos(cls) -> set[str]:
         """
         return ButlerRepoIndex.get_known_repos()
 
+    @classmethod
+    def parse_dataset_uri(cls, uri: str) -> ParsedButlerDatasetURI:
+        """Extract the butler label and dataset ID from a dataset URI.
+
+        Parameters
+        ----------
+        uri : `str`
+            The dataset URI to parse.
+
+        Returns
+        -------
+        parsed : `ParsedButlerDatasetURI`
+            The label associated with the butler repository from which this
+            dataset originates and the ID of the dataset.
+
+        Notes
+        -----
+        Supports dataset URIs of the forms
+        ``ivo://org.rubinobs/usdac/dr1?repo=butler_label&id=UUID`` (see
+        DMTN-302) and ``butler://butler_label/UUID``. The ``butler`` URI is
+        deprecated and cannot include ``/`` in the label string. ``ivo`` URIs
+        can include anything supported by the `Butler` constructor, including
+        paths to repositories and alias labels.
+
+            ivo://org.rubinobs/dr1?repo=/repo/main&id=UUID
+
+        will return a label of ``/repo/main``.
+
+        This method does not attempt to check that the dataset exists in the
+        labeled butler.
+
+        Since the IVOID can be issued by any publisher to represent a Butler
+        dataset there is no validation of the path or netloc component of the
+        URI. The only requirement is that there are ``id`` and ``repo`` keys
+        in the ``ivo`` URI query component.
+        """
+        parsed = urllib.parse.urlparse(uri)
+        parsed_scheme = parsed.scheme.lower()
+        if parsed_scheme == "ivo":
+            # Do not validate the netloc or the path values.
+            qs = urllib.parse.parse_qs(parsed.query)
+            if "repo" not in qs or "id" not in qs:
+                raise ValueError(f"Missing 'repo' and/or 'id' query parameters in IVOID {uri}.")
+            if len(qs["repo"]) != 1 or len(qs["id"]) != 1:
+                raise ValueError(f"Butler IVOID only supports a single value of repo and id, got {uri}")
+            label = qs["repo"][0]
+            id_ = qs["id"][0]
+        elif parsed_scheme == "butler":
+            label = parsed.netloc  # Butler label is case sensitive.
+            # Need to strip the leading /.
+            id_ = parsed.path[1:]
+        else:
+            raise ValueError(f"Unrecognized URI scheme: {uri!r}")
+        # Strip trailing/leading whitespace from label.
+        label = label.strip()
+        if not label:
+            raise ValueError(f"No butler repository label found in uri {uri!r}")
+        try:
+            dataset_id = uuid.UUID(hex=id_)
+        except Exception as e:
+            e.add_note(f"Error extracting dataset ID from uri {uri!r} with dataset ID string {id_!r}")
+            raise
+
+        return ParsedButlerDatasetURI(label=label, dataset_id=dataset_id, uri=uri)
+
+    @classmethod
+    def get_dataset_from_uri(
+        cls, uri: str, factory: LabeledButlerFactoryProtocol | None = None
+    ) -> SpecificButlerDataset:
+        """Get the dataset associated with the given dataset URI.
+
+        Parameters
+        ----------
+        uri : `str`
+            The URI associated with a dataset.
+        factory : `LabeledButlerFactoryProtocol` or `None`, optional
+            Bound factory function that will be given the butler label
+            and return a `Butler`. If it is not provided, or raises
+            `KeyError`, the label is passed to `Butler.from_config`.
+
+        Returns
+        -------
+        result : `SpecificButlerDataset`
+            The butler associated with this URI and the dataset itself.
+            The dataset can be `None` if the UUID is valid but the dataset
+            is not known to this butler.
+        """
+        parsed = cls.parse_dataset_uri(uri)
+        butler: Butler | None = None
+        if factory is not None:
+            # If the label is not recognized, it might be a path.
+            try:
+                butler = factory(parsed.label)
+            except KeyError:
+                pass
+        if butler is None:
+            butler = cls.from_config(parsed.label)
+        return SpecificButlerDataset(butler=butler, dataset=butler.get_dataset(parsed.dataset_id))
+
     @abstractmethod
     def _caching_context(self) -> AbstractContextManager[None]:
         """Context manager that enables caching."""

diff --git a/python/lsst/daf/butler/_labeled_butler_factory.py b/python/lsst/daf/butler/_labeled_butler_factory.py
@@ -25,9 +25,10 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-__all__ = ("LabeledButlerFactory",)
+__all__ = ("LabeledButlerFactory", "LabeledButlerFactoryProtocol")
 
 from collections.abc import Callable, Mapping
+from typing import Protocol
 
 from lsst.resources import ResourcePathExpression
 
@@ -42,6 +43,12 @@
 instance."""
 
 
+class LabeledButlerFactoryProtocol(Protocol):
+    """Callable to retrieve a butler from a label."""
+
+    def __call__(self, label: str) -> Butler: ...
+
+
 class LabeledButlerFactory:
     """Factory for efficiently instantiating Butler instances from the
     repository index file.  This is intended for use from long-lived services
@@ -83,6 +90,27 @@ def __init__(self, repositories: Mapping[str, str] | None = None) -> None:
         # This may be overridden by unit tests.
         self._preload_direct_butler_cache = True
 
+    def bind(self, access_token: str | None) -> LabeledButlerFactoryProtocol:
+        """Create a callable factory function for generating Butler instances
+        without needing to specify access tokens again.
+
+        Parameters
+        ----------
+        access_token : `str` or `None`
+            An optional access token to use for authentication with the Butler.
+
+        Returns
+        -------
+        bound : `LabeledButlerFactoryProtocol`
+            A callable that takes a label as input and returns a Butler
+            instance.
+        """
+
+        def create(label: str) -> Butler:
+            return self.create_butler(label=label, access_token=access_token)
+
+        return create
+
     def create_butler(self, *, label: str, access_token: str | None) -> Butler:
         """Create a Butler instance.
 

diff --git a/python/lsst_daf_butler.dist-info/METADATA b/python/lsst_daf_butler.dist-info/METADATA
@@ -0,0 +1,3 @@
+Metadata-Version: 1.0
+Name: lsst-daf-butler
+Version: g57cedf6216+76f9c43fa5
diff --git a/tests/test_simpleButler.py b/tests/test_simpleButler.py
@@ -48,13 +48,14 @@
     DatasetId,
     DatasetRef,
     DatasetType,
+    LabeledButlerFactory,
     StorageClass,
     Timespan,
 )
 from lsst.daf.butler.datastore.file_templates import FileTemplate
 from lsst.daf.butler.registry import RegistryConfig, RegistryDefaults, _RegistryFactory
 from lsst.daf.butler.tests import DatastoreMock
-from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir
+from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, mock_env, removeTestTempDir
 
 try:
     from lsst.daf.butler.tests.server import create_test_server
@@ -882,10 +883,75 @@ def makeButler(self, writeable: bool = False) -> Butler:
         registryConfig = RegistryConfig(config.get("registry"))
         _RegistryFactory(registryConfig).create_from_config()
 
+        # Write the YAML file so that some tests can recreate butler from it.
+        config.dumpToUri(os.path.join(self.root, "butler.yaml"))
         butler = Butler.from_config(config, writeable=writeable)
         DatastoreMock.apply(butler)
         return butler
 
+    def test_dataset_uris(self):
+        """Test that dataset URIs can be parsed and retrieved."""
+        butler = self.makeButler(writeable=True)
+        butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "base.yaml"))
+        butler.import_(filename=os.path.join(TESTDIR, "data", "registry", self.datasetsImportFile))
+
+        butler.registry.defaults = RegistryDefaults(collections=["imported_g"])
+        ref = butler.find_dataset("flat", detector=2, physical_filter="Cam1-G")
+        self.assertIsInstance(ref, DatasetRef)
+
+        # Get the butler root for the URI.
+        config_dir = butler._config["root"]
+
+        # Read it via a repo label and a path.
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml") as index_file:
+            label = "test_repo"
+            index_file.write(f"{label}: {config_dir}\n")
+            index_file.flush()
+            with mock_env({"DAF_BUTLER_REPOSITORY_INDEX": index_file.name}):
+                butler_factory = LabeledButlerFactory()
+                factory = butler_factory.bind(access_token=None)
+
+                for dataset_uri in (
+                    f"ivo://org.rubinobs/usdac/test?repo={config_dir}&id={ref.id}",
+                    f"ivo://org.rubinobs/ukdac/lsst-dr1?repo={config_dir}%2Fbutler.yaml&id={ref.id}",
+                    f"butler://{label}/{ref.id}",
+                    f"ivo://org.rubinobs/usdac/lsst-dp1?repo={label}&id={ref.id}",
+                ):
+                    result = Butler.get_dataset_from_uri(dataset_uri)
+                    self.assertEqual(result.dataset, ref)
+                    # The returned butler needs to have the datastore mocked.
+                    DatastoreMock.apply(result.butler)
+                    dataset_id, _ = result.butler.get(result.dataset)
+                    self.assertEqual(dataset_id, ref.id)
+
+                    factory_result = Butler.get_dataset_from_uri(dataset_uri, factory=factory)
+                    self.assertEqual(factory_result.dataset, ref)
+                    # The returned butler needs to have the datastore mocked.
+                    DatastoreMock.apply(factory_result.butler)
+                    dataset_id, _ = factory_result.butler.get(factory_result.dataset)
+                    self.assertEqual(dataset_id, ref.id)
+
+                # Non existent dataset.
+                missing_id = str(ref.id).replace("2", "3")
+                result = Butler.get_dataset_from_uri(f"butler://{label}/{missing_id}")
+                self.assertIsNone(result.dataset)
+
+        # Test some failure modes.
+        for dataset_uri in (
+            "butler://label/1234",  # Bad UUID.
+            "butler://1234",  # No UUID.
+            "butler:///1234",  # No label.
+            "ivo://rubin/1234",  # No query part and bad UUID and no label.
+            "ivo://rubin/datasets/dr1/82d79caa-0823-4300-9874-67b737367ee0",  # No query part.
+            "ivo://org.rubinobs/datasets?repo=dr1&id=1234",  # Bad UUID.
+            "ivo://org.rubinobs/butler?release=dr1&id=82d79caa-0823-4300-9874-67b737367ee0",  # No repo key.
+            "ivo://org.rubinobs/butler?repo=dr1&repo=dr2&id=82d79caa-0823-4300-9874-67b737367ee0",  # 2 vals.
+            "ivo://org.rubinobs/something?repo=%20&id=82d79caa-0823-4300-9874-67b737367ee0",  # no repo.
+            "https://something.edu/1234",  # Wrong scheme.
+        ):
+            with self.assertRaises(ValueError):
+                Butler.parse_dataset_uri(dataset_uri)
+
 
 class NameKeyCollectionManagerDirectSimpleButlerTestCase(DirectSimpleButlerTestCase, unittest.TestCase):
     """Run tests against DirectButler implementation using the