Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-47325: Add API for parsing butler dataset URIs (butler and ivo) #1113

Merged
merged 10 commits into from
Dec 6, 2024
5 changes: 5 additions & 0 deletions doc/changes/DM-47325.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Added two new APIs for handling Butler dataset URIs.
``Butler.parse_dataset_uri`` parses a URI and returns the butler repository label and associated UUID.
``Butler.get_dataset_from_uri`` will parse a URI and attempt to retrieve the ``DatasetRef``.
URIs should take the form of IVOA identifiers as described in `DMTN-302 <https://dmtn-302.lsst.io>`_.
Deprecated ``butler://`` URIs are still supported but should not be used in new systems.
116 changes: 116 additions & 0 deletions python/lsst/daf/butler/_butler.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@

__all__ = ["Butler"]

import dataclasses
import urllib.parse
import uuid
from abc import abstractmethod
from collections.abc import Collection, Iterable, Iterator, Mapping, Sequence
from contextlib import AbstractContextManager
Expand Down Expand Up @@ -60,6 +63,7 @@
from ._dataset_type import DatasetType
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._file_dataset import FileDataset
from ._labeled_butler_factory import LabeledButlerFactoryProtocol
from ._storage_class import StorageClass
from ._timespan import Timespan
from .datastore import DatasetRefURIs
Expand All @@ -71,6 +75,19 @@
_LOG = getLogger(__name__)


@dataclasses.dataclass
class ParsedButlerDatasetURI:
    """Butler repository label and dataset ID extracted from a dataset URI."""

    # Label of the butler repository named by the URI.
    label: str
    # UUID of the dataset named by the URI.
    dataset_id: uuid.UUID
    # The URI string that was parsed.
    uri: str


@dataclasses.dataclass
class SpecificButlerDataset:
    """A butler instance paired with the result of a dataset lookup in it."""

    # Butler that was used to look up the dataset.
    butler: Butler
    # The dataset reference, or `None` if the butler does not know the dataset.
    dataset: DatasetRef | None


class Butler(LimitedButler): # numpydoc ignore=PR02
"""Interface for data butler and factory for Butler instances.

Expand Down Expand Up @@ -526,6 +543,105 @@ def get_known_repos(cls) -> set[str]:
"""
return ButlerRepoIndex.get_known_repos()

@classmethod
def parse_dataset_uri(cls, uri: str) -> ParsedButlerDatasetURI:
"""Extract the butler label and dataset ID from a dataset URI.

Parameters
----------
uri : `str`
The dataset URI to parse.

Returns
-------
parsed : `ParsedButlerDatasetURI`
The label associated with the butler repository from which this
dataset originates and the ID of the dataset.

Notes
-----
Supports dataset URIs of the forms
``ivo://org.rubinobs/usdac/dr1?repo=butler_label&id=UUID`` (see
DMTN-302) and ``butler://butler_label/UUID``. The ``butler`` URI is
deprecated and can not include ``/`` in the label string. ``ivo`` URIs
can include anything supported by the `Butler` constructor, including
paths to repositories and alias labels.

ivo://org.rubinobs/dr1?repo=/repo/main&id=UUID

will return a label of ``/repo/main``.

This method does not attempt to check that the dataset exists in the
labeled butler.

Since the IVOID can be issued by any publisher to represent a Butler
dataset there is no validation of the path or netloc component of the
URI. The only requirement is that there are ``id`` and ``repo`` keys
in the ``ivo`` URI query component.
"""
parsed = urllib.parse.urlparse(uri)
parsed_scheme = parsed.scheme.lower()
if parsed_scheme == "ivo":
# Do not validate the netloc or the path values.
qs = urllib.parse.parse_qs(parsed.query)
if "repo" not in qs or "id" not in qs:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure what is expected of query param keys in terms of case sensitivity. But I think no harm in treating those as case-insensitive as well?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In theory we are generating these IVO IDs so I don't really want the added complication of converting the dict to a dict with case insensitive keys (I have a recollection of using such a special dict at some point but I'm not sure where it was).

raise ValueError(f"Missing 'repo' and/or 'id' query parameters in IVOID {uri}.")
if len(qs["repo"]) != 1 or len(qs["id"]) != 1:
raise ValueError(f"Butler IVOID only supports a single value of repo and id, got {uri}")
label = qs["repo"][0]
id_ = qs["id"][0]
elif parsed_scheme == "butler":
label = parsed.netloc # Butler label is case sensitive.
# Need to strip the leading /.
id_ = parsed.path[1:]
else:
raise ValueError(f"Unrecognized URI scheme: {uri!r}")
# Strip trailing/leading whitespace from label.
label = label.strip()
if not label:
raise ValueError(f"No butler repository label found in uri {uri!r}")
try:
dataset_id = uuid.UUID(hex=id_)
except Exception as e:
e.add_note(f"Error extracting dataset ID from uri {uri!r} with dataset ID string {id_!r}")
raise

return ParsedButlerDatasetURI(label=label, dataset_id=dataset_id, uri=uri)

@classmethod
def get_dataset_from_uri(
    cls, uri: str, factory: LabeledButlerFactoryProtocol | None = None
) -> SpecificButlerDataset:
    """Look up the dataset that a dataset URI refers to.

    Parameters
    ----------
    uri : `str`
        The URI associated with a dataset.
    factory : `LabeledButlerFactoryProtocol` or `None`, optional
        Bound factory function that is called with the butler label and
        returns a `Butler`. If it is not provided, or if it raises
        `KeyError` for the label, the label is handed directly to
        ``from_config``.

    Returns
    -------
    result : `SpecificButlerDataset`
        The butler associated with this URI together with the dataset.
        The dataset is `None` when the UUID is valid but the dataset is
        not known to this butler.
    """
    location = cls.parse_dataset_uri(uri)
    label = location.label

    repo_butler: Butler | None
    if factory is None:
        repo_butler = None
    else:
        try:
            repo_butler = factory(label)
        except KeyError:
            # The factory does not recognize the label; it might be a path,
            # so fall through to constructing the butler directly.
            repo_butler = None

    if repo_butler is None:
        repo_butler = cls.from_config(label)

    ref = repo_butler.get_dataset(location.dataset_id)
    return SpecificButlerDataset(butler=repo_butler, dataset=ref)

@abstractmethod
def _caching_context(self) -> AbstractContextManager[None]:
"""Context manager that enables caching."""
Expand Down
30 changes: 29 additions & 1 deletion python/lsst/daf/butler/_labeled_butler_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__all__ = ("LabeledButlerFactory",)
__all__ = ("LabeledButlerFactory", "LabeledButlerFactoryProtocol")

from collections.abc import Callable, Mapping
from typing import Protocol

from lsst.resources import ResourcePathExpression

Expand All @@ -42,6 +43,12 @@
instance."""


class LabeledButlerFactoryProtocol(Protocol):
    """Callable to retrieve a butler from a label.

    An implementation is called with a single repository label string and
    returns a `Butler`.
    """

    def __call__(self, label: str) -> Butler: ...


class LabeledButlerFactory:
"""Factory for efficiently instantiating Butler instances from the
repository index file. This is intended for use from long-lived services
Expand Down Expand Up @@ -83,6 +90,27 @@ def __init__(self, repositories: Mapping[str, str] | None = None) -> None:
# This may be overridden by unit tests.
self._preload_direct_butler_cache = True

def bind(self, access_token: str | None) -> LabeledButlerFactoryProtocol:
    """Return a factory function with the access token already supplied.

    The returned callable generates Butler instances from a label alone,
    without needing to specify access tokens again.

    Parameters
    ----------
    access_token : `str` or `None`
        An optional access token to use for authentication with the Butler.

    Returns
    -------
    bound : `LabeledButlerFactoryProtocol`
        A callable that takes a label as input and returns a Butler
        instance.
    """

    def bound_factory(label: str) -> Butler:
        # Delegate to create_butler, passing along the captured token.
        butler = self.create_butler(label=label, access_token=access_token)
        return butler

    return bound_factory

def create_butler(self, *, label: str, access_token: str | None) -> Butler:
"""Create a Butler instance.

Expand Down
3 changes: 3 additions & 0 deletions python/lsst_daf_butler.dist-info/METADATA
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Metadata-Version: 1.0
Name: lsst-daf-butler
Version: g57cedf6216+76f9c43fa5
68 changes: 67 additions & 1 deletion tests/test_simpleButler.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,14 @@
DatasetId,
DatasetRef,
DatasetType,
LabeledButlerFactory,
StorageClass,
Timespan,
)
from lsst.daf.butler.datastore.file_templates import FileTemplate
from lsst.daf.butler.registry import RegistryConfig, RegistryDefaults, _RegistryFactory
from lsst.daf.butler.tests import DatastoreMock
from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, removeTestTempDir
from lsst.daf.butler.tests.utils import TestCaseMixin, makeTestTempDir, mock_env, removeTestTempDir

try:
from lsst.daf.butler.tests.server import create_test_server
Expand Down Expand Up @@ -882,10 +883,75 @@ def makeButler(self, writeable: bool = False) -> Butler:
registryConfig = RegistryConfig(config.get("registry"))
_RegistryFactory(registryConfig).create_from_config()

# Write the YAML file so that some tests can recreate butler from it.
config.dumpToUri(os.path.join(self.root, "butler.yaml"))
butler = Butler.from_config(config, writeable=writeable)
DatastoreMock.apply(butler)
return butler

def test_dataset_uris(self):
"""Test that dataset URIs can be parsed and retrieved."""
butler = self.makeButler(writeable=True)
butler.import_(filename=os.path.join(TESTDIR, "data", "registry", "base.yaml"))
butler.import_(filename=os.path.join(TESTDIR, "data", "registry", self.datasetsImportFile))

butler.registry.defaults = RegistryDefaults(collections=["imported_g"])
ref = butler.find_dataset("flat", detector=2, physical_filter="Cam1-G")
self.assertIsInstance(ref, DatasetRef)

# Get the butler root for the URI.
config_dir = butler._config["root"]

# Read it via a repo label and a path.
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml") as index_file:
label = "test_repo"
index_file.write(f"{label}: {config_dir}\n")
index_file.flush()
with mock_env({"DAF_BUTLER_REPOSITORY_INDEX": index_file.name}):
butler_factory = LabeledButlerFactory()
factory = butler_factory.bind(access_token=None)
Copy link
Member Author

@timj timj Nov 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rra does this approach work for you in the cutout service?

factory = butler_factory.bind(access_token=token)
...
ref = Butler.get_dataset_from_uri(dataset_uri, factory=factory)

?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cutout service passes a Butler instance into the backend, so the pattern looks like:

butler_factory = LabeledButlerFactory()

def _get_backend(label: str, token: str) -> ImageCutoutBackend:
    # Called for each cutout
    factory = butler_factory.bind(access_token=token)
    butler = factory.create_butler(label=label)
    # ...
    return ImageCutoutBackend(butler, projection_finder, output, tmpdir)

Is that what you had in mind? Basically move the access token parameter to create_butler to an intermediate step to create a factory with a bound token? If so, that would be fine here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah. No. It's probably the wrong API for you. Somewhere you are parsing the butler:// URI and I'm trying to provide code here that will hide the URI structure from you (so that we can also support the new ivo:// URIs). This PR creates two APIs: one that parses the URI and returns the butler repo label and the UUID, and another API (that is the one I talk about here) that lets you retrieve the DatasetRef directly from the URI and a butler factory. Maybe get_dataset_from_uri should return the Butler instance along with the DatasetRef?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe I could pass the return value of bind into the backend along with the URIs as-is without parsing them and then the backend can do whatever it needs to do? That would be even more convenient. In other words, have the constructor of ImageCutoutBackend take a Butler factory instead of a Butler instance.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think passing the bound factory and URI into whatever is wanting to know the DatasetRef is what we want here.


for dataset_uri in (
f"ivo://org.rubinobs/usdac/test?repo={config_dir}&id={ref.id}",
f"ivo://org.rubinobs/ukdac/lsst-dr1?repo={config_dir}%2Fbutler.yaml&id={ref.id}",
f"butler://{label}/{ref.id}",
f"ivo://org.rubinobs/usdac/lsst-dp1?repo={label}&id={ref.id}",
):
result = Butler.get_dataset_from_uri(dataset_uri)
self.assertEqual(result.dataset, ref)
# The returned butler needs to have the datastore mocked.
DatastoreMock.apply(result.butler)
dataset_id, _ = result.butler.get(result.dataset)
self.assertEqual(dataset_id, ref.id)

factory_result = Butler.get_dataset_from_uri(dataset_uri, factory=factory)
self.assertEqual(factory_result.dataset, ref)
# The returned butler needs to have the datastore mocked.
DatastoreMock.apply(factory_result.butler)
dataset_id, _ = factory_result.butler.get(factory_result.dataset)
self.assertEqual(dataset_id, ref.id)

# Non existent dataset.
missing_id = str(ref.id).replace("2", "3")
result = Butler.get_dataset_from_uri(f"butler://{label}/{missing_id}")
self.assertIsNone(result.dataset)

# Test some failure modes.
for dataset_uri in (
"butler://label/1234", # Bad UUID.
"butler://1234", # No UUID.
"butler:///1234", # No label.
"ivo://rubin/1234", # No query part and bad UUID and no label.
"ivo://rubin/datasets/dr1/82d79caa-0823-4300-9874-67b737367ee0", # No query part.
"ivo://org.rubinobs/datasets?repo=dr1&id=1234", # Bad UUID.
"ivo://org.rubinobs/butler?release=dr1&id=82d79caa-0823-4300-9874-67b737367ee0", # No repo key.
"ivo://org.rubinobs/butler?repo=dr1&repo=dr2&id=82d79caa-0823-4300-9874-67b737367ee0", # 2 vals.
"ivo://org.rubinobs/something?repo=%20&id=82d79caa-0823-4300-9874-67b737367ee0", # no repo.
"https://something.edu/1234", # Wrong scheme.
):
timj marked this conversation as resolved.
Show resolved Hide resolved
with self.assertRaises(ValueError):
Butler.parse_dataset_uri(dataset_uri)


class NameKeyCollectionManagerDirectSimpleButlerTestCase(DirectSimpleButlerTestCase, unittest.TestCase):
"""Run tests against DirectButler implementation using the
Expand Down
Loading