Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cloud protocol support, starting with BossDB #41

Merged
merged 15 commits into from
Nov 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@ name:
elf-dev
dependencies:
- affogato
- imageio
- h5py
- imageio
- intern
- mrcfile
- nifty >=1.1
- numba
- pandas
- python
- pip
- python
- scikit-image
- skan
- tqdm
Expand Down
7 changes: 7 additions & 0 deletions elf/io/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .image_stack_wrapper import ImageStackFile, ImageStackDataset
from .knossos_wrapper import KnossosFile, KnossosDataset
from .mrc_wrapper import MRCFile, MRCDataset
from .intern_wrapper import InternFile, InternDataset


__all__ = [
Expand Down Expand Up @@ -72,6 +73,12 @@ def register_filetype(constructor, extensions=(), groups=(), datasets=()):
except ImportError:
mrcfile = None

# add bossdb extensions if we have intern
try:
import intern
register_filetype(InternFile, ["bossdb://"], InternFile, InternDataset)
except ImportError:
pass
j6k4m8 marked this conversation as resolved.
Show resolved Hide resolved

def identity(arg):
    """ Return the argument unchanged.

    Used as a fallback constructor when no wrapping is required.
    """
    return arg
Expand Down
11 changes: 11 additions & 0 deletions elf/io/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
)
from .knossos_wrapper import KnossosFile, KnossosDataset
from .mrc_wrapper import MRCFile, MRCDataset
from .intern_wrapper import InternFile, InternDataset


def supported_extensions():
Expand All @@ -26,6 +27,11 @@ def open_file(path, mode='a', ext=None):
ext [str] - file extension. This can be used to force an extension
if it cannot be inferred from the filename. (default: None)
"""
# Before checking the extension suffix, check for "protocol-style"
# cloud provider prefixes.
j6k4m8 marked this conversation as resolved.
Show resolved Hide resolved
if "://" in path:
ext = path.split("://")[0] + "://"

ext = os.path.splitext(path)[1] if ext is None else ext
try:
constructor = FILE_CONSTRUCTORS[ext.lower()]
Expand Down Expand Up @@ -81,3 +87,8 @@ def is_mrc(node):
""" Check if this is a MRCWrapper object.
"""
return isinstance(node, (MRCFile, MRCDataset))

def is_intern(node):
    """ Check if this is an Intern wrapper object.
    """
    intern_types = (InternFile, InternDataset)
    return isinstance(node, intern_types)
92 changes: 92 additions & 0 deletions elf/io/intern_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
from collections.abc import Mapping
constantinpape marked this conversation as resolved.
Show resolved Hide resolved
constantinpape marked this conversation as resolved.
Show resolved Hide resolved
import numpy as np

try:
from intern import array

intern_imported = True
except ImportError:
intern_imported = False


def _check_intern_importable():
    """Verify that the `intern` client library is available.

    Returns True when the module-level import succeeded; otherwise raises
    an ImportError with installation instructions.
    """
    if intern_imported:
        return True
    raise ImportError(
        "Could not import the `intern` library. This means you cannot "
        "download or upload cloud datasets. To fix this, you can install "
        "intern with: \n\n\t"
        "pip install intern"
    )


class InternDataset:
    """h5py-dataset-like wrapper around a remote volume addressed by an
    intern/BossDB path (e.g. "bossdb://collection/experiment/channel").

    Reads and writes are forwarded to the remote service via `intern.array`.
    """

    def __init__(self, cloud_path):
        """
        Arguments:
            cloud_path [str] - protocol-style address of the remote dataset.

        Raises:
            ImportError - if the `intern` library is not installed.
        """
        _check_intern_importable()
        self._data = array(cloud_path)

    @property
    def dtype(self):
        # normalize whatever intern reports to a numpy dtype object
        return np.dtype(self._data.dtype)

    @property
    def ndim(self):
        return 3  # todo: this COULD be 4 etc...

    # TODO chunks are arbitrary, how do we handle this?
    @property
    def chunks(self):
        # No chunking is advertised: BossDB's server-side cache makes
        # non-aligned reads cheap, so we stay agnostic like the MRC wrapper.
        return None

    @property
    def shape(self):
        return self._data.shape

    def __getitem__(self, key):
        # downloads the requested cutout from the remote service
        return self._data[key]

    def __setitem__(self, key, value):
        # uploads `value` to the remote service
        self._data[key] = value

    @property
    def size(self):
        # Total number of elements. Computed over all axes instead of
        # hard-coding three, so a 4D channel would also be handled.
        size = 1
        for extent in self._data.shape:
            size *= extent
        return size

    # dummy attrs to be compatible with h5py/z5py/zarr API
    @property
    def attrs(self):
        return {}


class InternFile(Mapping):
    """ Wrapper for an intern dataset

    Mimics the h5py.File interface: a read-only mapping with the single
    key "data", which resolves to an InternDataset for `path`.
    """

    def __init__(self, path, mode="r"):
        """
        Arguments:
            path [str] - protocol-style address, e.g. "bossdb://...".
            mode [str] - kept for API compatibility with h5py; the remote
                dataset itself decides what access is allowed. (default: "r")

        Raises:
            ImportError - if the `intern` library is not installed.
        """
        _check_intern_importable()
        self.path = path
        self.mode = mode

    def __getitem__(self, key):
        # there is only one dataset per path; any key resolves to it
        return InternDataset(self.path)

    def __iter__(self):
        yield "data"

    def __len__(self):
        return 1

    def __contains__(self, name):
        return name == "data"

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing to close: the connection is stateless. The previous
        # implementation called self._f.close(), but self._f was never
        # assigned, so exiting the context manager raised AttributeError.
        pass

    # dummy attrs to be compatible with h5py/z5py/zarr API
    @property
    def attrs(self):
        return {}
3 changes: 2 additions & 1 deletion environment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ name:
elf-dev
dependencies:
- affogato
- imageio
- h5py
- imageio
- intern
- mrcfile
- nifty
- numba
Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@
extras = {
"hdf5": "h5py",
"zarr": "zarr",
"n5": "pyn5"
"n5": "pyn5",
"cloud": "intern"
j6k4m8 marked this conversation as resolved.
Show resolved Hide resolved
}

# dependencies only available via conda,
# we still collect them here, because the conda recipe
# gets it's requirements from setuptools.
# gets its requirements from setuptools.
conda_only = ["vigra", "nifty", "z5py"]

# collect all dependencies for conda
Expand Down
48 changes: 48 additions & 0 deletions test/io_tests/test_intern_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import os
import unittest
from shutil import rmtree

import numpy as np

try:
from intern import array
except ImportError:
array = None


@unittest.skipIf(array is None, "Needs intern (pip install intern)")
class TestInternWrapper(unittest.TestCase):
    """Integration tests for the intern/BossDB wrapper.

    These hit a public BossDB dataset over the network, so they need a
    working internet connection in addition to the `intern` package.
    """

    def test_can_access_dataset(self):
        from elf.io.intern_wrapper import InternDataset

        # Choosing a dataset at random to make sure we can access shape and dtype
        dataset = InternDataset("bossdb://witvliet2020/Dataset_1/em")
        expected_shape = (300, 26000, 22000)
        self.assertEqual(dataset.shape, expected_shape)
        self.assertEqual(dataset.dtype, np.uint8)
        self.assertEqual(dataset.size, 300 * 26000 * 22000)
        self.assertEqual(dataset.ndim, 3)

    def test_can_download_dataset(self):
        from elf.io.intern_wrapper import InternDataset

        dataset = InternDataset("bossdb://witvliet2020/Dataset_1/em")
        sub_volume = dataset[210:212, 7000:7064, 7000:7064]
        self.assertEqual(sub_volume.shape, (2, 64, 64))
        # Pick a few random points to verify. (This is a static dataset so
        # this won't fail unless the internet connection is broken.)
        # These are known "magic numbers" from a known-working intern install.
        expected_values = {
            (0, 0, 0): 127,
            (0, 0, 42): 142,
            (0, 42, 1): 122,
            (1, 4, 7): 134,
        }
        for index, value in expected_values.items():
            self.assertEqual(sub_volume[index], value)

    def test_file(self):
        from elf.io.intern_wrapper import InternFile, InternDataset

        handle = InternFile("bossdb://witvliet2020/Dataset_1/em")
        self.assertIsInstance(handle["data"], InternDataset)


if __name__ == "__main__":
    unittest.main()