diff --git a/3rdparty/LICENSE.pba+ b/3rdparty/LICENSE.pba+
new file mode 100644
index 000000000..9d0b4030a
--- /dev/null
+++ b/3rdparty/LICENSE.pba+
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 School of Computing, National University of Singapore
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 387d53820..18d0f992e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,51 @@
+# cuCIM 22.08.00 (17 Aug 2022)
+
+## 🚨 Breaking Changes
+
+- Stain extraction: use a less strict condition across channels when thresholding ([#316](https://github.com/rapidsai/cucim/pull/316)) [@grlee77](https://github.com/grlee77)
+
+## 🐛 Bug Fixes
+
+- create SimilarityTransform using CuPy 9.x-compatible indexing ([#365](https://github.com/rapidsai/cucim/pull/365)) [@grlee77](https://github.com/grlee77)
+- Add `__init__.py` in `cucim.core` ([#359](https://github.com/rapidsai/cucim/pull/359)) [@jakirkham](https://github.com/jakirkham)
+- Stain extraction: use a less strict condition across channels when thresholding ([#316](https://github.com/rapidsai/cucim/pull/316)) [@grlee77](https://github.com/grlee77)
+- Incorporate bug fixes from skimage 0.19.3 ([#312](https://github.com/rapidsai/cucim/pull/312)) [@grlee77](https://github.com/grlee77)
+- fix RawKernel bug for canny filter when quantiles are used ([#310](https://github.com/rapidsai/cucim/pull/310)) [@grlee77](https://github.com/grlee77)
+
+## 📖 Documentation
+
+- Defer loading of `custom.js` ([#383](https://github.com/rapidsai/cucim/pull/383)) [@galipremsagar](https://github.com/galipremsagar)
+- add cucim.core.morphology to API docs + other docstring fixes ([#367](https://github.com/rapidsai/cucim/pull/367)) [@grlee77](https://github.com/grlee77)
+- Update README.md ([#361](https://github.com/rapidsai/cucim/pull/361)) [@HesAnEasyCoder](https://github.com/HesAnEasyCoder)
+- remove unimplemented functions from See Also and fix version numbers in deprecation warnings ([#356](https://github.com/rapidsai/cucim/pull/356)) [@grlee77](https://github.com/grlee77)
+- Forward-merge branch-22.06 to branch-22.08 ([#344](https://github.com/rapidsai/cucim/pull/344)) [@grlee77](https://github.com/grlee77)
+- Update README.md ([#315](https://github.com/rapidsai/cucim/pull/315)) [@HesAnEasyCoder](https://github.com/HesAnEasyCoder)
+- Update index.rst ([#314](https://github.com/rapidsai/cucim/pull/314)) [@HesAnEasyCoder](https://github.com/HesAnEasyCoder)
+- Update PyPI package documentation for v22.06.00 ([#311](https://github.com/rapidsai/cucim/pull/311)) [@gigony](https://github.com/gigony)
+
+## 🚀 New Features
+
+- Add segmentation with the Chan-Vese active contours method ([#343](https://github.com/rapidsai/cucim/pull/343)) [@grlee77](https://github.com/grlee77)
+- Add cucim.skimage.morphology.medial_axis ([#342](https://github.com/rapidsai/cucim/pull/342)) [@grlee77](https://github.com/grlee77)
+- Add cucim.skimage.segmentation.expand_labels ([#341](https://github.com/rapidsai/cucim/pull/341)) [@grlee77](https://github.com/grlee77)
+- Add Euclidean distance transform for images/volumes ([#318](https://github.com/rapidsai/cucim/pull/318)) [@grlee77](https://github.com/grlee77)
+
+## 🛠️ Improvements
+
+- Revert "Allow CuPy 11" ([#362](https://github.com/rapidsai/cucim/pull/362)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix issues with day & night modes in python docs ([#360](https://github.com/rapidsai/cucim/pull/360)) [@galipremsagar](https://github.com/galipremsagar)
+- Allow CuPy 11 ([#357](https://github.com/rapidsai/cucim/pull/357)) [@jakirkham](https://github.com/jakirkham)
+- more efficient separable convolution ([#355](https://github.com/rapidsai/cucim/pull/355)) [@grlee77](https://github.com/grlee77)
+- Support resolution and spacing metadata ([#349](https://github.com/rapidsai/cucim/pull/349)) [@gigony](https://github.com/gigony)
+- Performance optimizations to morphological segmentation functions ([#340](https://github.com/rapidsai/cucim/pull/340)) [@grlee77](https://github.com/grlee77)
+- benchmarks: avoid use of deprecated pandas API ([#339](https://github.com/rapidsai/cucim/pull/339)) [@grlee77](https://github.com/grlee77)
+- Reduce memory overhead and improve performance of normalize_colors_pca ([#328](https://github.com/rapidsai/cucim/pull/328)) [@grlee77](https://github.com/grlee77)
+- Protect against obscure divide by zero error in edge case of `normalize_colors_pca` ([#327](https://github.com/rapidsai/cucim/pull/327)) [@grlee77](https://github.com/grlee77)
+- complete parametrization of cucim.skimage benchmarks ([#324](https://github.com/rapidsai/cucim/pull/324)) [@grlee77](https://github.com/grlee77)
+- parameterization of `filters` and `features` benchmarks (v2) ([#322](https://github.com/rapidsai/cucim/pull/322)) [@grlee77](https://github.com/grlee77)
+- Add a fast histogram-based median filter ([#317](https://github.com/rapidsai/cucim/pull/317)) [@grlee77](https://github.com/grlee77)
+- Remove custom compiler environment variables ([#307](https://github.com/rapidsai/cucim/pull/307)) [@ajschmidt8](https://github.com/ajschmidt8)
+
 # cuCIM 22.06.00 (7 Jun 2022)
 
 ## 🚨 Breaking Changes
@@ -17,6 +65,7 @@
 ## 🚀 New Features
 
 - add missing `cucim.skimage.segmentation.clear_border` function ([#267](https://github.com/rapidsai/cucim/pull/267)) [@grlee77](https://github.com/grlee77)
+- add `cucim.core.operations.color.stain_extraction_pca` and `cucim.core.operations.color.normalize_colors_pca` for digital pathology H&E stain extraction and normalization ([#273](https://github.com/rapidsai/cucim/pull/273)) [@grlee77](https://github.com/grlee77), [@drbeh](https://github.com/drbeh)
 
 ## 🛠️ Improvements
 
@@ -27,6 +76,7 @@
 - Promote small integer types to single rather than double precision ([#278](https://github.com/rapidsai/cucim/pull/278)) [@grlee77](https://github.com/grlee77)
 - improve efficiency of histogram-based thresholding functions ([#276](https://github.com/rapidsai/cucim/pull/276)) [@grlee77](https://github.com/grlee77)
 - Remove unused dependencies in GPU tests job ([#268](https://github.com/rapidsai/cucim/pull/268)) [@Ethyling](https://github.com/Ethyling)
+- Enable footprint decomposition for morphology ([#274](https://github.com/rapidsai/cucim/pull/274)) [@grlee77](https://github.com/grlee77)
 - Use conda compilers ([#232](https://github.com/rapidsai/cucim/pull/232)) [@Ethyling](https://github.com/Ethyling)
 - Build packages using mambabuild ([#216](https://github.com/rapidsai/cucim/pull/216)) [@Ethyling](https://github.com/Ethyling)
 
diff --git a/LICENSE-3rdparty.md b/LICENSE-3rdparty.md
index 0a3789af7..984d0e952 100644
--- a/LICENSE-3rdparty.md
+++ b/LICENSE-3rdparty.md
@@ -281,3 +281,9 @@ StainTools
   - https://github.com/Peter554/StainTools/blob/master/LICENSE.txt
 - Copyright: Peter Byfield
 - Usage: reference for stain color normalization algorithm
+
+PBA+
+- License: MIT License
+  - https://github.com/orzzzjq/Parallel-Banding-Algorithm-plus/blob/master/LICENSE
+- Copyright: School of Computing, National University of Singapore
+- Usage: PBA+ is used to implement the Euclidean distance transform.
diff --git a/README.md b/README.md
index 9f31f70ce..612fda219 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,43 @@
 # <div align="left"><img src="https://rapids.ai/assets/images/rapids_logo.png" width="90px"/>&nbsp;cuCIM</div>
 
+[RAPIDS](https://rapids.ai) cuCIM is an open-source, accelerated computer vision and image processing software library for multidimensional images used in biomedical, geospatial, material and life science, and remote sensing use cases.
 
-[RAPIDS](https://rapids.ai) cuCIM is an extensible toolkit designed to provide GPU accelerated I/O, computer vision & image processing primitives for N-Dimensional images with a focus on biomedical imaging.
+cuCIM offers:
+
+- Enhanced image processing capabilities for large and n-dimensional Tagged Image File Format (TIFF) files
+- Accelerated performance through Graphics Processing Unit (GPU)-based image processing and computer vision primitives
+- A straightforward Pythonic interface with an application programming interface (API) matching that of OpenSlide
+
+cuCIM supports the following formats:
+
+- Aperio ScanScope Virtual Slide (SVS)
+- Philips TIFF
+- Generic Tiled, Multi-resolution RGB TIFF files with the following compression schemes:
+  - No Compression
+  - JPEG
+  - JPEG2000
+  - Lempel-Ziv-Welch (LZW)
+  - Deflate
 
 **NOTE:** For the latest stable [README.md](https://github.com/rapidsai/cucim/blob/main/README.md) ensure you are on the `main` branch.
 
-- [GTC 2021 cuCIM: A GPU Image I/O and Processing Toolkit [S32194]](https://www.nvidia.com/en-us/on-demand/search/?facet.mimetype[]=event%20session&layout=list&page=1&q=cucim&sort=date)
-  - [video](https://www.nvidia.com/en-us/on-demand/session/gtcspring21-s32194/)
+- [GTC 2022 Accelerating Storage IO to GPUs with Magnum IO [S41347]](https://events.rainfocus.com/widget/nvidia/gtcspring2022/sessioncatalog/session/1634960000577001Etxp)
+  - cuCIM's GDS API examples: <https://github.com/NVIDIA/MagnumIO/tree/main/gds/readers/cucim-gds>
 - [SciPy 2021 cuCIM - A GPU image I/O and processing library](https://www.scipy2021.scipy.org/)
   - [video](https://youtu.be/G46kOOM9xbQ)
+- [GTC 2021 cuCIM: A GPU Image I/O and Processing Toolkit [S32194]](https://www.nvidia.com/en-us/on-demand/search/?facet.mimetype[]=event%20session&layout=list&page=1&q=cucim&sort=date)
+  - [video](https://www.nvidia.com/en-us/on-demand/session/gtcspring21-s32194/)
+
+**Blogs**
+- [Enhanced Image Analysis with Multidimensional Image Processing](https://developer.nvidia.com/blog/enhanced-image-analysis-with-multidimensional-image-processing/)
+- [Accelerating Scikit-Image API with cuCIM: n-Dimensional Image Processing and IO on GPUs](https://developer.nvidia.com/blog/cucim-rapid-n-dimensional-image-processing-and-i-o-on-gpus/)
+- [Accelerating Digital Pathology Pipelines with NVIDIA Clara™ Deploy](https://developer.nvidia.com/blog/accelerating-digital-pathology-pipelines-with-nvidia-clara-deploy-2/)
+
+**Webinars**
+
+- [cuCIM: a GPU Image IO and Processing Library](https://www.youtube.com/watch?v=G46kOOM9xbQ)
+
+**[Documentation](https://docs.rapids.ai/api/cucim/stable)**
 
 **Release notes** are available on our [wiki page](https://github.com/rapidsai/cucim/wiki/Release-Notes).
 
@@ -39,7 +68,7 @@ pip install scipy scikit-image cupy-cuda110
 
 ### Notebooks
 
-Please check out our [Welcome](notebooks/Welcome.ipynb) notebook ([NBViewer](https://nbviewer.jupyter.org/github/rapidsai/cucim/blob/branch-22.06/notebooks/Welcome.ipynb))
+Please check out our [Welcome](notebooks/Welcome.ipynb) notebook ([NBViewer](https://nbviewer.jupyter.org/github/rapidsai/cucim/blob/branch-22.08/notebooks/Welcome.ipynb))
 
 #### Downloading sample images
 
diff --git a/VERSION b/VERSION
index ddc883c42..231fac64b 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-22.06.00
+22.08.00
diff --git a/benchmarks/skimage/_image_bench.py b/benchmarks/skimage/_image_bench.py
index c7d5abfd8..854fae290 100644
--- a/benchmarks/skimage/_image_bench.py
+++ b/benchmarks/skimage/_image_bench.py
@@ -36,6 +36,7 @@ def __init__(
         module_cpu=scipy.ndimage,
         module_gpu=cupyx.scipy.ndimage,
         function_is_generator=False,
+        run_cpu=True
     ):
 
         self.shape = shape
@@ -67,9 +68,12 @@ def gen_gpu(*args, **kwargs):
         self.module_name_cpu = module_cpu.__name__
         self.module_name_gpu = module_gpu.__name__
 
+        self.run_cpu = run_cpu
+
     def set_args(self, dtype):
         if np.dtype(dtype).kind in "iu":
             im1 = skimage.data.camera()
+            im1 = im1.astype(dtype)
         else:
             im1 = skimage.data.camera() / 255.0
             im1 = im1.astype(dtype)
@@ -166,17 +170,20 @@ def run_benchmark(self, duration=3, verbose=True):
                 rep_kwargs_gpu = self.get_reps(
                     self.func_gpu, self.args_gpu, kw_gpu, duration, cpu=False
                 )
-                perf = repeat(self.func_cpu, self.args_cpu, kw_cpu, **rep_kwargs_cpu)
+                print("Number of Repetitions : ", rep_kwargs_gpu)
                 perf_gpu = repeat(self.func_gpu, self.args_gpu, kw_gpu, **rep_kwargs_gpu)
-                df.at[index, "GPU accel"] = perf.cpu_times.mean() / perf_gpu.gpu_times.mean()
+                
                 df.at[index, "shape"] = f"{self.shape}"
                 # df.at[index,  "description"] = index
                 df.at[index, "function_name"] = self.function_name
                 df.at[index, "dtype"] = np.dtype(dtype).name
                 df.at[index, "ndim"] = len(self.shape)
 
-                df.at[index, "CPU: host (mean)"] = perf.cpu_times.mean()
-                df.at[index, "CPU: host (std)"] = perf.cpu_times.std()
+                if self.run_cpu == True:
+                    perf = repeat(self.func_cpu, self.args_cpu, kw_cpu, **rep_kwargs_cpu)
+                    df.at[index, "GPU accel"] = perf.cpu_times.mean() / perf_gpu.gpu_times.mean()
+                    df.at[index, "CPU: host (mean)"] = perf.cpu_times.mean()
+                    df.at[index, "CPU: host (std)"] = perf.cpu_times.std()
 
                 df.at[index, "GPU: host (mean)"] = perf_gpu.cpu_times.mean()
                 df.at[index, "GPU: host (std)"] = perf_gpu.cpu_times.std()
diff --git a/benchmarks/skimage/bench_convolve.py b/benchmarks/skimage/bench_convolve.py
new file mode 100644
index 000000000..60007b8a3
--- /dev/null
+++ b/benchmarks/skimage/bench_convolve.py
@@ -0,0 +1,139 @@
+"""
+Benchmark locally modified ndimage functions vs. their CuPy counterparts
+"""
+import cupy as cp
+import cupyx.scipy.ndimage as ndi
+import pytest
+from cupyx.profiler import benchmark
+
+from cucim.skimage._vendored.ndimage import (
+    convolve1d, correlate1d, gaussian_filter, gaussian_filter1d,
+    gaussian_gradient_magnitude, gaussian_laplace, laplace, prewitt, sobel,
+    uniform_filter, uniform_filter1d,
+)
+
+d = cp.cuda.Device()
+
+
+def _get_image(shape, dtype, seed=123):
+    """Return a seeded pseudo-random CuPy array of the given shape and dtype."""
+    rng = cp.random.default_rng(seed)
+    dtype = cp.dtype(dtype)
+    if dtype.kind == 'b':
+        # `high` is exclusive: (0, 2) draws both 0 and 1; (0, 1) was all False
+        image = rng.integers(0, 2, shape, dtype=cp.uint8).astype(bool)
+    elif dtype.kind in 'iu':
+        image = rng.integers(0, 128, shape, dtype=dtype)
+    elif dtype.kind == 'c':
+        real_dtype = cp.asarray([], dtype=dtype).real.dtype
+        image = rng.standard_normal(shape, dtype=real_dtype)
+        image = image + 1j * rng.standard_normal(shape, dtype=real_dtype)
+    elif dtype == cp.float16:
+        image = rng.standard_normal(shape).astype(dtype)
+    else:
+        image = rng.standard_normal(shape, dtype=dtype)
+    return image
+
+
+def _compare_implementations(
+    shape, kernel_size, axis, dtype, mode, cval=0.0, origin=0,
+    output_dtype=None, kernel_dtype=None, output_preallocated=False,
+    function=convolve1d, max_duration=1
+):
+    """Time `function` against the same-named `cupyx.scipy.ndimage` function.
+
+    Returns ``(perf_cupy, perf_cucim)``, the two results of `benchmark`.
+    """
+    dtype = cp.dtype(dtype)
+    image = _get_image(shape, dtype)
+    kernel = _get_image((kernel_size,), dtype if kernel_dtype is None else kernel_dtype)
+    if output_dtype is not None:
+        output_dtype = cp.dtype(output_dtype)
+    function_ref = getattr(ndi, function.__name__)
+    if output_preallocated:
+        # each implementation writes into its own preallocated array
+        out_dtype = image.dtype if output_dtype is None else output_dtype
+        out_ref = cp.empty(image.shape, dtype=out_dtype)
+        out_new = cp.empty(image.shape, dtype=out_dtype)
+    else:
+        # `output` is only a dtype request (or None)
+        out_ref = out_new = output_dtype
+    common = dict(axis=axis, mode=mode, cval=cval, origin=origin)
+    timing = dict(n_warmup=10, n_repeat=10000, max_duration=max_duration)
+    # identical arguments for both calls; only `function` also gets `algorithm`
+    perf_ref = benchmark(function_ref, (image, kernel), kwargs=dict(common, output=out_ref), **timing)
+    perf_new = benchmark(function, (image, kernel), kwargs=dict(common, output=out_new, algorithm='shared_memory'), **timing)
+    return perf_ref, perf_new
+
+
+def _compare_implementations_other(
+    shape, dtype, mode, cval=0.0,
+    output_dtype=None, kernel_dtype=None, output_preallocated=False,
+    function=convolve1d, func_kwargs=None, max_duration=1,
+):
+    """Time a kernel-free `function` against its `cupyx.scipy.ndimage` namesake.
+
+    `func_kwargs` is forwarded to both calls; `kernel_dtype` is accepted but unused.
+    """
+    dtype = cp.dtype(dtype)
+    image = _get_image(shape, dtype)
+    kwargs = dict(mode=mode, cval=cval)
+    if func_kwargs:
+        kwargs.update(func_kwargs)
+    if output_dtype is not None:
+        output_dtype = cp.dtype(output_dtype)
+    if output_preallocated:
+        if output_dtype is None:
+            output_dtype = image.dtype
+        output1 = cp.empty(image.shape, dtype=output_dtype)
+        output2 = cp.empty(image.shape, dtype=output_dtype)
+    else:
+        output1 = output2 = output_dtype
+    timing = dict(n_warmup=10, n_repeat=10000, max_duration=max_duration)
+    perf1 = benchmark(getattr(ndi, function.__name__), (image,), kwargs=dict(kwargs, output=output1), **timing)
+    # the cuCIM run gets its own buffer (output2); the original reused output1
+    perf2 = benchmark(function, (image,), kwargs=dict(kwargs, output=output2, algorithm='shared_memory'), **timing)
+    return perf1, perf2
+
+
+print("\n\n")  # spacing before the first results table
+print("function | shape | dtype | mode | kernel size | preallocated | axis | dur (ms), CuPy | dur (ms), cuCIM | acceleration ")
+print("---------|-------|-------|------|-------------|--------------|------|----------------|-----------------|--------------")
+for function in [convolve1d]:  # table 1: 1-D convolution, timed along each axis in turn
+    for shape in [(512, 512), (3840, 2160), (64, 64, 64), (256, 256, 256)]:
+        for dtype in [cp.float32, cp.uint8]:
+            for mode in ['nearest']:
+                for kernel_size in [3, 7, 11, 41]:
+                    for output_preallocated in [False]:  # , True]:
+                        for axis in range(len(shape)):
+                            output_dtype = dtype
+                            perf1, perf2 = _compare_implementations(shape=shape, kernel_size=kernel_size, mode=mode, axis=axis, dtype=dtype, output_dtype=output_dtype, output_preallocated=output_preallocated, function=function)
+                            t_elem = perf1.gpu_times * 1000.  # CuPy reference run; x1000 for the "dur (ms)" columns
+                            t_shared = perf2.gpu_times * 1000.  # cuCIM run with algorithm='shared_memory'
+                            print(f"{function.__name__} | {shape} | {cp.dtype(dtype).name} | {mode} | {kernel_size=} | prealloc={output_preallocated} | {axis=} | {t_elem.mean():0.3f} +/- {t_elem.std():0.3f}  | {t_shared.mean():0.3f} +/- {t_shared.std():0.3f} | {t_elem.mean() / t_shared.mean():0.3f}")
+
+
+print("function | kwargs | shape | dtype | mode | preallocated | dur (ms), CuPy | dur (ms), cuCIM | acceleration ")
+print("---------|--------|-------|-------|------|--------------|----------------|-----------------|--------------")
+for function, func_kwargs in [  # table 2: filters called without an explicit kernel argument
+    # (gaussian_filter1d, dict(sigma=1.0, axis=0)),
+    # (gaussian_filter1d, dict(sigma=1.0, axis=-1)),
+    # (gaussian_filter1d, dict(sigma=4.0, axis=0)),
+    # (gaussian_filter1d, dict(sigma=4.0, axis=-1)),
+    (gaussian_filter, dict(sigma=1.0)),
+    (gaussian_filter, dict(sigma=4.0)),
+    (uniform_filter, dict(size=11)),
+    (prewitt, dict(axis=0)),
+    (sobel, dict(axis=0)),
+    (prewitt, dict(axis=-1)),
+    (sobel, dict(axis=-1)),
+]:
+    for shape in [(512, 512), (3840, 2160), (64, 64, 64), (256, 256, 256)]:
+        for (dtype, output_dtype) in [(cp.float32, cp.float32), (cp.uint8, cp.float32)]:
+            for mode in ['nearest']:
+                for output_preallocated in [False, True]:
+                    perf1, perf2 = _compare_implementations_other(shape=shape, mode=mode, dtype=dtype, output_dtype=output_dtype, output_preallocated=output_preallocated, function=function, func_kwargs=func_kwargs)
+                    t_elem = perf1.gpu_times * 1000.  # CuPy reference run, in ms
+                    t_shared = perf2.gpu_times * 1000.  # cuCIM run, in ms
+                    print(f"{function.__name__} | {func_kwargs} | {shape} | {cp.dtype(dtype).name} | {mode} | {output_preallocated} | {t_elem.mean():0.3f} +/- {t_elem.std():0.3f}  | {t_shared.mean():0.3f} +/- {t_shared.std():0.3f} | {t_elem.mean() / t_shared.mean():0.3f}")
+
diff --git a/benchmarks/skimage/cucim_color_bench.py b/benchmarks/skimage/cucim_color_bench.py
index 66ce6b74c..87955dd0a 100644
--- a/benchmarks/skimage/cucim_color_bench.py
+++ b/benchmarks/skimage/cucim_color_bench.py
@@ -1,3 +1,4 @@
+import argparse
 import os
 import pickle
 
@@ -14,6 +15,7 @@
 
 from _image_bench import ImageBench
 
+func_name_choices = ['convert_colorspace', 'rgb2hed', 'hed2rgb', 'lab2lch', 'lch2lab', 'xyz2lab', 'lab2xyz', 'rgba2rgb', 'label2rgb']
 
 class ColorBench(ImageBench):
     def set_args(self, dtype):
@@ -47,6 +49,7 @@ def __init__(
         index_str=None,
         module_cpu=scipy.ndimage,
         module_gpu=cupyx.scipy.ndimage,
+        run_cpu=True,
     ):
         self.contiguous_labels = contiguous_labels
         super().__init__(
@@ -58,6 +61,7 @@ def __init__(
             index_str=index_str,
             module_cpu=module_cpu,
             module_gpu=module_gpu,
+            run_cpu=run_cpu,
         )
 
     def set_args(self, dtype):
@@ -70,14 +74,12 @@ def set_args(self, dtype):
             ],
             dtype=int,
         )
-        tiling = tuple(s // a_s for s, a_s in zip(shape, a.shape))
+        tiling = tuple(s // a_s for s, a_s in zip(self.shape, a.shape))
         if self.contiguous_labels:
             label = np.kron(a, np.ones(tiling, dtype=a.dtype))
         else:
             label = np.tile(a, tiling)
         labeld = cp.asarray(label)
-        if self.shape[-1] != 3:
-            raise ValueError("shape must be 3 on the last axis")
         imaged = cupy.testing.shaped_random(labeld.shape, xp=cp, dtype=dtype, scale=1.0)
         image = cp.asnumpy(imaged)
         self.args_cpu = (
@@ -90,96 +92,125 @@ def set_args(self, dtype):
         )
 
 
-pfile = "cucim_color_results.pickle"
-if os.path.exists(pfile):
-    with open(pfile, "rb") as f:
-        all_results = pickle.load(f)
-else:
-    all_results = pd.DataFrame()
-dtypes = [np.float32]
-all_colorspaces = True
-
-for shape in [(256, 256, 3), (3840, 2160, 3), (192, 192, 192, 3)]:
-    ndim = len(shape)
+def main(args):
+    """Run the benchmark selected by ``args.func_name`` and save the results."""
 
-    if all_colorspaces:
-        color_spaces = ["RGB", "HSV", "RGB CIE", "XYZ", "YUV", "YIQ", "YPbPr", "YCbCr", "YDbDr"]
+    pfile = "cucim_color_results.pickle"
+    if os.path.exists(pfile):
+        with open(pfile, "rb") as f:
+            # NOTE: pickle.load is unsafe on untrusted data; pfile should only ever be a file saved by this script
+            all_results = pickle.load(f)
     else:
-        color_spaces = ["RGB", "HSV", "YUV", "XYZ"]
-    for fromspace in color_spaces:
-        for tospace in color_spaces:
-            if fromspace == tospace:
-                continue
+        all_results = pd.DataFrame()
+
+    dtypes = [np.dtype(args.dtype)]
+    # image sizes/shapes
+    shape = tuple(int(s) for s in args.img_size.split(','))
+    run_cpu = not args.no_cpu
+
+    all_colorspaces = False
 
+    for function_name in func_name_choices:
+
+        if function_name != args.func_name:
+            continue
+
+        if function_name == 'convert_colorspace':
+            if all_colorspaces:
+                color_spaces = ["RGB", "HSV", "RGB CIE", "XYZ", "YUV", "YIQ", "YPbPr", "YCbCr", "YDbDr"]
+            else:
+                color_spaces = ["RGB", "HSV", "YUV", "XYZ"]
+            for fromspace in color_spaces:
+                for tospace in color_spaces:
+                    if fromspace == tospace:
+                        continue
+
+                    B = ColorBench(
+                        function_name="convert_colorspace",
+                        shape=shape + (3,),
+                        dtypes=dtypes,
+                        fixed_kwargs=dict(fromspace=fromspace, tospace=tospace),
+                        var_kwargs={},
+                        index_str=f"{fromspace.lower()}2{tospace.lower()}",
+                        module_cpu=skimage.color,
+                        module_gpu=cucim.skimage.color,
+                        run_cpu=run_cpu,
+                    )
+                    results = B.run_benchmark(duration=args.duration)
+                    all_results = pd.concat([all_results, results["full"]])
+
+        elif function_name == 'rgba2rgb':
+            B = RGBABench(
+                function_name="rgba2rgb",
+                shape=shape[:-1] + (4,),  # NOTE(review): drops the last entry of `shape`; other branches append the channel axis -- confirm
+                dtypes=dtypes,
+                fixed_kwargs={},
+                var_kwargs={},
+                module_cpu=skimage.color,
+                module_gpu=cucim.skimage.color,
+                run_cpu=run_cpu,
+            )
+            results = B.run_benchmark(duration=args.duration)
+            all_results = pd.concat([all_results, results["full"]])
+
+        elif function_name == 'label2rgb':
+
+            for contiguous_labels in [True, False]:
+                if contiguous_labels:
+                    index_str = "contiguous"
+                else:
+                    index_str = None
+                B = LabelBench(
+                    function_name="label2rgb",
+                    shape=shape,
+                    dtypes=dtypes,
+                    contiguous_labels=contiguous_labels,
+                    index_str=index_str,
+                    fixed_kwargs=dict(bg_label=0),
+                    var_kwargs=dict(kind=["avg", "overlay"]),
+                    module_cpu=skimage.color,
+                    module_gpu=cucim.skimage.color,
+                    run_cpu=run_cpu,
+                )
+                results = B.run_benchmark(duration=args.duration)
+                all_results = pd.concat([all_results, results["full"]])
+
+        elif function_name in [
+            'rgb2hed', 'hed2rgb', 'lab2lch', 'lch2lab', 'xyz2lab', 'lab2xyz'
+        ]:
             B = ColorBench(
-                function_name="convert_colorspace",
-                shape=shape,
+                function_name=function_name,
+                shape=shape + (3,),
                 dtypes=dtypes,
-                fixed_kwargs=dict(fromspace=fromspace, tospace=tospace),
+                fixed_kwargs={},
                 var_kwargs={},
-                index_str=f"{fromspace.lower()}2{tospace.lower()}",
                 module_cpu=skimage.color,
                 module_gpu=cucim.skimage.color,
+                run_cpu=run_cpu,
             )
-            results = B.run_benchmark(duration=1)
-            all_results = all_results.append(results["full"])
-
-    # rgb2hed and hed2rgb test combine_stains and separate_stains and all other
-    # stains should have equivalent performance.
-    #
-    # Probably not necessary to benchmark combine_stains and separate_stains
-    # e.g.
-    #    ihc_hdx = separate_stains(ihc, hdx_from_rgb)
-    #    ihc = combine_stains(ihc_hdx, rgb_from_hdx)
-    #
-
-    for fname in ["rgb2hed", "hed2rgb", "lab2lch", "lch2lab", "xyz2lab",
-                  "lab2xyz"]:
-        B = ColorBench(
-            function_name=fname,
-            shape=shape,
-            dtypes=dtypes,
-            fixed_kwargs={},
-            var_kwargs={},
-            module_cpu=skimage.color,
-            module_gpu=cucim.skimage.color,
-        )
-        results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
-
-    B = RGBABench(
-        function_name="rgba2rgb",
-        shape=shape[:-1] + (4,),
-        dtypes=dtypes,
-        fixed_kwargs={},
-        var_kwargs={},
-        module_cpu=skimage.color,
-        module_gpu=cucim.skimage.color,
-    )
-    results = B.run_benchmark(duration=1)
-    all_results = all_results.append(results["full"])
-
-    for contiguous_labels in [True, False]:
-        if contiguous_labels:
-            index_str = "contiguous"
-        else:
-            index_str = None
-        B = LabelBench(
-            function_name="label2rgb",
-            shape=shape,
-            dtypes=dtypes,
-            contiguous_labels=contiguous_labels,
-            index_str=index_str,
-            fixed_kwargs=dict(bg_label=0),
-            var_kwargs=dict(kind=["avg", "overlay"]),
-            module_cpu=skimage.color,
-            module_gpu=cucim.skimage.color,
-        )
-        results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
-
-fbase = os.path.splitext(pfile)[0]
-all_results.to_csv(fbase + ".csv")
-all_results.to_pickle(pfile)
-with open(fbase + ".md", "wt") as f:
-    f.write(all_results.to_markdown())
+            results = B.run_benchmark(duration=args.duration)
+            all_results = pd.concat([all_results, results["full"]])
+
+    fbase = os.path.splitext(pfile)[0]
+    all_results.to_csv(fbase + ".csv")
+    all_results.to_pickle(pfile)
+    try:
+        import tabulate  # noqa: F401 -- optional backend that DataFrame.to_markdown needs
+
+        with open(fbase + ".md", "wt") as f:
+            f.write(all_results.to_markdown())
+    except ImportError:
+        pass  # Markdown export is optional; skip it when tabulate is missing
+
+
+if __name__ == "__main__":
+    # Command-line entry point: every option except --no_cpu is mandatory.
+    dtype_choices = ["float16", "float32", "float64", "int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"]
+    parser = argparse.ArgumentParser(description="Benchmarking cuCIM color conversion functions")
+    parser.add_argument("-i", "--img_size", required=True, type=str, help="Size of input image (omit color channel, it will be appended as needed)")
+    parser.add_argument("-d", "--dtype", required=True, type=str, choices=dtype_choices, help="Dtype of input image")
+    parser.add_argument("-f", "--func_name", required=True, type=str, choices=func_name_choices, help="function to benchmark")
+    parser.add_argument("-t", "--duration", required=True, type=int, help="time to run benchmark")
+    parser.add_argument("--no_cpu", action="store_true", default=False, help="disable cpu measurements")
+    args = parser.parse_args()
+    main(args)
diff --git a/benchmarks/skimage/cucim_exposure_bench.py b/benchmarks/skimage/cucim_exposure_bench.py
index 9f7ffbec2..d0fb4dbda 100644
--- a/benchmarks/skimage/cucim_exposure_bench.py
+++ b/benchmarks/skimage/cucim_exposure_bench.py
@@ -1,3 +1,4 @@
+import argparse
 import os
 import pickle
 
@@ -27,6 +28,7 @@ def set_args(self, dtype):
 
 class MatchHistogramBench(ImageBench):
     def set_args(self, dtype):
+
         if np.dtype(dtype).kind in "iu":
             scale = 256
         else:
@@ -39,95 +41,101 @@ def set_args(self, dtype):
         self.args_gpu = (imaged, imaged2)
 
 
-pfile = "cucim_exposure_results.pickle"
-if os.path.exists(pfile):
-    with open(pfile, "rb") as f:
-        all_results = pickle.load(f)
-else:
-    all_results = pd.DataFrame()
-dtypes = [np.uint8, np.float32]
-
-exposure_config = {
-    "equalize_adapthist": dict(
-        fixed_kwargs=dict(clip_limit=0.01, nbins=256),
-        variable_kwargs=dict(),
-        color_required=False,
-        grayscale_only=False,
-        dtypes=None,
-        shapes=None,
-    ),
-    "histogram": dict(
-        fixed_kwargs=dict(source_range="image"),
-        variable_kwargs=dict(nbins=[16, 256], normalize=[True, False]),
-        color_required=False,
-        grayscale_only=True,
-        dtypes=None,
-        shapes=None,
-    ),
-}
-
-for function_name, fixed_kwargs, var_kwargs, allow_color in [
-    ("equalize_adapthist", dict(clip_limit=0.01, nbins=256), dict(), True),
-    (
-        "histogram",
-        dict(source_range="image"),
-        dict(nbins=[16, 256], normalize=[True, False]),
-        False,
-    ),
-    ("cumulative_distribution", dict(), dict(nbins=[16, 256]), False),
-    ("equalize_hist", dict(mask=None), dict(nbins=[16, 256]), False),
-    ("rescale_intensity", dict(in_range="image", out_range="dtype"), dict(), False),
-    ("adjust_gamma", dict(), dict(), False),
-    ("adjust_log", dict(), dict(), False),
-    ("adjust_sigmoid", dict(), dict(inv=[False, True]), False),
-    ("is_low_contrast", dict(), dict(), False),
-]:
-
-    for shape in [(512, 512), (3840, 2160), (3840, 2160, 3), (192, 192, 192)]:
-        ndim = len(shape)
-        if shape[-1] == 3 and not allow_color:
+def main(args):
+
+    pfile = "cucim_exposure_results.pickle"
+    if os.path.exists(pfile):
+        with open(pfile, "rb") as f:
+            all_results = pickle.load(f)
+    else:
+        all_results = pd.DataFrame()
+
+    dtypes = [np.dtype(args.dtype)]
+    # image sizes/shapes
+    shape = tuple(list(map(int,(args.img_size.split(',')))))
+    run_cpu = not args.no_cpu
+
+    # Benchmark table: (function name, fixed kwargs, variable kwargs, allow_color).
+    # Only the row whose name equals args.func_name is executed.
+    for function_name, fixed_kwargs, var_kwargs, allow_color in [
+        ("equalize_adapthist", dict(clip_limit=0.01, nbins=256), dict(), True),
+        ("histogram", dict(source_range="image"), dict(nbins=[16, 256], normalize=[True, False]), False),
+        ("cumulative_distribution", dict(), dict(nbins=[16, 256]), False),
+        ("equalize_hist", dict(mask=None), dict(nbins=[16, 256]), False),
+        ("rescale_intensity", dict(in_range="image", out_range="dtype"), dict(), False),
+        ("adjust_gamma", dict(), dict(), False),
+        ("adjust_log", dict(), dict(), False),
+        ("adjust_sigmoid", dict(), dict(inv=[False, True]), False),
+        ("is_low_contrast", dict(), dict(), False),
+        # match_histograms needs a row to be reachable; its kwargs are built in its own
+        # branch below, so the placeholder kwargs in this row are not used.
+        ("match_histograms", dict(), dict(), True),
+    ]:
+
+        if function_name != args.func_name:
             continue
 
-        if function_name == "equalize_adapthist":
-            #  TODO: fix equalize_adapthist for size (3840, 2160) and kernel_size = [16, 16]
-            size_factors = [4, 8, 16]
-            kernel_sizes = []
-            for size_factor in size_factors:
-                kernel_sizes.append([max(s // size_factor, 1) for s in shape if s != 3])
-            var_kwargs.update(dict(kernel_size=kernel_sizes))
-
-        B = ExposureBench(
-            function_name=function_name,
-            shape=shape,
-            dtypes=dtypes,
-            fixed_kwargs=fixed_kwargs,
-            var_kwargs=var_kwargs,
-            module_cpu=skimage.exposure,
-            module_gpu=cucim.skimage.exposure,
-        )
-        results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
-
-
-for shape in [(512, 512), (3840, 2160), (3840, 2160, 3), (192, 192, 192)]:
-    ndim = len(shape)
-
-    channel_axis = -1 if shape[-1] in [3, 4] else None
-
-    B = MatchHistogramBench(
-        function_name="match_histograms",
-        shape=shape,
-        dtypes=dtypes,
-        fixed_kwargs=dict(channel_axis=channel_axis),
-        var_kwargs=dict(),
-        module_cpu=skimage.exposure,
-        module_gpu=cucim.skimage.exposure,
-    )
-    results = B.run_benchmark(duration=1)
-    all_results = all_results.append(results["full"])
-
-fbase = os.path.splitext(pfile)[0]
-all_results.to_csv(fbase + ".csv")
-all_results.to_pickle(pfile)
-with open(fbase + ".md", "wt") as f:
-    f.write(all_results.to_markdown())
+        ndim = len(shape)
+
+        if function_name == 'match_histograms':
+
+            channel_axis = -1 if shape[-1] in [3, 4] else None
+
+            B = MatchHistogramBench(
+                function_name="match_histograms",
+                shape=shape,
+                dtypes=dtypes,
+                fixed_kwargs=dict(channel_axis=channel_axis),
+                var_kwargs=dict(),
+                module_cpu=skimage.exposure,
+                module_gpu=cucim.skimage.exposure,
+                run_cpu=run_cpu,
+            )
+            results = B.run_benchmark(duration=args.duration)
+            all_results = pd.concat([all_results, results["full"]])
+
+        else:
+
+            if shape[-1] == 3 and not allow_color:
+                continue
+
+            if function_name == "equalize_adapthist":
+                #  TODO: fix equalize_adapthist for size (3840, 2160) and kernel_size = [16, 16]
+                size_factors = [4, 8, 16]
+                kernel_sizes = []
+                for size_factor in size_factors:
+                    kernel_sizes.append([max(s // size_factor, 1) for s in shape if s != 3])
+                var_kwargs.update(dict(kernel_size=kernel_sizes))
+
+            B = ExposureBench(
+                function_name=function_name,
+                shape=shape,
+                dtypes=dtypes,
+                fixed_kwargs=fixed_kwargs,
+                var_kwargs=var_kwargs,
+                module_cpu=skimage.exposure,
+                module_gpu=cucim.skimage.exposure,
+                run_cpu=run_cpu,
+            )
+            results = B.run_benchmark(duration=args.duration)
+            all_results = pd.concat([all_results, results["full"]])
+
+    fbase = os.path.splitext(pfile)[0]
+    all_results.to_csv(fbase + ".csv")
+    all_results.to_pickle(pfile)
+    with open(fbase + ".md", "wt") as f:
+        f.write(all_results.to_markdown())
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Benchmarking cuCIM exposure functions')
+    # Must name every function that main() can benchmark, otherwise argparse rejects it.
+    func_name_choices = ['equalize_adapthist', 'histogram', 'cumulative_distribution', 'equalize_hist', 'rescale_intensity', 'adjust_gamma', 'adjust_log', 'adjust_sigmoid', 'is_low_contrast', 'match_histograms']
+    dtype_choices = ['float16', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64']
+    parser.add_argument('-i','--img_size', type=str, help='Size of input image', required=True)
+    parser.add_argument('-d','--dtype', type=str, help='Dtype of input image', choices=dtype_choices, required=True)
+    parser.add_argument('-f','--func_name', type=str, help='function to benchmark', choices=func_name_choices, required=True)
+    parser.add_argument('-t','--duration', type=int, help='time to run benchmark', required=True)
+    parser.add_argument('--no_cpu', action='store_true', help='disable cpu measurements', default=False)
+    args = parser.parse_args()
+    main(args)
diff --git a/benchmarks/skimage/cucim_feature_bench.py b/benchmarks/skimage/cucim_feature_bench.py
index 800086f7a..05f61493f 100644
--- a/benchmarks/skimage/cucim_feature_bench.py
+++ b/benchmarks/skimage/cucim_feature_bench.py
@@ -1,5 +1,6 @@
 import os
 import pickle
+import argparse
 
 import cucim.skimage
 import cucim.skimage.feature
@@ -26,102 +27,119 @@ def set_args(self, dtype):
         self.args_cpu = (image, template)
         self.args_gpu = (imaged, templated)
 
-
-pfile = "cucim_feature_results.pickle"
-if os.path.exists(pfile):
-    with open(pfile, "rb") as f:
-        all_results = pickle.load(f)
-else:
-    all_results = pd.DataFrame()
-dtypes = [np.float32]
-
-for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
-    ("multiscale_basic_features", dict(edges=True), dict(texture=[True, False]), True, True),
-    ("canny", dict(sigma=1.8), dict(), False, False),
-    # reduced default rings, histograms, orientations to fit daisy at (3840, 2160) into GPU memory
-    (
-        "daisy",
-        dict(step=4, radius=15, rings=2, histograms=5, orientations=4),
-        dict(normalization=["l1", "l2", "daisy"]),
-        False,
-        False,
-    ),
-    ("structure_tensor", dict(sigma=1, mode="reflect", order="rc"), dict(), False, True),
-    ("hessian_matrix", dict(sigma=1, mode="reflect", order="rc"), dict(), False, True),
-    ("hessian_matrix_det", dict(sigma=1, approximate=False), dict(), False, True),
-    ("shape_index", dict(sigma=1, mode="reflect"), dict(), False, False),
-    ("corner_kitchen_rosenfeld", dict(mode="reflect"), dict(), False, False),
-    ("corner_harris", dict(k=0.05, eps=1e-6, sigma=1), dict(method=["k", "eps"]), False, False),
-    ("corner_shi_tomasi", dict(sigma=1), dict(), False, False),
-    ("corner_foerstner", dict(sigma=1), dict(), False, False),
-    ("corner_peaks", dict(), dict(min_distance=(2, 3, 5)), False, True),
-]:
-
-    for shape in [(128, 128, 128), (512, 512), (3840, 2160), (3840, 2160, 3), (192, 192, 192)]:
-
-        if function_name in ["corner_peaks", "peak_local_max"] and np.prod(shape) > 1000000:
-            # skip any large sizes that take too long
-            continue
-        ndim = len(shape)
-        if not allow_nd:
-            if not allow_color:
-                if ndim > 2:
-                    continue
-            else:
-                if ndim > 3 or (ndim == 3 and shape[-1] not in [3, 4]):
-                    continue
-        if shape[-1] == 3 and not allow_color:
+def main(args):
+
+    pfile = "cucim_feature_results.pickle"
+    if os.path.exists(pfile):
+        with open(pfile, "rb") as f:
+            all_results = pickle.load(f)
+    else:
+        all_results = pd.DataFrame()
+
+    dtypes = [np.dtype(args.dtype)]
+
+    for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
+        ("multiscale_basic_features", dict(edges=True), dict(texture=[True, False]), True, True),
+        ("canny", dict(sigma=1.8), dict(), False, False),
+        # reduced default rings, histograms, orientations to fit daisy at (3840, 2160) into GPU memory
+        (
+            "daisy",
+            dict(step=4, radius=15, rings=2, histograms=5, orientations=4),
+            dict(normalization=["l1", "l2", "daisy"]),
+            False,
+            False,
+        ),
+        ("structure_tensor", dict(sigma=1, mode="reflect", order="rc"), dict(), False, True),
+        ("hessian_matrix", dict(sigma=1, mode="reflect", order="rc"), dict(), False, True),
+        ("hessian_matrix_det", dict(sigma=1, approximate=False), dict(), False, True),
+        ("shape_index", dict(sigma=1, mode="reflect"), dict(), False, False),
+        ("corner_kitchen_rosenfeld", dict(mode="reflect"), dict(), False, False),
+        ("corner_harris", dict(k=0.05, eps=1e-6, sigma=1), dict(method=["k", "eps"]), False, False),
+        ("corner_shi_tomasi", dict(sigma=1), dict(), False, False),
+        ("corner_foerstner", dict(sigma=1), dict(), False, False),
+        ("corner_peaks", dict(), dict(min_distance=(2, 3, 5)), False, True),
+        ("match_template", dict(), dict(pad_input=[False], mode=["reflect"]), False, True)
+    ]:
+
+        if function_name == args.func_name:
+            shape = tuple(list(map(int,(args.img_size.split(',')))))
+        else:
             continue
 
-        if function_name == "multiscale_basic_features":
-            fixed_kwargs["channel_axis"] = -1 if shape[-1] == 3 else None
-            if ndim == 3 and shape[-1] != 3:
-                # Omit texture=True case to avoid excessive GPU memory usage
-                var_kwargs["texture"] = [False]
-
-        B = ImageBench(
-            function_name=function_name,
-            shape=shape,
-            dtypes=dtypes,
-            fixed_kwargs=fixed_kwargs,
-            var_kwargs=var_kwargs,
-            module_cpu=skimage.feature,
-            module_gpu=cucim.skimage.feature,
-        )
-        results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
-
-
-for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
-    ("match_template", dict(), dict(pad_input=[False], mode=["reflect"]), False, True),
-]:
-    for shape in [(512, 512), (3840, 2160), (3840, 2160, 3), (192, 192, 192)]:
-
+        # Disabled guard: formerly skipped large sizes that take too long, i.e.
+        # function_name in ["corner_peaks", "peak_local_max"] and np.prod(shape) > 1000000
         ndim = len(shape)
-        if not allow_nd:
-            if allow_color:
-                if ndim > 2:
-                    continue
-            else:
-                if ndim > 3 or (ndim == 3 and shape[-1] not in [3, 4]):
-                    continue
-        if shape[-1] == 3 and not allow_color:
-            continue
-
-        B = MatchTemplateBench(
-            function_name=function_name,
-            shape=shape,
-            dtypes=dtypes,
-            fixed_kwargs=fixed_kwargs,
-            var_kwargs=var_kwargs,
-            module_cpu=skimage.feature,
-            module_gpu=cucim.skimage.feature,
-        )
-        results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
-
-fbase = os.path.splitext(pfile)[0]
-all_results.to_csv(fbase + ".csv")
-all_results.to_pickle(pfile)
-with open(fbase + ".md", "wt") as f:
-    f.write(all_results.to_markdown())
+        run_cpu = not args.no_cpu
+
+        if function_name != "match_template":
+            if not allow_nd:
+                if not allow_color:
+                    if ndim > 2:
+                        continue
+                else:
+                    if ndim > 3 or (ndim == 3 and shape[-1] not in [3, 4]):
+                        continue
+
+            if shape[-1] == 3 and not allow_color:
+                continue
+
+            if function_name == "multiscale_basic_features":
+                fixed_kwargs["channel_axis"] = -1 if shape[-1] == 3 else None
+                if ndim == 3 and shape[-1] != 3:
+                    # Omit texture=True case to avoid excessive GPU memory usage
+                    var_kwargs["texture"] = [False]
+
+            B = ImageBench(
+                function_name=function_name,
+                shape=shape,
+                dtypes=dtypes,
+                fixed_kwargs=fixed_kwargs,
+                var_kwargs=var_kwargs,
+                module_cpu=skimage.feature,
+                module_gpu=cucim.skimage.feature,
+                run_cpu=run_cpu,
+            )
+        else:
+            if not allow_nd:
+                if allow_color:
+                    if ndim > 2:
+                        continue
+                else:
+                    if ndim > 3 or (ndim == 3 and shape[-1] not in [3, 4]):
+                        continue
+            if shape[-1] == 3 and not allow_color:
+                continue
+
+            B = MatchTemplateBench(
+                function_name=function_name,
+                shape=shape,
+                dtypes=dtypes,
+                fixed_kwargs=fixed_kwargs,
+                var_kwargs=var_kwargs,
+                module_cpu=skimage.feature,
+                module_gpu=cucim.skimage.feature,
+                run_cpu=run_cpu,
+            )
+
+        results = B.run_benchmark(duration=args.duration)
+        all_results = pd.concat([all_results, results["full"]])
+
+    fbase = os.path.splitext(pfile)[0]
+    all_results.to_csv(fbase + ".csv")
+    all_results.to_pickle(pfile)
+    with open(fbase + ".md", "wt") as f:
+        f.write(all_results.to_markdown())
+
+
+if __name__ == "__main__":
+    # Script entry point: collect the benchmark options from the command line and run main().
+    parser = argparse.ArgumentParser(description="Benchmarking cuCIM Feature")
+    func_name_choices = ["multiscale_basic_features", "canny", "daisy", "structure_tensor", "hessian_matrix", "hessian_matrix_det", "shape_index", "corner_kitchen_rosenfeld", "corner_harris", "corner_shi_tomasi", "corner_foerstner", "corner_peaks", "match_template"]
+    dtype_choices = ["float16", "float32", "float64", "int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"]
+    parser.add_argument("-i", "--img_size", required=True, type=str, help="Size of input image")
+    parser.add_argument("-d", "--dtype", required=True, type=str, choices=dtype_choices, help="Dtype of input image")
+    parser.add_argument("-f", "--func_name", required=True, type=str, choices=func_name_choices, help="function to benchmark")
+    parser.add_argument("-t", "--duration", required=True, type=int, help="time to run benchmark")
+    parser.add_argument("--no_cpu", default=False, action="store_true", help="disable cpu measurements")
+    args = parser.parse_args()
+    main(args)
diff --git a/benchmarks/skimage/cucim_filters_bench.py b/benchmarks/skimage/cucim_filters_bench.py
index c60c27816..e12a9a9e8 100644
--- a/benchmarks/skimage/cucim_filters_bench.py
+++ b/benchmarks/skimage/cucim_filters_bench.py
@@ -1,5 +1,6 @@
 import os
 import pickle
+import argparse
 
 import cucim.skimage
 import cucim.skimage.filters
@@ -10,107 +11,117 @@
 
 from _image_bench import ImageBench
 
-pfile = "cucim_filters_results.pickle"
-if os.path.exists(pfile):
-    with open(pfile, "rb") as f:
-        all_results = pickle.load(f)
-else:
-    all_results = pd.DataFrame()
-dtypes = [np.float32]
-
-for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
-    # _gabor.py
-    (
-        "gabor",
-        dict(n_stds=3),
-        dict(frequency=[0.075, 0.1, 0.2, 0.3]),
-        False,
-        False,
-    ),
-    # _gaussian.py
-    (
-        "gaussian",
-        dict(truncate=4.0, preserve_range=True),
-        dict(sigma=[0.25, 1, 4]),
-        True,
-        True,
-    ),
-    # _median.py
-    ("median", dict(mode="nearest"), dict(), False, True),
-    # _rank_order.py
-    ("rank_order", dict(), dict(), False, True),
-    # _unsharp_mask.py
-    (
-        "unsharp_mask",
-        dict(),
-        dict(radius=[0.5, 1.0, 2.0, 3.0]),
-        True,
-        True,
-    ),
-    # edges.py
-    ("sobel", dict(), dict(axis=[None, 0, -1]), False, True),
-    ("prewitt", dict(), dict(axis=[None, 0, -1]), False, True),
-    ("scharr", dict(), dict(axis=[None, 0, -1]), False, True),
-    ("roberts", dict(), dict(), False, False),
-    ("roberts_pos_diag", dict(), dict(), False, False),
-    ("roberts_neg_diag", dict(), dict(), False, False),
-    ("farid", dict(), dict(), False, False),
-    ("laplace", dict(ksize=3), dict(), False, True),
-    # lpi_filter.py
-    # TODO: benchmark wiener
-    # ridges.py
-    # TODO: had to set meijering, etc allow_nd to False just due to insufficient GPU memory
-    (
-        "meijering",
-        dict(sigmas=range(1, 10, 2), alpha=None),
-        dict(black_ridges=[True, False], mode=["reflect"]),
-        False,
-        False,
-    ),
-    (
-        "sato",
-        dict(sigmas=range(1, 10, 2)),
-        dict(black_ridges=[True, False], mode=["reflect"]),
-        False,
-        False,
-    ),
-    (
-        "frangi",
-        dict(sigmas=range(1, 10, 2)),
-        dict(black_ridges=[True, False], mode=["reflect"]),
-        False,
-        False,
-    ),
-    (
-        "hessian",
-        dict(sigmas=range(1, 10, 2)),
-        dict(black_ridges=[True, False], mode=["reflect"]),
-        False,
-        False,
-    ),
-    # thresholding.py
-    ("threshold_isodata", dict(), dict(nbins=[64, 256]), False, True),
-    ("threshold_otsu", dict(), dict(nbins=[64, 256]), False, True),
-    ("threshold_yen", dict(), dict(nbins=[64, 256]), False, True),
-    # TODO: threshold_local should support n-dimensional data
-    (
-        "threshold_local",
-        dict(),
-        dict(block_size=[5, 15], method=["gaussian", "mean", "median"]),
-        False,
-        False,
-    ),
-    ("threshold_li", dict(), dict(), False, True),
-    ("threshold_minimum", dict(), dict(nbins=[64, 256]), False, True),
-    ("threshold_mean", dict(), dict(), False, True),
-    ("threshold_triangle", dict(), dict(nbins=[64, 256]), False, True),
-    ("threshold_niblack", dict(), dict(window_size=[7, 15, 65]), False, True),
-    ("threshold_sauvola", dict(), dict(window_size=[7, 15, 65]), False, True),
-    ("apply_hysteresis_threshold", dict(low=0.15, high=0.6), dict(), False, True),
-    ("threshold_multiotsu", dict(), dict(nbins=[64, 256], classes=[3]), False, True),
-]:
-
-    for shape in [(512, 512), (3840, 2160), (3840, 2160, 3), (192, 192, 192)]:
+def main(args):
+
+    pfile = "cucim_filters_results.pickle"
+    if os.path.exists(pfile):
+        with open(pfile, "rb") as f:
+            all_results = pickle.load(f)
+    else:
+        all_results = pd.DataFrame()
+    dtypes = [np.dtype(args.dtype)]
+
+    for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
+        # _gabor.py
+        (
+            "gabor",
+            dict(n_stds=3),
+            dict(frequency=[0.075, 0.1, 0.2, 0.3]),
+            False,
+            False,
+        ),
+        # _gaussian.py
+        (
+            "gaussian",
+            dict(truncate=4.0, preserve_range=True),
+            dict(sigma=[0.25, 1, 4]),
+            True,
+            True,
+        ),
+        # _median.py
+        ("median", dict(mode="nearest"), dict(), False, True),
+        # _rank_order.py
+        ("rank_order", dict(), dict(), False, True),
+        # _unsharp_mask.py
+        (
+            "unsharp_mask",
+            dict(),
+            dict(radius=[0.5, 1.0, 2.0, 3.0]),
+            True,
+            True,
+        ),
+        # edges.py
+        ("sobel", dict(), dict(axis=[None, 0, -1]), False, True),
+        ("prewitt", dict(), dict(axis=[None, 0, -1]), False, True),
+        ("scharr", dict(), dict(axis=[None, 0, -1]), False, True),
+        ("roberts", dict(), dict(), False, False),
+        ("roberts_pos_diag", dict(), dict(), False, False),
+        ("roberts_neg_diag", dict(), dict(), False, False),
+        ("farid", dict(), dict(), False, False),
+        ("laplace", dict(ksize=3), dict(), False, True),
+        # lpi_filter.py
+        # TODO: benchmark wiener
+        # ridges.py
+        # TODO: had to set meijering, etc allow_nd to False just due to insufficient GPU memory
+        (
+            "meijering",
+            dict(sigmas=range(1, 10, 2), alpha=None),
+            dict(black_ridges=[True, False], mode=["reflect"]),
+            False,
+            False,
+        ),
+        (
+            "sato",
+            dict(sigmas=range(1, 10, 2)),
+            dict(black_ridges=[True, False], mode=["reflect"]),
+            False,
+            False,
+        ),
+        (
+            "frangi",
+            dict(sigmas=range(1, 10, 2)),
+            dict(black_ridges=[True, False], mode=["reflect"]),
+            False,
+            False,
+        ),
+        (
+            "hessian",
+            dict(sigmas=range(1, 10, 2)),
+            dict(black_ridges=[True, False], mode=["reflect"]),
+            False,
+            False,
+        ),
+        # thresholding.py
+        ("threshold_isodata", dict(), dict(nbins=[64, 256]), False, True),
+        ("threshold_otsu", dict(), dict(nbins=[64, 256]), False, True),
+        ("threshold_yen", dict(), dict(nbins=[64, 256]), False, True),
+        # TODO: threshold_local should support n-dimensional data
+        (
+            "threshold_local",
+            dict(),
+            dict(block_size=[5, 15], method=["gaussian", "mean", "median"]),
+            False,
+            False,
+        ),
+        ("threshold_li", dict(), dict(), False, True),
+        ("threshold_minimum", dict(), dict(nbins=[64, 256]), False, True),
+        ("threshold_mean", dict(), dict(), False, True),
+        ("threshold_triangle", dict(), dict(nbins=[64, 256]), False, True),
+        ("threshold_niblack", dict(), dict(window_size=[7, 15, 65]), False, True),
+        ("threshold_sauvola", dict(), dict(window_size=[7, 15, 65]), False, True),
+        ("apply_hysteresis_threshold", dict(low=0.15, high=0.6), dict(), False, True),
+        ("threshold_multiotsu", dict(), dict(nbins=[64, 256], classes=[3]), False, True),
+    ]:
+        if function_name != args.func_name:
+            continue
+        else:
+            # image sizes/shapes
+            shape = tuple(list(map(int,(args.img_size.split(',')))))
+
+            # for shape in [(512, 512), (3840, 2160), (3840, 2160, 3), (192, 192, 192)]:
+
+        if function_name in ["gaussian", "unsharp_mask"]:
+            fixed_kwargs["channel_axis"] = -1 if shape[-1] == 3 else None
 
         ndim = len(shape)
         if not allow_nd:
@@ -131,7 +142,7 @@
             footprints = []
             ndim = len(shape)
             footprint_sizes = [3, 5, 7, 9] if ndim == 2 else [3, 5, 7]
-            for footprint_size in [3, 5, 7, 9]:
+            for footprint_size in footprint_sizes:
                 footprints.append(
                     np.ones((footprint_size,) * ndim, dtype=bool)
                 )
@@ -148,12 +159,27 @@
             var_kwargs=var_kwargs,
             module_cpu=skimage.filters,
             module_gpu=cucim.skimage.filters,
+            run_cpu=not args.no_cpu,
         )
-        results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
-
-fbase = os.path.splitext(pfile)[0]
-all_results.to_csv(fbase + ".csv")
-all_results.to_pickle(pfile)
-with open(fbase + ".md", "wt") as f:
-    f.write(all_results.to_markdown())
+        results = B.run_benchmark(duration=args.duration)
+        all_results = pd.concat([all_results, results["full"]])
+
+    fbase = os.path.splitext(pfile)[0]
+    all_results.to_csv(fbase + ".csv")
+    all_results.to_pickle(pfile)
+    with open(fbase + ".md", "wt") as f:
+        f.write(all_results.to_markdown())
+
+
+if __name__ == "__main__":
+    # Script entry point: collect the benchmark options from the command line and run main().
+    parser = argparse.ArgumentParser(description="Benchmarking cuCIM Filters")
+    func_name_choices = ["gabor", "gaussian", "median", "rank_order", "unsharp_mask", "sobel", "prewitt", "scharr", "roberts", "roberts_pos_diag", "roberts_neg_diag", "farid", "laplace", "meijering", "sato", "frangi", "hessian", "threshold_isodata", "threshold_otsu", "threshold_yen", "threshold_local", "threshold_li", "threshold_minimum", "threshold_mean", "threshold_triangle", "threshold_niblack", "threshold_sauvola", "apply_hysteresis_threshold", "threshold_multiotsu"]
+    dtype_choices = ["float16", "float32", "float64", "int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"]
+    parser.add_argument("-i", "--img_size", required=True, type=str, help="Size of input image")
+    parser.add_argument("-d", "--dtype", required=True, type=str, choices=dtype_choices, help="Dtype of input image")
+    parser.add_argument("-f", "--func_name", required=True, type=str, choices=func_name_choices, help="function to benchmark")
+    parser.add_argument("-t", "--duration", required=True, type=int, help="time to run benchmark")
+    parser.add_argument("--no_cpu", default=False, action="store_true", help="disable cpu measurements")
+    args = parser.parse_args()
+    main(args)
diff --git a/benchmarks/skimage/cucim_measure_bench.py b/benchmarks/skimage/cucim_measure_bench.py
index badb53e73..c5f207b64 100644
--- a/benchmarks/skimage/cucim_measure_bench.py
+++ b/benchmarks/skimage/cucim_measure_bench.py
@@ -1,3 +1,4 @@
+import argparse
 import math
 import os
 import pickle
@@ -25,6 +26,7 @@ def __init__(
         index_str=None,
         module_cpu=skimage.measure,
         module_gpu=cucim.skimage.measure,
+        run_cpu=True,
     ):
 
         self.contiguous_labels = contiguous_labels
@@ -38,6 +40,7 @@ def __init__(
             index_str=index_str,
             module_cpu=module_cpu,
             module_gpu=module_gpu,
+            run_cpu=run_cpu,
         )
 
     def set_args(self, dtype):
@@ -49,7 +52,7 @@ def set_args(self, dtype):
                 [0, 0, 0, 0, 0, 5, 0, 0],
             ]
         )
-        tiling = tuple(s // a_s for s, a_s in zip(shape, a.shape))
+        tiling = tuple(s // a_s for s, a_s in zip(self.shape, a.shape))
         if self.contiguous_labels:
             image = np.kron(a, np.ones(tiling, dtype=a.dtype))
         else:
@@ -71,6 +74,7 @@ def __init__(
         index_str=None,
         module_cpu=skimage.measure,
         module_gpu=cucim.skimage.measure,
+        run_cpu=True,
     ):
 
         self.contiguous_labels = contiguous_labels
@@ -84,6 +88,7 @@ def __init__(
             index_str=index_str,
             module_cpu=module_cpu,
             module_gpu=module_gpu,
+            run_cpu=run_cpu,
         )
 
     def set_args(self, dtype):
@@ -95,7 +100,7 @@ def set_args(self, dtype):
                 [0, 0, 0, 0, 0, 5, 0, 0],
             ]
         )
-        tiling = tuple(s // a_s for s, a_s in zip(shape, a.shape))
+        tiling = tuple(s // a_s for s, a_s in zip(self.shape, a.shape))
         if self.contiguous_labels:
             image = np.kron(a, np.ones(tiling, dtype=a.dtype))
         else:
@@ -127,32 +132,70 @@ def set_args(self, dtype):
         self.args_gpu = (imaged,)
 
 
-pfile = "cucim_measure_results.pickle"
-if os.path.exists(pfile):
-    with open(pfile, "rb") as f:
-        all_results = pickle.load(f)
-else:
-    all_results = pd.DataFrame()
-dtypes = [np.float32]
-
-for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
-    # _gaussian.py
-    (
-        "label",
-        dict(return_num=False, background=0),
-        dict(connectivity=[1, 2]),
-        False,
-        True,
-    ),
-    # regionprops.py
-    ("regionprops", dict(), dict(), False, True),
-]:
-
-    for shape in [(512, 512), (3840, 2160), (3840, 2160, 3), (192, 192, 192)]:
+def main(args):
+
+    pfile = "cucim_measure_results.pickle"
+    if os.path.exists(pfile):
+        with open(pfile, "rb") as f:
+            all_results = pickle.load(f)
+    else:
+        all_results = pd.DataFrame()
+
+    dtypes = [np.dtype(args.dtype)]
+    # image sizes/shapes
+    shape = tuple(list(map(int,(args.img_size.split(',')))))
+    run_cpu = not args.no_cpu
+
+    for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
+        # _label.py
+        (
+            "label",
+            dict(return_num=False, background=0),
+            dict(connectivity=[1, 2]),
+            False,
+            True,
+        ),
+        # regionprops.py
+        ("regionprops", dict(), dict(), False, True),
+        # _moments.py
+        ("moments", dict(), dict(order=[1, 2, 3, 4]), False, False),
+        ("moments_central", dict(), dict(order=[1, 2, 3]), False, True),
+        # omitted from benchmarks (only tiny arrays): moments_normalized, moments_hu
+        ("centroid", dict(), dict(), False, True),
+        ("inertia_tensor", dict(), dict(), False, True),
+        ("inertia_tensor_eigvals", dict(), dict(), False, True),
+        # _polygon.py
+        # TODO: approximate_polygon, subdivide_polygon
+        # block.py
+        (
+            "block_reduce",
+            dict(),
+            dict(
+                func=[
+                    cp.sum,
+                ]
+            ),
+            True,
+            True,
+        ),  # variable block_size configured below
+        # entropy.py
+        ("shannon_entropy", dict(base=2), dict(), True, True),
+        # profile.py
+        (
+            "profile_line",
+            dict(src=(5, 7)),
+            dict(reduce_func=[cp.mean], linewidth=[1, 2, 4], order=[1, 3]),
+            True,
+            False,
+        ),  # variable block_size configured below
+    ]:
+
+        if function_name != args.func_name:
+            continue
 
         ndim = len(shape)
         if not allow_nd:
-            if allow_color:
+            if not allow_color:
                 if ndim > 2:
                     continue
             else:
@@ -161,118 +204,89 @@ def set_args(self, dtype):
         if shape[-1] == 3 and not allow_color:
             continue
 
-        Tester = LabelBench if function_name == "label" else RegionpropsBench
+        if function_name in ['label', 'regionprops']:
+
+            Tester = LabelBench if function_name == "label" else RegionpropsBench
+
+            for contiguous_labels in [True, False]:
+                if contiguous_labels:
+                    index_str = f"contiguous"
+                else:
+                    index_str = None
+                B = Tester(
+                    function_name=function_name,
+                    shape=shape,
+                    dtypes=dtypes,
+                    contiguous_labels=contiguous_labels,
+                    index_str=index_str,
+                    fixed_kwargs=fixed_kwargs,
+                    var_kwargs=var_kwargs,
+                    module_cpu=skimage.measure,
+                    module_gpu=cucim.skimage.measure,
+                    run_cpu=run_cpu,
+                )
+                results = B.run_benchmark(duration=args.duration)
+                all_results = pd.concat([all_results, results["full"]])
+        else:
+
 
-        for contiguous_labels in [True, False]:
-            if contiguous_labels:
-                index_str = f"contiguous"
-            else:
-                index_str = None
-            B = Tester(
+            if function_name == "gabor" and np.prod(shape) > 1000000:
+                # avoid cases that are too slow on the CPU
+                var_kwargs["frequency"] = [f for f in var_kwargs["frequency"] if f >= 0.1]
+
+            if function_name == "block_reduce":
+                ndim = len(shape)
+                if shape[-1] == 3:
+                    block_sizes = [(b,) * (ndim - 1) + (3,) for b in (16, 32, 64)]
+                else:
+                    block_sizes = [(b,) * ndim for b in (16, 32, 64)]
+                var_kwargs["block_size"] = block_sizes
+
+            if function_name == "profile_line":
+                fixed_kwargs["dst"] = (shape[0] - 32, shape[1] + 9)
+
+            if function_name == "median":
+                footprints = []
+                ndim = len(shape)
+                footprint_sizes = [3, 5, 7, 9] if ndim == 2 else [3, 5, 7]  # skip the 9-wide footprint for non-2D shapes
+                for footprint_size in footprint_sizes:
+                    footprints.append(
+                        np.ones((footprint_size,) * ndim, dtype=bool)  # one scalar edge length per axis
+                    )
+                var_kwargs["footprint"] = footprints
+
+            if function_name in ["gaussian", "unsharp_mask"]:
+                fixed_kwargs["channel_axis"] = -1 if shape[-1] == 3 else None
+
+            B = FiltersBench(
                 function_name=function_name,
                 shape=shape,
                 dtypes=dtypes,
-                contiguous_labels=contiguous_labels,
-                index_str=index_str,
                 fixed_kwargs=fixed_kwargs,
                 var_kwargs=var_kwargs,
                 module_cpu=skimage.measure,
                 module_gpu=cucim.skimage.measure,
+                run_cpu=run_cpu,
             )
-            results = B.run_benchmark(duration=1)
-            all_results = all_results.append(results["full"])
-
-
-for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
-    # _moments.py
-    ("moments", dict(), dict(order=[1, 2, 3, 4]), False, False),
-    ("moments_central", dict(), dict(order=[1, 2, 3]), False, True),
-    # omited from benchmarks (only tiny arrays): moments_normalized, moments_hu
-    ("centroid", dict(), dict(), False, True),
-    ("inertia_tensor", dict(), dict(), False, True),
-    ("inertia_tensor_eigvals", dict(), dict(), False, True),
-    # _polygon.py
-    # TODO: approximate_polygon, subdivide_polygon
-    # block.py
-    (
-        "block_reduce",
-        dict(),
-        dict(
-            func=[
-                cp.sum,
-            ]
-        ),
-        True,
-        True,
-    ),  # variable block_size configured below
-    # entropy.py
-    ("shannon_entropy", dict(base=2), dict(), True, True),
-    # profile.py
-    (
-        "profile_line",
-        dict(src=(5, 7)),
-        dict(reduce_func=[cp.mean], linewidth=[1, 2, 4], order=[1, 3]),
-        True,
-        False,
-    ),  # variable block_size configured below
-]:
-
-    for shape in [(512, 512), (3840, 2160), (3840, 2160, 3), (192, 192, 192)]:
-
-        ndim = len(shape)
-        if not allow_nd:
-            if not allow_color:
-                if ndim > 2:
-                    continue
-            else:
-                if ndim > 3 or (ndim == 3 and shape[-1] not in [3, 4]):
-                    continue
-        if shape[-1] == 3 and not allow_color:
-            continue
-
-        if function_name == "gabor" and np.prod(shape) > 1000000:
-            # avoid cases that are too slow on the CPU
-            var_kwargs["frequency"] = [f for f in var_kwargs["frequency"] if f >= 0.1]
-
-        if function_name == "block_reduce":
-            ndim = len(shape)
-            if shape[-1] == 3:
-                block_sizes = [(b,) * (ndim - 1) + (3,) for b in (16, 32, 64)]
-            else:
-                block_sizes = [(b,) * ndim for b in (16, 32, 64)]
-            var_kwargs["block_size"] = block_sizes
-
-        if function_name == "profile_line":
-            fixed_kwargs["dst"] = (shape[0] - 32, shape[1] + 9)
-
-        if function_name == "median":
-            footprints = []
-            ndim = len(shape)
-            footprint_sizes = [3, 5, 7, 9] if ndim == 2 else [3, 5, 7]
-            for footprint_size in [3, 5, 7, 9]:
-                footprints.append(
-                    np.ones((footprint_sizes,) * ndim, dtype=bool)
-                )
-            var_kwargs["footprint"] = footprints
-
-        if function_name in ["gaussian", "unsharp_mask"]:
-            fixed_kwargs["channel_axis"] = -1 if shape[-1] == 3 else None
-
-        B = FiltersBench(
-            function_name=function_name,
-            shape=shape,
-            dtypes=dtypes,
-            fixed_kwargs=fixed_kwargs,
-            var_kwargs=var_kwargs,
-            module_cpu=skimage.measure,
-            module_gpu=cucim.skimage.measure,
-        )
-        results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
-
-
-fbase = os.path.splitext(pfile)[0]
-all_results.to_csv(fbase + ".csv")
-all_results.to_pickle(pfile)
-with open(fbase + ".md", "wt") as f:
-    f.write(all_results.to_markdown())
+            results = B.run_benchmark(duration=args.duration)
+            all_results = pd.concat([all_results, results["full"]])
+
+    fbase = os.path.splitext(pfile)[0]
+    all_results.to_csv(fbase + ".csv")
+    all_results.to_pickle(pfile)
+    with open(fbase + ".md", "wt") as f:
+        f.write(all_results.to_markdown())
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Benchmarking cuCIM measure functions')
+    func_name_choices = ['label', 'regionprops', 'moments', 'moments_central', 'centroid', 'inertia_tensor', 'inertia_tensor_eigvals', 'block_reduce', 'shannon_entropy', 'profile_line']
+    dtype_choices = ['float16', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64']
+    parser.add_argument('-i','--img_size', type=str, help='Size of input image', required=True)
+    parser.add_argument('-d','--dtype', type=str, help='Dtype of input image', choices=dtype_choices, required=True)
+    parser.add_argument('-f','--func_name', type=str, help='function to benchmark', choices=func_name_choices, required=True)
+    parser.add_argument('-t','--duration', type=int, help='time to run benchmark', required=True)
+    parser.add_argument('--no_cpu', action='store_true', help='disable cpu measurements', default=False)
+
+    args = parser.parse_args()
+    main(args)
diff --git a/benchmarks/skimage/cucim_metrics_bench.py b/benchmarks/skimage/cucim_metrics_bench.py
index 713c78fbd..af6b94814 100644
--- a/benchmarks/skimage/cucim_metrics_bench.py
+++ b/benchmarks/skimage/cucim_metrics_bench.py
@@ -1,3 +1,4 @@
+import argparse
 import os
 import pickle
 
@@ -17,44 +18,51 @@ def set_args(self, dtype):
         imaged = cp.testing.shaped_arange(self.shape, dtype=dtype)
         imaged2 = cp.testing.shaped_arange(self.shape, dtype=dtype)
         imaged2 = imaged2 + 0.05 * cp.random.standard_normal(self.shape)
-        imaged /= imaged.max()
-        imaged2 /= imaged2.max()
+        imaged = imaged / imaged.max()
+        imaged2 = imaged2 / imaged2.max()
         imaged2 = imaged2.clip(0, 1.0)
         self.args_cpu = (cp.asnumpy(imaged), cp.asnumpy(imaged2))
         self.args_gpu = (imaged, imaged2)
 
 
-pfile = "cucim_metrics_results.pickle"
-if os.path.exists(pfile):
-    with open(pfile, "rb") as f:
-        all_results = pickle.load(f)
-else:
-    all_results = pd.DataFrame()
-dtypes = [np.float32]
-
-
-for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
-    # _structural_similarity.py
-    (
-        "structural_similarity",
-        dict(data_range=1.0),
-        dict(gradient=[False, True], gaussian_weights=[False, True]),
-        True,
-        True,
-    ),
-    # simple_metrics.py
-    ("mean_squared_error", dict(), dict(), True, True),
-    (
-        "normalized_root_mse",
-        dict(),
-        dict(normalization=["euclidean", "min-max", "mean"]),
-        True,
-        True,
-    ),
-    ("peak_signal_noise_ratio", dict(data_range=1.0), dict(), True, True),
-]:
-
-    for shape in [(512, 512), (3840, 2160), (3840, 2160, 3), (192, 192, 192)]:
+def main(args):
+
+    pfile = "cucim_metrics_results.pickle"
+    if os.path.exists(pfile):
+        with open(pfile, "rb") as f:
+            all_results = pickle.load(f)
+    else:
+        all_results = pd.DataFrame()
+
+    dtypes = [np.dtype(args.dtype)]
+    # image sizes/shapes
+    shape = tuple(list(map(int,(args.img_size.split(',')))))
+    run_cpu = not args.no_cpu
+
+    for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
+        # _structural_similarity.py
+        (
+            "structural_similarity",
+            dict(data_range=1.0),
+            dict(gradient=[False, True], gaussian_weights=[False, True]),
+            True,
+            True,
+        ),
+        # simple_metrics.py
+        ("mean_squared_error", dict(), dict(), True, True),
+        (
+            "normalized_root_mse",
+            dict(),
+            dict(normalization=["euclidean", "min-max", "mean"]),
+            True,
+            True,
+        ),
+        ("peak_signal_noise_ratio", dict(data_range=1.0), dict(), True, True),
+        ("normalized_mutual_information", dict(bins=100), dict(), True, True),
+
+    ]:
+        if function_name != args.func_name:
+            continue
 
         ndim = len(shape)
         if not allow_nd:
@@ -78,13 +86,28 @@ def set_args(self, dtype):
             var_kwargs=var_kwargs,
             module_cpu=skimage.metrics,
             module_gpu=cucim.skimage.metrics,
+            run_cpu=run_cpu,
         )
-        results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
+        results = B.run_benchmark(duration=args.duration)
+        all_results = pd.concat([all_results, results["full"]])
+
+
+    fbase = os.path.splitext(pfile)[0]
+    all_results.to_csv(fbase + ".csv")
+    all_results.to_pickle(pfile)
+    with open(fbase + ".md", "wt") as f:
+        f.write(all_results.to_markdown())
+
 
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Benchmarking cuCIM metrics functions')
+    func_name_choices = ['structural_similarity', 'mean_squared_error', 'normalized_root_mse', 'peak_signal_noise_ratio', 'normalized_mutual_information']
+    dtype_choices = ['float16', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64']
+    parser.add_argument('-i','--img_size', type=str, help='Size of input image', required=True)
+    parser.add_argument('-d','--dtype', type=str, help='Dtype of input image', choices=dtype_choices, required=True)
+    parser.add_argument('-f','--func_name', type=str, help='function to benchmark', choices=func_name_choices, required=True)
+    parser.add_argument('-t','--duration', type=int, help='time to run benchmark', required=True)
+    parser.add_argument('--no_cpu', action='store_true', help='disable cpu measurements', default=False)
 
-fbase = os.path.splitext(pfile)[0]
-all_results.to_csv(fbase + ".csv")
-all_results.to_pickle(pfile)
-with open(fbase + ".md", "wt") as f:
-    f.write(all_results.to_markdown())
+    args = parser.parse_args()
+    main(args)
diff --git a/benchmarks/skimage/cucim_morphology_bench.py b/benchmarks/skimage/cucim_morphology_bench.py
index dd8fbed4c..8fe1d6a95 100644
--- a/benchmarks/skimage/cucim_morphology_bench.py
+++ b/benchmarks/skimage/cucim_morphology_bench.py
@@ -1,3 +1,4 @@
+import argparse
 import copy
 import functools
 import math
@@ -30,6 +31,7 @@ def __init__(
         var_kwargs={},
         module_cpu=skimage.morphology,
         module_gpu=cucim.skimage.morphology,
+        run_cpu=True,
     ):
 
         array_kwargs = dict(footprint=footprint)
@@ -47,6 +49,7 @@ def __init__(
             index_str=index_str,
             module_cpu=module_cpu,
             module_gpu=module_gpu,
+            run_cpu=run_cpu,
         )
 
     def set_args(self, dtype):
@@ -104,169 +107,169 @@ def set_args(self, dtype):
         self.args_gpu = (a, 5)
 
 
-pfile = "cucim_morphology_results.pickle"
-if os.path.exists(pfile):
-    with open(pfile, "rb") as f:
-        all_results = pickle.load(f)
-else:
-    all_results = pd.DataFrame()
-dtypes_gray = [np.float32]
-
-
-for function_name, fixed_kwargs, var_kwargs, allow_nd in [
-    ("thin", dict(), dict(), True),
-]:
-
-    for shape in [(512, 512), (3840, 2160)]:
-
-        ndim = len(shape)
-        if ndim != 2:
-            raise ValueError("only 2d benchmark data has been implemented")
-
-        if not allow_nd and ndim > 2:
+def main(args):
+
+    pfile = "cucim_morphology_results.pickle"
+    if os.path.exists(pfile):
+        with open(pfile, "rb") as f:
+            all_results = pickle.load(f)
+    else:
+        all_results = pd.DataFrame()
+
+    dtypes = [np.dtype(args.dtype)]
+    # image sizes/shapes
+    shape = tuple(list(map(int,(args.img_size.split(',')))))
+    run_cpu = not args.no_cpu
+
+    for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
+        # binary.py
+        ("binary_erosion", dict(), dict(), False, True),
+        ("binary_dilation", dict(), dict(), False, True),
+        ("binary_opening", dict(), dict(), False, True),
+        ("binary_closing", dict(), dict(), False, True),
+        # misc.py
+        ("remove_small_objects", dict(), dict(), False, True),
+        ("remove_small_holes", dict(), dict(), False, True),
+        # gray.py
+        ("erosion", dict(), dict(), False, True),
+        ("dilation", dict(), dict(), False, True),
+        ("opening", dict(), dict(), False, True),
+        ("closing", dict(), dict(), False, True),
+        ("white_tophat", dict(), dict(), False, True),
+        ("black_tophat", dict(), dict(), False, True),
+        # _skeletonize.py
+        ("thin", dict(), dict(), False, True),
+        # grayreconstruct.py
+        ("reconstruction", dict(), dict(), False, True),
+        # footprints.py
+        # OMIT the functions from this file (each creates a structuring element)
+    ]:
+
+        if function_name != args.func_name:
             continue
 
-
-        B = SkeletonizeBench(
-            function_name=function_name,
-            shape=shape,
-            dtypes=[bool],
-            fixed_kwargs={},
-            var_kwargs=var_kwargs,
-            module_cpu=skimage.morphology,
-            module_gpu=cucim.skimage.morphology,
-        )
-        results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
-
-
-for function_name, fixed_kwargs, var_kwargs, allow_nd in [
-    ("binary_erosion", dict(), dict(), True),
-    ("binary_dilation", dict(), dict(), True),
-    ("binary_opening", dict(), dict(), True),
-    ("binary_closing", dict(), dict(), True),
-]:
-
-    for shape in [(512, 512), (3840, 2160), (192, 192, 192)]:
-
         ndim = len(shape)
-        if not allow_nd and ndim > 2:
-            continue
+        if function_name == 'thin':
+            if ndim != 2:
+                raise ValueError("only 2d benchmark data has been implemented")
 
-        for connectivity in range(1, ndim + 1):
-            index_str = f"conn={connectivity}"
-            footprint = ndi.generate_binary_structure(ndim, connectivity)
+            if not allow_nd and ndim > 2:
+                continue
 
-            B = BinaryMorphologyBench(
+            B = SkeletonizeBench(
                 function_name=function_name,
                 shape=shape,
                 dtypes=[bool],
-                footprint=footprint,
                 fixed_kwargs={},
                 var_kwargs=var_kwargs,
-                index_str=index_str,
                 module_cpu=skimage.morphology,
                 module_gpu=cucim.skimage.morphology,
+                run_cpu=run_cpu,
             )
-            results = B.run_benchmark(duration=1)
-            all_results = all_results.append(results["full"])
-
 
-for function_name, fixed_kwargs, var_kwargs, allow_nd in [
-    # misc.py
-    ("remove_small_objects", dict(), dict(), True),
-    ("remove_small_holes", dict(), dict(), True),
-]:
+            results = B.run_benchmark(duration=args.duration)
+            all_results = pd.concat([all_results, results["full"]])
+        elif function_name.startswith('binary_'):
+
+            if not allow_nd and ndim > 2:
+                continue
+
+            for connectivity in range(1, ndim + 1):
+                index_str = f"conn={connectivity}"
+                footprint = ndi.generate_binary_structure(ndim, connectivity)
+
+                B = BinaryMorphologyBench(
+                    function_name=function_name,
+                    shape=shape,
+                    dtypes=[bool],
+                    footprint=footprint,
+                    fixed_kwargs={},
+                    var_kwargs=var_kwargs,
+                    index_str=index_str,
+                    module_cpu=skimage.morphology,
+                    module_gpu=cucim.skimage.morphology,
+                    run_cpu=run_cpu,
+                )
+                results = B.run_benchmark(duration=args.duration)
+                all_results = pd.concat([all_results, results["full"]])
 
-    for shape in [(512, 512), (3840, 2160), (192, 192, 192)]:
 
-        ndim = len(shape)
-        if not allow_nd and ndim > 2:
-            continue
+        elif function_name in ['remove_small_holes', 'remove_small_objects']:
+            if not allow_nd and ndim > 2:
+                continue
 
-        if function_name == "remove_small_objects":
-            TestClass = RemoveSmallObjectsBench
-        elif function_name == "remove_small_holes":
-            TestClass = RemoveSmallHolesBench
-        else:
-            raise ValueError(f"unknown function: {function_name}")
-        B = TestClass(
-            function_name=function_name,
-            shape=shape,
-            dtypes=[bool],
-            fixed_kwargs=fixed_kwargs,
-            var_kwargs=var_kwargs,
-            module_cpu=skimage.morphology,
-            module_gpu=cucim.skimage.morphology,
-        )
-        results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
-
-
-for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
-    # grey.py
-    ("erosion", dict(), dict(), False, True),
-    ("dilation", dict(), dict(), False, True),
-    ("opening", dict(), dict(), False, True),
-    ("closing", dict(), dict(), False, True),
-    ("white_tophat", dict(), dict(), False, True),
-    ("black_tophat", dict(), dict(), False, True),
-    # greyreconstruct.py
-    ("reconstruction", dict(), dict(), False, True),
-    # footprints.py
-    # OMIT the functions from this file (each creates a structuring element)
-]:
-
-    for shape in [(512, 512), (3840, 2160), (3840, 2160, 3), (192, 192, 192)]:
+            if function_name == "remove_small_objects":
+                TestClass = RemoveSmallObjectsBench
+            elif function_name == "remove_small_holes":
+                TestClass = RemoveSmallHolesBench
 
-        ndim = len(shape)
-        if not allow_nd:
-            if not allow_color:
-                if ndim > 2:
-                    continue
             else:
-                if ndim > 3 or (ndim == 3 and shape[-1] not in [3, 4]):
-                    continue
-        if shape[-1] == 3 and not allow_color:
-            continue
-
-        if function_name == "gabor" and np.prod(shape) > 1000000:
-            # avoid cases that are too slow on the CPU
-            var_kwargs["frequency"] = [f for f in var_kwargs["frequency"] if f >= 0.1]
-
-        if function_name == "median":
-            footprints = []
-            ndim = len(shape)
-            footprint_sizes = [3, 5, 7, 9] if ndim == 2 else [3, 5, 7]
-            for footprint_size in [3, 5, 7, 9]:
-                footprints.append(
-                    np.ones((footprint_sizes,) * ndim, dtype=bool)
-                )
-            var_kwargs["footprint"] = footprints
-
-        if function_name in ["gaussian", "unsharp_mask"]:
-            fixed_kwargs["channel_axis"] = -1 if shape[-1] == 3 else None
+                raise ValueError(f"unknown function: {function_name}")
+            B = TestClass(
+                function_name=function_name,
+                shape=shape,
+                dtypes=[bool],
+                fixed_kwargs=fixed_kwargs,
+                var_kwargs=var_kwargs,
+                module_cpu=skimage.morphology,
+                module_gpu=cucim.skimage.morphology,
+                run_cpu=run_cpu,
+            )
+            results = B.run_benchmark(duration=args.duration)
+            all_results = pd.concat([all_results, results["full"]])
 
-        if function_name == "reconstruction":
-            TestClass = ReconstructionBench
         else:
-            TestClass = ImageBench
 
-        B = TestClass(
-            function_name=function_name,
-            shape=shape,
-            dtypes=dtypes_gray,
-            fixed_kwargs=fixed_kwargs,
-            var_kwargs=var_kwargs,
-            module_cpu=skimage.morphology,
-            module_gpu=cucim.skimage.morphology,
-        )
-        results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
+            if not allow_nd:
+                if not allow_color:
+                    if ndim > 2:
+                        continue
+                else:
+                    if ndim > 3 or (ndim == 3 and shape[-1] not in [3, 4]):
+                        continue
+
+            if shape[-1] == 3 and not allow_color:
+                continue
 
+            if function_name == "reconstruction":
+                TestClass = ReconstructionBench
+            else:
+                TestClass = ImageBench
 
-fbase = os.path.splitext(pfile)[0]
-all_results.to_csv(fbase + ".csv")
-all_results.to_pickle(pfile)
-with open(fbase + ".md", "wt") as f:
-    f.write(all_results.to_markdown())
+            B = TestClass(
+                function_name=function_name,
+                shape=shape,
+                dtypes=dtypes,
+                fixed_kwargs=fixed_kwargs,
+                var_kwargs=var_kwargs,
+                module_cpu=skimage.morphology,
+                module_gpu=cucim.skimage.morphology,
+                run_cpu=run_cpu,
+            )
+            results = B.run_benchmark(duration=args.duration)
+            all_results = pd.concat([all_results, results["full"]])
+
+    fbase = os.path.splitext(pfile)[0]
+    all_results.to_csv(fbase + ".csv")
+    all_results.to_pickle(pfile)
+    try:
+        import tabulate  # noqa: F401 -- DataFrame.to_markdown depends on the tabulate package
+
+        with open(fbase + ".md", "wt") as f:
+            f.write(all_results.to_markdown())
+    except ImportError:
+        pass  # markdown report is optional; skipped when tabulate is not installed
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Benchmarking cuCIM morphology functions')
+    func_name_choices = ['binary_erosion', 'binary_dilation', 'binary_opening', 'binary_closing', 'remove_small_objects', 'remove_small_holes', 'erosion', 'dilation', 'opening', 'closing', 'white_tophat', 'black_tophat', 'thin', 'reconstruction']
+    dtype_choices = ['float16', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64']
+    parser.add_argument('-i','--img_size', type=str, help='Size of input image (omit color channel, it will be appended as needed)', required=True)
+    parser.add_argument('-d','--dtype', type=str, help='Dtype of input image', choices = dtype_choices, required=True)
+    parser.add_argument('-f','--func_name', type=str, help='function to benchmark', choices = func_name_choices, required=True)
+    parser.add_argument('-t','--duration', type=int, help='time to run benchmark', required=True)
+    parser.add_argument('--no_cpu', action='store_true', help='disable cpu measurements', default=False)
+
+    args = parser.parse_args()
+    main(args)
diff --git a/benchmarks/skimage/cucim_registration_bench.py b/benchmarks/skimage/cucim_registration_bench.py
index d90bd16a4..6d275dcec 100644
--- a/benchmarks/skimage/cucim_registration_bench.py
+++ b/benchmarks/skimage/cucim_registration_bench.py
@@ -1,3 +1,4 @@
+import argparse
 import math
 import os
 import pickle
@@ -33,48 +34,93 @@ def set_args(self, dtype):
         self.args_gpu = (imaged, imaged2)
 
 
-pfile = "cucim_registration_results.pickle"
-if os.path.exists(pfile):
-    with open(pfile, "rb") as f:
-        all_results = pickle.load(f)
-else:
-    all_results = pd.DataFrame()
-dtypes = [np.float32]
-
-
-for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
-    # _phase_cross_correlation.py
-    ("phase_cross_correlation", dict(), dict(), False, True),
-]:
+def main(args):
+
+    pfile = "cucim_registration_results.pickle"
+    if os.path.exists(pfile):
+        with open(pfile, "rb") as f:
+            all_results = pickle.load(f)
+    else:
+        all_results = pd.DataFrame()
+
+    dtypes = [np.dtype(args.dtype)]
+    # image sizes/shapes
+    shape = tuple(list(map(int,(args.img_size.split(',')))))
+    run_cpu = not args.no_cpu
+
+    for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
+        # _phase_cross_correlation.py
+        ("phase_cross_correlation", dict(), dict(), False, True),
+        # optical flow functions
+        ("optical_flow_tvl1", dict(), dict(num_iter=[10], num_warp=[5]), False, True),
+        (
+            "optical_flow_ilk",
+            dict(),
+            dict(radius=[3, 7], num_warp=[10], gaussian=[False, True], prefilter=[False, True]),
+            False,
+            True,
+        ),
+
+    ]:
+
+        if function_name != args.func_name:
+            continue
 
-    for shape in [(512, 512), (3840, 2160), (3840, 2160, 3), (192, 192, 192)]:
+        if function_name == 'phase_cross_correlation':
+
+            ndim = len(shape)
+            if not allow_nd:
+                if not allow_color:
+                    if ndim > 2:
+                        continue
+                else:
+                    if ndim > 3 or (ndim == 3 and shape[-1] not in [3, 4]):
+                        continue
+            if shape[-1] == 3 and not allow_color:
+                continue
+
+            for masked in [True, False]:  # benchmark both the masked and unmasked variants
+
+                index_str = f"masked={masked}"
+                if masked:
+                    moving_mask = cp.ones(shape, dtype=bool)
+                    moving_mask[20:-20, :] = 0
+                    moving_mask[:, 20:-20] = 0
+                    reference_mask = cp.ones(shape, dtype=bool)
+                    reference_mask[80:-80, :] = 0
+                    reference_mask[:, 80:-80] = 0
+                    fixed_kwargs["moving_mask"] = moving_mask
+                    fixed_kwargs["reference_mask"] = reference_mask
+                else:
+                    fixed_kwargs["moving_mask"] = None
+                    fixed_kwargs["reference_mask"] = None
+
+                B = RegistrationBench(
+                    function_name=function_name,
+                    shape=shape,
+                    dtypes=dtypes,
+                    fixed_kwargs=fixed_kwargs,
+                    var_kwargs=var_kwargs,
+                    index_str=index_str,
+                    module_cpu=skimage.registration,
+                    module_gpu=cucim.skimage.registration,
+                    run_cpu=run_cpu,
+                )
+                results = B.run_benchmark(duration=args.duration)  # inside the loop: one run per masked setting
+                all_results = pd.concat([all_results, results["full"]])
 
-        ndim = len(shape)
-        if not allow_nd:
-            if not allow_color:
-                if ndim > 2:
-                    continue
-            else:
-                if ndim > 3 or (ndim == 3 and shape[-1] not in [3, 4]):
-                    continue
-        if shape[-1] == 3 and not allow_color:
-            continue
+        else:
 
-        for masked in [True, False]:
-
-            index_str = f"masked={masked}"
-            if masked:
-                moving_mask = cp.ones(shape, dtype=bool)
-                moving_mask[20:-20, :] = 0
-                moving_mask[:, 20:-20] = 0
-                reference_mask = cp.ones(shape, dtype=bool)
-                reference_mask[80:-80, :] = 0
-                reference_mask[:, 80:-80] = 0
-                fixed_kwargs["moving_mask"] = moving_mask
-                fixed_kwargs["reference_mask"] = reference_mask
-            else:
-                fixed_kwargs["moving_mask"] = None
-                fixed_kwargs["reference_mask"] = None
+            ndim = len(shape)
+            if not allow_nd:
+                if not allow_color:
+                    if ndim > 2:
+                        continue
+                else:
+                    if ndim > 3 or (ndim == 3 and shape[-1] not in [3, 4]):
+                        continue
+            if shape[-1] == 3 and not allow_color:
+                continue
 
             B = RegistrationBench(
                 function_name=function_name,
@@ -82,54 +128,35 @@ def set_args(self, dtype):
                 dtypes=dtypes,
                 fixed_kwargs=fixed_kwargs,
                 var_kwargs=var_kwargs,
-                index_str=index_str,
                 module_cpu=skimage.registration,
                 module_gpu=cucim.skimage.registration,
+                run_cpu=run_cpu,
             )
-            results = B.run_benchmark(duration=1)
-            all_results = all_results.append(results["full"])
-
-
-for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
-    # _phase_cross_correlation.py
-    ("optical_flow_tvl1", dict(), dict(num_iter=[10], num_warp=[5]), False, True),
-    (
-        "optical_flow_ilk",
-        dict(),
-        dict(radius=[3, 7], num_warp=[10], gaussian=[False, True], prefilter=[False, True]),
-        False,
-        True,
-    ),
-]:
-
-    for shape in [(512, 512), (3840, 2160), (3840, 2160, 3), (192, 192, 192)]:
-
-        ndim = len(shape)
-        if not allow_nd:
-            if not allow_color:
-                if ndim > 2:
-                    continue
-            else:
-                if ndim > 3 or (ndim == 3 and shape[-1] not in [3, 4]):
-                    continue
-        if shape[-1] == 3 and not allow_color:
-            continue
-
-        B = RegistrationBench(
-            function_name=function_name,
-            shape=shape,
-            dtypes=dtypes,
-            fixed_kwargs=fixed_kwargs,
-            var_kwargs=var_kwargs,
-            module_cpu=skimage.registration,
-            module_gpu=cucim.skimage.registration,
-        )
-        results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
-
-
-fbase = os.path.splitext(pfile)[0]
-all_results.to_csv(fbase + ".csv")
-all_results.to_pickle(pfile)
-with open(fbase + ".md", "wt") as f:
-    f.write(all_results.to_markdown())
+            results = B.run_benchmark(duration=args.duration)
+            all_results = pd.concat([all_results, results["full"]])
+
+
+    fbase = os.path.splitext(pfile)[0]
+    all_results.to_csv(fbase + ".csv")
+    all_results.to_pickle(pfile)
+    try:
+        import tabulate  # noqa: F401 -- optional package DataFrame.to_markdown relies on
+        # Best effort: the markdown report is skipped when tabulate is not installed.
+        with open(fbase + ".md", "wt") as f:
+            f.write(all_results.to_markdown())
+    except ImportError:
+        pass
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Benchmarking cuCIM registration functions')
+    func_name_choices = ['phase_cross_correlation', 'optical_flow_tvl1', 'optical_flow_ilk']
+    dtype_choices = ['float16', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64']
+    parser.add_argument('-i','--img_size', type=str, help='Size of input image (omit color channel, it will be appended as needed)', required=True)
+    parser.add_argument('-d','--dtype', type=str, help='Dtype of input image', choices = dtype_choices, required=True)
+    parser.add_argument('-f','--func_name', type=str, help='function to benchmark', choices = func_name_choices, required=True)
+    parser.add_argument('-t','--duration', type=int, help='time to run benchmark', required=True)
+    parser.add_argument('--no_cpu', action='store_true', help='disable cpu measurements', default=False)
+
+    args = parser.parse_args()
+    main(args)
diff --git a/benchmarks/skimage/cucim_restoration_bench.py b/benchmarks/skimage/cucim_restoration_bench.py
index e20be1b9f..a1523dbae 100644
--- a/benchmarks/skimage/cucim_restoration_bench.py
+++ b/benchmarks/skimage/cucim_restoration_bench.py
@@ -1,3 +1,4 @@
+import argparse
 import math
 import os
 import pickle
@@ -97,65 +98,35 @@ def set_args(self, dtype):
         self.args_gpu = (imaged, psfd)
 
 
-pfile = "cucim_restoration_results.pickle"
-if os.path.exists(pfile):
-    with open(pfile, "rb") as f:
-        all_results = pickle.load(f)
-else:
-    all_results = pd.DataFrame()
-dtypes = [np.float32]
 
+def main(args):
 
-for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
-    # _denoise.py
-    ("denoise_tv_chambolle", dict(), dict(weight=[0.02]), True, True),
-    # j_invariant.py
-    ("calibrate_denoiser", dict(), dict(), False, True),
-]:
+    pfile = "cucim_restoration_results.pickle"
+    if os.path.exists(pfile):
+        with open(pfile, "rb") as f:
+            all_results = pickle.load(f)
+    else:
+        all_results = pd.DataFrame()
 
-    for shape in [(512, 512), (1980, 1080), (1980, 1080, 3), (128, 128, 128)]:
+    dtypes = [np.dtype(args.dtype)]
+    # image sizes/shapes
+    shape = tuple(list(map(int,(args.img_size.split(',')))))
+    run_cpu = not args.no_cpu
 
-        ndim = len(shape)
-        if not allow_nd:
-            if not allow_color:
-                if ndim > 2:
-                    continue
-            else:
-                if ndim > 3 or (ndim == 3 and shape[-1] not in [3, 4]):
-                    continue
-        if shape[-1] == 3 and not allow_color:
-            continue
 
-        if function_name == "denoise_tv_chambolle":
-            fixed_kwargs["channel_axis"] = -1 if shape[-1] == 3 else None
+    for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
+        # _denoise.py
+        ("denoise_tv_chambolle", dict(), dict(weight=[0.02]), True, True),
+        # j_invariant.py
+        ("calibrate_denoiser", dict(), dict(), False, True),
+        # deconvolution.py
+        ("wiener", dict(balance=100.0), dict(), False, False),
+        ("unsupervised_wiener", dict(), dict(), False, False),
+        ("richardson_lucy", dict(), dict(num_iter=[5]), False, True),
+    ]:
 
-        if function_name == "calibrate_denoiser":
-            denoise_class = CalibratedDenoiseBench
-        else:
-            denoise_class = DenoiseBench
-
-        B = denoise_class(
-            function_name=function_name,
-            shape=shape,
-            dtypes=dtypes,
-            fixed_kwargs=fixed_kwargs,
-            var_kwargs=var_kwargs,
-            module_cpu=skimage.restoration,
-            module_gpu=cucim.skimage.restoration,
-        )
-        results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
-
-
-# function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd = ('unsupervised_wiener', dict(), dict(), False, True)
-for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
-    # deconvolution.py
-    ("wiener", dict(balance=100.0), dict(), False, False),
-    ("unsupervised_wiener", dict(), dict(), False, False),
-    ("richardson_lucy", dict(), dict(num_iter=[5]), False, True),
-]:
-
-    for shape in [(512, 512), (3840, 2160), (3840, 2160, 3), (192, 192, 192)]:
+        if function_name != args.func_name:
+            continue
 
         ndim = len(shape)
         if not allow_nd:
@@ -168,21 +139,65 @@ def set_args(self, dtype):
         if shape[-1] == 3 and not allow_color:
             continue
 
-        B = DeconvolutionBench(
-            function_name=function_name,
-            shape=shape,
-            dtypes=dtypes,
-            fixed_kwargs=fixed_kwargs,
-            var_kwargs=var_kwargs,
-            module_cpu=skimage.restoration,
-            module_gpu=cucim.skimage.restoration,
-        )
-        results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
-
-
-fbase = os.path.splitext(pfile)[0]
-all_results.to_csv(fbase + ".csv")
-all_results.to_pickle(pfile)
-with open(fbase + ".md", "wt") as f:
-    f.write(all_results.to_markdown())
+        if function_name in ['denoise_tv_chambolle', 'calibrate_denoiser']:
+
+            if function_name == "denoise_tv_chambolle":
+                fixed_kwargs["channel_axis"] = -1 if shape[-1] == 3 else None
+
+            if function_name == "calibrate_denoiser":
+                denoise_class = CalibratedDenoiseBench
+            else:
+                denoise_class = DenoiseBench
+
+            B = denoise_class(
+                function_name=function_name,
+                shape=shape,
+                dtypes=dtypes,
+                fixed_kwargs=fixed_kwargs,
+                var_kwargs=var_kwargs,
+                module_cpu=skimage.restoration,
+                module_gpu=cucim.skimage.restoration,
+                run_cpu=run_cpu,
+            )
+            results = B.run_benchmark(duration=args.duration)
+            all_results = pd.concat([all_results, results["full"]])
+
+        elif function_name in ['wiener', 'unsupervised_wiener', 'richardson_lucy']:
+
+            B = DeconvolutionBench(
+                function_name=function_name,
+                shape=shape,
+                dtypes=dtypes,
+                fixed_kwargs=fixed_kwargs,
+                var_kwargs=var_kwargs,
+                module_cpu=skimage.restoration,
+                module_gpu=cucim.skimage.restoration,
+                run_cpu=run_cpu,
+            )
+            results = B.run_benchmark(duration=args.duration)
+            all_results = pd.concat([all_results, results["full"]])
+
+    fbase = os.path.splitext(pfile)[0]
+    all_results.to_csv(fbase + ".csv")
+    all_results.to_pickle(pfile)
+    try:
+        import tabulate  # noqa: F401 -- optional package DataFrame.to_markdown relies on
+        # Best effort: the markdown report is skipped when tabulate is not installed.
+        with open(fbase + ".md", "wt") as f:
+            f.write(all_results.to_markdown())
+    except ImportError:
+        pass
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Benchmarking cuCIM restoration functions')
+    func_name_choices = ['denoise_tv_chambolle', 'calibrate_denoiser', 'wiener', 'unsupervised_wiener', 'richardson_lucy']
+    dtype_choices = ['float16', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64']
+    parser.add_argument('-i','--img_size', type=str, help='Size of input image (omit color channel, it will be appended as needed)', required=True)
+    parser.add_argument('-d','--dtype', type=str, help='Dtype of input image', choices = dtype_choices, required=True)
+    parser.add_argument('-f','--func_name', type=str, help='function to benchmark', choices = func_name_choices, required=True)
+    parser.add_argument('-t','--duration', type=int, help='time to run benchmark', required=True)
+    parser.add_argument('--no_cpu', action='store_true', help='disable cpu measurements', default=False)
+
+    args = parser.parse_args()
+    main(args)
diff --git a/benchmarks/skimage/cucim_segmentation_bench.py b/benchmarks/skimage/cucim_segmentation_bench.py
index b770cdb36..382349a9b 100644
--- a/benchmarks/skimage/cucim_segmentation_bench.py
+++ b/benchmarks/skimage/cucim_segmentation_bench.py
@@ -1,8 +1,11 @@
+import argparse
 import math
 import os
 import pickle
 
 import cucim.skimage
+import cucim.skimage.data
+import cucim.skimage.exposure
 import cucim.skimage.segmentation
 import cupy as cp
 import numpy as np
@@ -25,6 +28,7 @@ def __init__(
         index_str=None,
         module_cpu=skimage.measure,
         module_gpu=cucim.skimage.measure,
+        run_cpu=True,
     ):
 
         self.contiguous_labels = contiguous_labels
@@ -38,6 +42,7 @@ def __init__(
             index_str=index_str,
             module_cpu=module_cpu,
             module_gpu=module_gpu,
+            run_cpu=run_cpu,
         )
 
     def set_args(self, dtype):
@@ -50,14 +55,38 @@ def set_args(self, dtype):
             ],
             dtype=dtype,
         )
-        tiling = tuple(s // a_s for s, a_s in zip(shape, a.shape))
+        tiling = tuple(s // a_s for s, a_s in zip(self.shape, a.shape))
         if self.contiguous_labels:
-            image = np.kron(a, np.ones(tiling, dtype=a.dtype))
+            labels = np.kron(a, np.ones(tiling, dtype=a.dtype))
         else:
-            image = np.tile(a, tiling)
-        imaged = cp.asarray(image)
-        self.args_cpu = (image,)
-        self.args_gpu = (imaged,)
+            labels = np.tile(a, tiling)
+        labels_d = cp.asarray(labels)
+        self.args_cpu = (labels,)
+        self.args_gpu = (labels_d,)
+
+
+class LabelAndImageBench(LabelBench):
+
+    def set_args(self, dtype):
+        a = np.array(
+            [
+                [0, 0, 1, 1, 0, 0, 0, 0],
+                [0, 0, 0, 1, 0, 0, 4, 0],
+                [2, 2, 0, 0, 3, 0, 4, 4],
+                [0, 0, 0, 0, 0, 5, 0, 0],
+            ],
+            dtype=dtype,
+        )
+        tiling = tuple(s // a_s for s, a_s in zip(self.shape, a.shape))
+        if self.contiguous_labels:
+            labels = np.kron(a, np.ones(tiling, dtype=a.dtype))
+        else:
+            labels = np.tile(a, tiling)
+        labels_d = cp.asarray(labels)
+        image_d = cp.random.standard_normal(labels.shape).astype(np.float32)
+        image = cp.asnumpy(image_d)
+        self.args_cpu = (image, labels)
+        self.args_gpu = (image_d, labels_d)
 
 
 class MorphGeodesicBench(ImageBench):
@@ -75,7 +104,9 @@ def set_args(self, dtype):
         imaged = cp.tile(im1, n_tile)[slices]
 
         # need this preprocessing for morphological_geodesic_active_contour
-        imaged = skimage.segmentation.inverse_gaussian_gradient(imaged)
+        imaged = cp.array(
+            skimage.segmentation.inverse_gaussian_gradient(cp.asnumpy(imaged))
+        )
 
         image = cp.asnumpy(imaged)
         assert imaged.dtype == dtype
@@ -84,29 +115,118 @@ def set_args(self, dtype):
         self.args_gpu = (imaged,)
 
 
-pfile = "cucim_segmentation_results.pickle"
-if os.path.exists(pfile):
-    with open(pfile, "rb") as f:
-        all_results = pickle.load(f)
-else:
-    all_results = pd.DataFrame()
-dtypes = [np.int32]
+class RandomWalkerBench(ImageBench):
+
+
+    def set_args(self, dtype):
+        # Note: dtype only used for markers array, data is hard-coded as float32
+
+        if np.dtype(dtype).kind not in 'iu':
+            raise ValueError("random_walker markers require integer dtype")
+
+        n_dim = len(self.shape)
+        data = cucim.skimage.img_as_float(
+            cucim.skimage.data.binary_blobs(
+                length=max(self.shape), n_dim=n_dim, seed=1
+            )
+        )
+        data = data[tuple(slice(s) for s in self.shape)]
+        sigma = 0.35
+        rng = np.random.default_rng(5)
+        data += cp.array(rng.normal(loc=0, scale=sigma, size=data.shape))
+        data = cucim.skimage.exposure.rescale_intensity(
+            data, in_range=(-sigma, 1 + sigma), out_range=(-1, 1)
+        )
+        data = data.astype(cp.float32)
+        data_cpu = cp.asnumpy(data)
+
+        # The range of the binary image spans over (-1, 1).
+        # We choose the hottest and the coldest pixels as markers.
+        markers = cp.zeros(data.shape, dtype=dtype)
+        markers[data < -0.95] = 1
+        markers[data > 0.95] = 2
+        markers_cpu = cp.asnumpy(markers)
+        self.args_cpu = (data_cpu, markers_cpu)
+        self.args_gpu = (data, markers)
+
+
+
+def main(args):
+
+    pfile = "cucim_segmentation_results.pickle"
+    if os.path.exists(pfile):
+        with open(pfile, "rb") as f:
+            all_results = pickle.load(f)
+    else:
+        all_results = pd.DataFrame()
 
+    dtypes = [np.dtype(args.dtype)]
+    dtypes_label = [np.dtype(args.dtype_label)]
+    # image sizes/shapes
+    shape = tuple(list(map(int,(args.img_size.split(',')))))
+    run_cpu = not args.no_cpu
 
-for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
-    # _denoise.py
-    (
-        "find_boundaries",
-        dict(),
-        dict(connectivity=[1], mode=["thick", "inner", "outer", "subpixel"]),
-        False,
-        True,
-    ),
-]:
+    for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
+        # _clear_border.py
+        (
+            "clear_border",
+            dict(),
+            dict(),
+            False,
+            True,
+        ),
+        # _join.py
+        (
+            "relabel_sequential",
+            dict(offset=5),
+            dict(),
+            False,
+            True,
+        ),
+        # boundaries.py
+        (
+            "find_boundaries",
+            dict(),
+            dict(connectivity=[1], mode=["thick", "inner", "outer", "subpixel"]),
+            False,
+            True,
+        ),
+        (
+            "mark_boundaries",
+            dict(),
+            dict(),
+            False,
+            True,
+        ),
+        (
+            "random_walker",
+            dict(beta=4, tol=1.e-4, prob_tol=1.e-2),
+            dict(mode=['cg', 'cg_j']),
+            False,
+            True,
+        ),
+        # morphsnakes.py
+        ("inverse_gaussian_gradient", dict(), dict(), False, True),
+        (
+            "morphological_geodesic_active_contour",
+            dict(),
+            dict(num_iter=[16], init_level_set=["checkerboard", "disk"]),
+            False,
+            False,
+        ),
+        (
+            "morphological_chan_vese",
+            dict(),
+            dict(num_iter=[16], init_level_set=["checkerboard", "disk"]),
+            False,
+            False,
+        ),
+        # omit: disk_level_set (simple array generation function)
+        # omit: checkerboard_level_set (simple array generation function)
+    ]:
 
-    for shape in [
-        (64, 64),
-    ]:  # (512, 512), (1980, 1080), (1980, 1080, 3), (128, 128, 128)]:
+        if function_name != args.func_name:
+            continue
 
         ndim = len(shape)
         if not allow_nd:
@@ -119,73 +239,72 @@ def set_args(self, dtype):
         if shape[-1] == 3 and not allow_color:
             continue
 
-        B = LabelBench(
-            function_name=function_name,
-            shape=shape,
-            dtypes=dtypes,
-            fixed_kwargs=fixed_kwargs,
-            var_kwargs=var_kwargs,
-            module_cpu=skimage.segmentation,
-            module_gpu=cucim.skimage.segmentation,
-        )
-        results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
-
-
-dtypes = [np.float32]
-# function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd = ('unsupervised_wiener', dict(), dict(), False, True)
-for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
-    # morphsnakes.py
-    ("inverse_gaussian_gradient", dict(), dict(), False, True),
-    (
-        "morphological_geodesic_active_contour",
-        dict(),
-        dict(num_iter=[16], init_level_set=["checkerboard", "disk"]),
-        False,
-        False,
-    ),
-    (
-        "morphological_chan_vese",
-        dict(),
-        dict(num_iter=[16], init_level_set=["checkerboard", "disk"]),
-        False,
-        False,
-    ),
-]:
-
-    for shape in [(512, 512), (3840, 2160), (3840, 2160, 3), (192, 192, 192)]:
+        if function_name in ["clear_border", "relabel_sequential", "find_boundaries", "mark_boundaries", "random_walker"]:
+            if function_name == 'random_walker':
+                fixed_kwargs['channel_axis'] = -1 if shape[-1] == 3 else None
 
-        ndim = len(shape)
-        if not allow_nd:
-            if not allow_color:
-                if ndim > 2:
-                    continue
+            if function_name == 'mark_boundaries':
+                bench_func = LabelAndImageBench
+            elif function_name == 'random_walker':
+                bench_func = RandomWalkerBench
             else:
-                if ndim > 3 or (ndim == 3 and shape[-1] not in [3, 4]):
-                    continue
-        if shape[-1] == 3 and not allow_color:
-            continue
+                bench_func = LabelBench
 
-        if function_name == "morphological_geodesic_active_contour":
-            bench_class = MorphGeodesicBench
-        else:
-            bench_class = ImageBench
+            B = bench_func(
+                function_name=function_name,
+                shape=shape,
+                dtypes=dtypes_label,
+                fixed_kwargs=fixed_kwargs,
+                var_kwargs=var_kwargs,
+                module_cpu=skimage.segmentation, module_gpu=cucim.skimage.segmentation,
+                run_cpu=run_cpu,  # honour --no_cpu instead of silently ignoring it
+            )
+            results = B.run_benchmark(duration=args.duration)
+            all_results = pd.concat([all_results, results["full"]])
+
+
+        elif function_name in ["inverse_gaussian_gradient", "morphological_geodesic_active_contour", "morphological_chan_vese"]:
+
+            if function_name == "morphological_geodesic_active_contour":
+                bench_class = MorphGeodesicBench
+            else:
+                bench_class = ImageBench
+
+            B = bench_class(
+                function_name=function_name,
+                shape=shape,
+                dtypes=dtypes,
+                fixed_kwargs=fixed_kwargs,
+                var_kwargs=var_kwargs,
+                module_cpu=skimage.segmentation, module_gpu=cucim.skimage.segmentation,
+                run_cpu=run_cpu,  # honour --no_cpu instead of silently ignoring it
+            )
+            results = B.run_benchmark(duration=args.duration)
+            all_results = pd.concat([all_results, results["full"]])
+
+    fbase = os.path.splitext(pfile)[0]
+    all_results.to_csv(fbase + ".csv")
+    all_results.to_pickle(pfile)
+    try:
+        import tabulate  # noqa: F401 -- optional package DataFrame.to_markdown relies on
+        # Best effort: the markdown report is skipped when tabulate is not installed.
+        with open(fbase + ".md", "wt") as f:
+            f.write(all_results.to_markdown())
+    except ImportError:
+        pass
 
-        B = ImageBench(
-            function_name=function_name,
-            shape=shape,
-            dtypes=dtypes,
-            fixed_kwargs=fixed_kwargs,
-            var_kwargs=var_kwargs,
-            module_cpu=skimage.segmentation,
-            module_gpu=cucim.skimage.segmentation,
-        )
-        results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
 
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Benchmarking cuCIM segmentation functions')
+    func_name_choices = ["clear_border", "relabel_sequential", "find_boundaries", "mark_boundaries", "random_walker", "inverse_gaussian_gradient", "morphological_geodesic_active_contour", "morphological_chan_vese"]
+    label_dtype_choices = ['int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64']
+    dtype_choices = ['float16', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64']
+    parser.add_argument('-i','--img_size', type=str, help='Size of input image (omit color channel, it will be appended as needed)', required=True)
+    parser.add_argument('-d','--dtype', type=str, help='Dtype of input image', choices = dtype_choices, required=True)
+    parser.add_argument('--dtype_label', type=str, help='Dtype of input label/marker array', choices = label_dtype_choices, required=False, default='uint8')
+    parser.add_argument('-f','--func_name', type=str, help='function to benchmark', choices = func_name_choices, required=True)
+    parser.add_argument('-t','--duration', type=int, help='time to run benchmark', required=True)
+    parser.add_argument('--no_cpu', action='store_true', help='disable cpu measurements', default=False)
 
-fbase = os.path.splitext(pfile)[0]
-all_results.to_csv(fbase + ".csv")
-all_results.to_pickle(pfile)
-with open(fbase + ".md", "wt") as f:
-    f.write(all_results.to_markdown())
+    args = parser.parse_args()
+    main(args)
diff --git a/benchmarks/skimage/cucim_transform_bench.py b/benchmarks/skimage/cucim_transform_bench.py
index 534a6a225..b6aa315f9 100644
--- a/benchmarks/skimage/cucim_transform_bench.py
+++ b/benchmarks/skimage/cucim_transform_bench.py
@@ -1,3 +1,4 @@
+import argparse
 import os
 import pickle
 
@@ -10,75 +11,83 @@
 
 from _image_bench import ImageBench
 
-pfile = "cucim_transform_results.pickle"
-if os.path.exists(pfile):
-    with open(pfile, "rb") as f:
-        all_results = pickle.load(f)
-else:
-    all_results = pd.DataFrame()
-dtypes = [np.float32]
-
-for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
-    # _warps.py
-    (
-        "resize",
-        dict(preserve_range=True),
-        dict(order=[0, 1, 3], mode=["reflect"], anti_aliasing=[True]),
-        True,
-        True,
-    ),  # scale handled in loop below
-    (
-        "resize_local_mean",
-        dict(preserve_range=True),
-        {},
-        True,
-        True,
-    ),  # scale handled in loop below
-    (
-        "rescale",
-        dict(preserve_range=True),
-        dict(order=[0, 1, 3], mode=["reflect"], anti_aliasing=[True]),
-        True,
-        True,
-    ),  # output_shape handled in loop below
-    (
-        "rotate",
-        dict(angle=15, preserve_range=True),
-        dict(order=[0, 1, 3], mode=["reflect"], resize=[False, True]),
-        False,
-        False,
-    ),
-    ("downscale_local_mean", dict(), dict(), True, True),  # factors handled in loop below
-    (
-        "swirl",
-        dict(strength=1, preserve_range=True),
-        dict(order=[0, 1, 3], mode=["reflect"]),
-        False,
-        False,
-    ),
-    # TODO : warp? already indirectly benchmarked via swirl, etc
-    ("warp_polar", dict(), dict(scaling=["linear", "log"]), True, False),
-    # integral.py
-    ("integral_image", dict(), dict(), False, True),
-    # TODO: integrate
-    # pyramids.py
-    (
-        "pyramid_gaussian",
-        dict(max_layer=6, downscale=2, preserve_range=True),
-        dict(order=[0, 1, 3]),
-        True,
-        True,
-    ),
-    (
-        "pyramid_laplacian",
-        dict(max_layer=6, downscale=2, preserve_range=True),
-        dict(order=[0, 1, 3]),
-        True,
-        True,
-    ),
-]:
-
-    for shape in [(512, 512), (3840, 2160), (3840, 2160, 3), (192, 192, 192)]:
+
+def main(args):
+
+    pfile = "cucim_transform_results.pickle"
+    if os.path.exists(pfile):
+        with open(pfile, "rb") as f:
+            all_results = pickle.load(f)
+    else:
+        all_results = pd.DataFrame()
+
+    dtypes = [np.dtype(args.dtype)]
+    # image sizes/shapes
+    shape = tuple(list(map(int,(args.img_size.split(',')))))
+    run_cpu = not args.no_cpu
+
+    for function_name, fixed_kwargs, var_kwargs, allow_color, allow_nd in [
+        # _warps.py
+        (
+            "resize",
+            dict(preserve_range=True),
+            dict(order=[0, 1, 3], mode=["reflect"], anti_aliasing=[True]),
+            True,
+            True,
+        ),  # scale handled in loop below
+        (
+            "resize_local_mean",
+            dict(preserve_range=True),
+            {},
+            True,
+            True,
+        ),  # scale handled in loop below
+        (
+            "rescale",
+            dict(preserve_range=True),
+            dict(order=[0, 1, 3], mode=["reflect"], anti_aliasing=[True]),
+            True,
+            True,
+        ),  # output_shape handled in loop below
+        (
+            "rotate",
+            dict(angle=15, preserve_range=True),
+            dict(order=[0, 1, 3], mode=["reflect"], resize=[False, True]),
+            False,
+            False,
+        ),
+        ("downscale_local_mean", dict(), dict(), True, True),  # factors handled in loop below
+        (
+            "swirl",
+            dict(strength=1, preserve_range=True),
+            dict(order=[0, 1, 3], mode=["reflect"]),
+            False,
+            False,
+        ),
+        # TODO : warp? already indirectly benchmarked via swirl, etc
+        ("warp_polar", dict(), dict(scaling=["linear", "log"]), True, False),
+        # integral.py
+        ("integral_image", dict(), dict(), False, True),
+        # TODO: integrate
+        # pyramids.py
+        (
+            "pyramid_gaussian",
+            dict(max_layer=6, downscale=2, preserve_range=True),
+            dict(order=[0, 1, 3]),
+            True,
+            True,
+        ),
+        (
+            "pyramid_laplacian",
+            dict(max_layer=6, downscale=2, preserve_range=True),
+            dict(order=[0, 1, 3]),
+            True,
+            True,
+        ),
+    ]:
+
+        if function_name != args.func_name:
+            continue
 
         ndim = len(shape)
         if not allow_nd:
@@ -131,12 +140,30 @@
             module_gpu=cucim.skimage.transform,
             function_is_generator=function_is_generator,
         )
-        results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
+        results = B.run_benchmark(duration=args.duration)
+        all_results = pd.concat([all_results, results["full"]])
+
+    fbase = os.path.splitext(pfile)[0]
+    all_results.to_csv(fbase + ".csv")
+    all_results.to_pickle(pfile)
+    try:
+        import tabulate  # noqa: F401 -- optional package DataFrame.to_markdown relies on
+        # Best effort: the markdown report is skipped when tabulate is not installed.
+        with open(fbase + ".md", "wt") as f:
+            f.write(all_results.to_markdown())
+    except ImportError:
+        pass
+
 
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Benchmarking cuCIM transform functions')
+    func_name_choices = ['resize', 'resize_local_mean', 'rescale', 'rotate', 'downscale_local_mean', 'swirl', 'warp_polar', 'integral_image', 'pyramid_gaussian', 'pyramid_laplacian']  # keep in sync with the table in main()
+    dtype_choices = ['float16', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64']
+    parser.add_argument('-i','--img_size', type=str, help='Size of input image (omit color channel, it will be appended as needed)', required=True)
+    parser.add_argument('-d','--dtype', type=str, help='Dtype of input image', choices = dtype_choices, required=True)
+    parser.add_argument('-f','--func_name', type=str, help='function to benchmark', choices = func_name_choices, required=True)
+    parser.add_argument('-t','--duration', type=int, help='time to run benchmark', required=True)
+    parser.add_argument('--no_cpu', action='store_true', help='disable cpu measurements', default=False)
 
-fbase = os.path.splitext(pfile)[0]
-all_results.to_csv(fbase + ".csv")
-all_results.to_pickle(pfile)
-with open(fbase + ".md", "wt") as f:
-    f.write(all_results.to_markdown())
+    args = parser.parse_args()
+    main(args)
diff --git a/benchmarks/skimage/cupyx_scipy_ndimage_filter_bench.py b/benchmarks/skimage/cupyx_scipy_ndimage_filter_bench.py
index 4c5143246..e09f3fe1d 100644
--- a/benchmarks/skimage/cupyx_scipy_ndimage_filter_bench.py
+++ b/benchmarks/skimage/cupyx_scipy_ndimage_filter_bench.py
@@ -102,7 +102,7 @@ def set_args(self, dtype):
             var_kwargs=var_kwargs,
         )
         results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
+        all_results = pd.concat([all_results, results["full"]])
 
     for fname, wshape, var_kwargs in [
         ("convolve", weights_shape, dict(mode=modes)),
@@ -119,7 +119,7 @@ def set_args(self, dtype):
             var_kwargs=var_kwargs,
         )
         results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
+        all_results = pd.concat([all_results, results["full"]])
 
 fbase = os.path.splitext(pfile)[0]
 all_results.to_csv(fbase + ".csv")
diff --git a/benchmarks/skimage/cupyx_scipy_ndimage_fourier_bench.py b/benchmarks/skimage/cupyx_scipy_ndimage_fourier_bench.py
index bfc6b448d..06eb19fc2 100644
--- a/benchmarks/skimage/cupyx_scipy_ndimage_fourier_bench.py
+++ b/benchmarks/skimage/cupyx_scipy_ndimage_fourier_bench.py
@@ -43,7 +43,7 @@ def set_args(self, dtype):
             var_kwargs=var_kwargs,
         )
         results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
+        all_results = pd.concat([all_results, results["full"]])
 
 
 fbase = os.path.splitext(pfile)[0]
diff --git a/benchmarks/skimage/cupyx_scipy_ndimage_interp_bench.py b/benchmarks/skimage/cupyx_scipy_ndimage_interp_bench.py
index 294c33d5e..de518ab82 100644
--- a/benchmarks/skimage/cupyx_scipy_ndimage_interp_bench.py
+++ b/benchmarks/skimage/cupyx_scipy_ndimage_interp_bench.py
@@ -56,7 +56,7 @@ def set_args(self, dtype):
         var_kwargs=dict(mode=modes, order=orders),
     )
     results = B.run_benchmark(duration=1)
-    all_results = all_results.append(results["full"])
+    all_results = pd.concat([all_results, results["full"]])
 
     for fname, fixed_kwargs, var_kwargs in [
         (
@@ -139,7 +139,7 @@ def set_args(self, dtype):
             var_kwargs=var_kwargs,
         )
         results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
+        all_results = pd.concat([all_results, results["full"]])
 
 fbase = os.path.splitext(pfile)[0]
 all_results.to_csv(fbase + ".csv")
diff --git a/benchmarks/skimage/cupyx_scipy_ndimage_measurements_bench.py b/benchmarks/skimage/cupyx_scipy_ndimage_measurements_bench.py
index 9f6e531af..bb1d479b1 100644
--- a/benchmarks/skimage/cupyx_scipy_ndimage_measurements_bench.py
+++ b/benchmarks/skimage/cupyx_scipy_ndimage_measurements_bench.py
@@ -145,7 +145,7 @@ def set_args(self, dtype):
                 var_kwargs=var_kwargs,
             )
             results = B.run_benchmark(duration=1)
-            all_results = all_results.append(results["full"])
+            all_results = pd.concat([all_results, results["full"]])
 
     for fname in [
         "sum",
@@ -180,7 +180,7 @@ def set_args(self, dtype):
                     var_kwargs=var_kwargs,
                 )
                 results = B.run_benchmark(duration=1)
-                all_results = all_results.append(results["full"])
+                all_results = pd.concat([all_results, results["full"]])
 
 fbase = os.path.splitext(pfile)[0]
 all_results.to_csv(fbase + ".csv")
diff --git a/benchmarks/skimage/cupyx_scipy_ndimage_morphology_bench.py b/benchmarks/skimage/cupyx_scipy_ndimage_morphology_bench.py
index 54d58d667..7fbe9e7c6 100644
--- a/benchmarks/skimage/cupyx_scipy_ndimage_morphology_bench.py
+++ b/benchmarks/skimage/cupyx_scipy_ndimage_morphology_bench.py
@@ -126,7 +126,7 @@ def set_args(self, dtype):
             var_kwargs=var_kwargs,
         )
         results = B.run_benchmark(duration=1)
-        all_results = all_results.append(results["full"])
+        all_results = pd.concat([all_results, results["full"]])
 
     iterations = [1, 10, 30]
     for fname, var_kwargs in [
@@ -152,7 +152,7 @@ def set_args(self, dtype):
                 var_kwargs=var_kwargs,
             )
             results = B.run_benchmark(duration=1)
-            all_results = all_results.append(results["full"])
+            all_results = pd.concat([all_results, results["full"]])
 
 fbase = os.path.splitext(pfile)[0]
 all_results.to_csv(fbase + ".csv")
diff --git a/benchmarks/skimage/requirements-bench.txt b/benchmarks/skimage/requirements-bench.txt
new file mode 100644
index 000000000..2cc6ac30e
--- /dev/null
+++ b/benchmarks/skimage/requirements-bench.txt
@@ -0,0 +1,2 @@
+pandas>=1.0
+tabulate>=0.8.7
diff --git a/benchmarks/skimage/run-nv-bench-color.sh b/benchmarks/skimage/run-nv-bench-color.sh
new file mode 100755
index 000000000..14a0809ec
--- /dev/null
+++ b/benchmarks/skimage/run-nv-bench-color.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+param_shape=(512,512 3840,2160 192,192,192)
+param_filt=(convert_colorspace rgb2hed hed2rgb lab2lch lch2lab xyz2lab lab2xyz rgba2rgb label2rgb)
+param_dt=(float32 uint8)
+for shape in "${param_shape[@]}"; do
+    for filt in "${param_filt[@]}"; do
+        for dt in "${param_dt[@]}"; do
+            python cucim_color_bench.py -f $filt -i $shape -d $dt -t 10
+        done
+    done
+done
diff --git a/benchmarks/skimage/run-nv-bench-exposure.sh b/benchmarks/skimage/run-nv-bench-exposure.sh
new file mode 100755
index 000000000..4ed9e0748
--- /dev/null
+++ b/benchmarks/skimage/run-nv-bench-exposure.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+param_shape=(512,512 3840,2160 3840,2160,3 192,192,192)
+param_filt=(equalize_adapthist cumulative_distribution equalize_hist rescale_intensity adjust_gamma adjust_log adjust_sigmoid is_low_contrast match_histograms)
+param_dt=(float32 uint8)
+for shape in "${param_shape[@]}"; do
+    for filt in "${param_filt[@]}"; do
+        for dt in "${param_dt[@]}"; do
+            python cucim_exposure_bench.py -f $filt -i $shape -d $dt -t 10
+        done
+    done
+done
diff --git a/benchmarks/skimage/run-nv-bench-feature.sh b/benchmarks/skimage/run-nv-bench-feature.sh
new file mode 100755
index 000000000..fbab913c6
--- /dev/null
+++ b/benchmarks/skimage/run-nv-bench-feature.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+param_shape=(512,512 3840,2160 3840,2160,3 192,192,192)
+param_filt=(multiscale_basic_features canny daisy structure_tensor hessian_matrix hessian_matrix_det shape_index corner_kitchen_rosenfeld corner_harris corner_shi_tomasi corner_foerstner corner_peaks match_template)
+param_dt=(float64 float32 float16)
+for shape in "${param_shape[@]}"; do
+    for filt in "${param_filt[@]}"; do
+        for dt in "${param_dt[@]}"; do
+            python cucim_feature_bench.py -f $filt -i $shape -d $dt -t 10
+        done
+    done
+done
diff --git a/benchmarks/skimage/run-nv-bench-filters.sh b/benchmarks/skimage/run-nv-bench-filters.sh
new file mode 100755
index 000000000..37388e9e9
--- /dev/null
+++ b/benchmarks/skimage/run-nv-bench-filters.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+param_shape=(512,512 3840,2160 3840,2160,3 192,192,192)
+param_filt=(gabor gaussian median rank_order unsharp_mask sobel prewitt scharr roberts roberts_pos_diag roberts_neg_diag farid laplace meijering sato frangi hessian threshold_isodata threshold_otsu threshold_yen threshold_local threshold_li threshold_minimum threshold_mean threshold_triangle threshold_niblack threshold_sauvola apply_hysteresis_threshold threshold_multiotsu)
+# param_filt=(rank_order )
+param_dt=(float64 float32 float16)
+for shape in "${param_shape[@]}"; do
+    for filt in "${param_filt[@]}"; do
+        for dt in "${param_dt[@]}"; do
+            python cucim_filters_bench.py -f $filt -i $shape -d $dt -t 10
+        done
+    done
+done
diff --git a/benchmarks/skimage/run-nv-bench-measure.sh b/benchmarks/skimage/run-nv-bench-measure.sh
new file mode 100755
index 000000000..5c2b4f413
--- /dev/null
+++ b/benchmarks/skimage/run-nv-bench-measure.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+param_shape=(512,512 3840,2160 3840,2160,3 192,192,192)
+param_filt=(label regionprops moments moments_central centroid inertia_tensor inertia_tensor_eigvals block_reduce shannon_entropy profile_line)
+param_dt=(float32 uint8)
+for shape in "${param_shape[@]}"; do
+    for filt in "${param_filt[@]}"; do
+        for dt in "${param_dt[@]}"; do
+            python cucim_measure_bench.py -f $filt -i $shape -d $dt -t 10
+        done
+    done
+done
diff --git a/benchmarks/skimage/run-nv-bench-metrics.sh b/benchmarks/skimage/run-nv-bench-metrics.sh
new file mode 100755
index 000000000..b24ba775b
--- /dev/null
+++ b/benchmarks/skimage/run-nv-bench-metrics.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+param_shape=(512,512 3840,2160 3840,2160,3 192,192,192)
+param_filt=(structural_similarity mean_squared_error normalized_root_mse peak_signal_noise_ratio normalized_mutual_information)
+param_dt=(float32 uint8)
+for shape in "${param_shape[@]}"; do
+    for filt in "${param_filt[@]}"; do
+        for dt in "${param_dt[@]}"; do
+            python cucim_metrics_bench.py -f $filt -i $shape -d $dt -t 10
+        done
+    done
+done
diff --git a/benchmarks/skimage/run-nv-bench-morphology.sh b/benchmarks/skimage/run-nv-bench-morphology.sh
new file mode 100755
index 000000000..4981a58ad
--- /dev/null
+++ b/benchmarks/skimage/run-nv-bench-morphology.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+param_shape=(512,512 3840,2160 3840,2160,3 192,192,192)
+param_filt=(binary_erosion binary_dilation binary_opening binary_closing remove_small_objects remove_small_holes erosion dilation opening closing white_tophat black_tophat thin reconstruction)
+# Note: user-specified dtype ignored for binary_* functions and thin (these only accept binary input)
+param_dt=(float32 uint8)
+for shape in "${param_shape[@]}"; do
+    for filt in "${param_filt[@]}"; do
+        for dt in "${param_dt[@]}"; do
+            python cucim_morphology_bench.py -f $filt -i $shape -d $dt -t 10
+        done
+    done
+done
diff --git a/benchmarks/skimage/run-nv-bench-registration.sh b/benchmarks/skimage/run-nv-bench-registration.sh
new file mode 100755
index 000000000..18f572b53
--- /dev/null
+++ b/benchmarks/skimage/run-nv-bench-registration.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+param_shape=(512,512 3840,2160 3840,2160,3 192,192,192)
+param_filt=(phase_cross_correlation optical_flow_tvl1 optical_flow_ilk)
+param_dt=(float32 uint8)
+for shape in "${param_shape[@]}"; do
+    for filt in "${param_filt[@]}"; do
+        for dt in "${param_dt[@]}"; do
+            python cucim_registration_bench.py -f $filt -i $shape -d $dt -t 10
+        done
+    done
+done
diff --git a/benchmarks/skimage/run-nv-bench-restoration.sh b/benchmarks/skimage/run-nv-bench-restoration.sh
new file mode 100755
index 000000000..df7c9141f
--- /dev/null
+++ b/benchmarks/skimage/run-nv-bench-restoration.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+param_shape=(512,512 3840,2160 3840,2160,3 192,192,192)
+param_filt=(denoise_tv_chambolle calibrate_denoiser wiener unsupervised_wiener richardson_lucy)
+param_dt=(float32 uint8)
+for shape in "${param_shape[@]}"; do
+    for filt in "${param_filt[@]}"; do
+        for dt in "${param_dt[@]}"; do
+            python cucim_restoration_bench.py -f $filt -i $shape -d $dt -t 10
+        done
+    done
+done
diff --git a/benchmarks/skimage/run-nv-bench-segmentation.sh b/benchmarks/skimage/run-nv-bench-segmentation.sh
new file mode 100755
index 000000000..4723f0c60
--- /dev/null
+++ b/benchmarks/skimage/run-nv-bench-segmentation.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+param_shape=(512,512 3840,2160 3840,2160,3 192,192,192)
+
+# these require an integer-valued label image
+param_filt=(clear_border relabel_sequential find_boundaries mark_boundaries random_walker)
+param_dt=(float32)
+param_dt_label=(uint8 uint32)
+for shape in "${param_shape[@]}"; do
+    for filt in "${param_filt[@]}"; do
+        for dt in "${param_dt[@]}"; do
+            for dt_label in "${param_dt_label[@]}"; do
+                python cucim_segmentation_bench.py -f $filt -i $shape -d $dt --dtype_label $dt_label -t 10
+            done
+        done
+    done
+done
+
+# these do not require an integer-valued input image
+param_filt=(inverse_gaussian_gradient morphological_geodesic_active_contour morphological_chan_vese)
+param_dt=(float32)
+for shape in "${param_shape[@]}"; do
+    for filt in "${param_filt[@]}"; do
+        for dt in "${param_dt[@]}"; do
+            python cucim_segmentation_bench.py -f $filt -i $shape -d $dt -t 10
+        done
+    done
+done
diff --git a/benchmarks/skimage/run-nv-bench-transform.sh b/benchmarks/skimage/run-nv-bench-transform.sh
new file mode 100755
index 000000000..c880cb699
--- /dev/null
+++ b/benchmarks/skimage/run-nv-bench-transform.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+param_shape=(512,512 3840,2160 3840,2160,3 192,192,192)
+param_filt=(resize resize_local_mean rescale rotate downscale_local_mean warp_polar integral_image pyramid_gaussian pyramid_laplacian)
+param_dt=(float32 uint8)
+for shape in "${param_shape[@]}"; do
+    for filt in "${param_filt[@]}"; do
+        for dt in "${param_dt[@]}"; do
+            python cucim_transform_bench.py -f $filt -i $shape -d $dt -t 10
+        done
+    done
+done
diff --git a/conda/environments/env.yml b/conda/environments/env.yml
index 895329fa6..0cee99491 100644
--- a/conda/environments/env.yml
+++ b/conda/environments/env.yml
@@ -3,7 +3,7 @@ channels:
   - conda-forge
 dependencies:
   - cupy>=9
-  - scikit-image>=0.18.1
+  - scikit-image>=0.19.0
   - openslide
   - zlib
   - jpeg
diff --git a/conda/recipes/cucim/build.sh b/conda/recipes/cucim/build.sh
index 77e9b7aef..28b1112c2 100644
--- a/conda/recipes/cucim/build.sh
+++ b/conda/recipes/cucim/build.sh
@@ -4,14 +4,6 @@ CUCIM_BUILD_TYPE=${CUCIM_BUILD_TYPE:-release}
 
 echo "CC          : ${CC}"
 echo "CXX         : ${CXX}"
-echo "CUDAHOSTCXX : ${CUDAHOSTCXX}"
-
-# For now CUDAHOSTCXX is set to `/usr/bin/g++` by
-# https://github.com/rapidsai/docker/blob/161b200157206660d88fb02cf69fe58d363ac95e/generated-dockerfiles/rapidsai-core_ubuntu18.04-devel.Dockerfile
-# To use GCC-9 in conda build environment, need to set it to $CXX (=$BUILD_PREFIX/bin/x86_64-conda-linux-gnu-c++)
-# This can be removed once we switch to use gcc-9
-# : https://docs.rapids.ai/notices/rdn0002/
-export CUDAHOSTCXX=${CXX}
 
 # CUDA needs to include $PREFIX/include as system include path
 export CUDAFLAGS="-isystem $BUILD_PREFIX/include -isystem $PREFIX/include "
diff --git a/conda/recipes/cucim/meta.yaml b/conda/recipes/cucim/meta.yaml
index fa9377b93..479d17e83 100644
--- a/conda/recipes/cucim/meta.yaml
+++ b/conda/recipes/cucim/meta.yaml
@@ -32,7 +32,7 @@ requirements:
     - cupy >=9,<11.0.0a0
     - numpy 1.19
     - scipy
-    - scikit-image >=0.18.1,<0.20.0a0
+    - scikit-image >=0.19.0,<0.20.0a0
   run:
     - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}
     - python {{ python_version }}.*
@@ -41,7 +41,7 @@ requirements:
     - cupy >=9,<11.0.0a0
     - {{ pin_compatible('numpy') }}
     - scipy
-    - scikit-image >=0.18.1,<0.20.0a0
+    - scikit-image >=0.19.0,<0.20.0a0
     # - openslide # skipping here but benchmark binary would needs openslide library
 
 tests:                                 # [linux64]
diff --git a/conda/recipes/libcucim/build.sh b/conda/recipes/libcucim/build.sh
index 9d2217bbc..7d21aebe3 100644
--- a/conda/recipes/libcucim/build.sh
+++ b/conda/recipes/libcucim/build.sh
@@ -4,16 +4,8 @@ CUCIM_BUILD_TYPE=${CUCIM_BUILD_TYPE:-release}
 
 echo "CC          : ${CC}"
 echo "CXX         : ${CXX}"
-echo "CUDAHOSTCXX : ${CUDAHOSTCXX}"
 echo "CUDA        : ${CUDA}"
 
-# For now CUDAHOSTCXX is set to `/usr/bin/g++` by
-# https://github.com/rapidsai/docker/blob/161b200157206660d88fb02cf69fe58d363ac95e/generated-dockerfiles/rapidsai-core_ubuntu18.04-devel.Dockerfile
-# To use GCC-9 in conda build environment, need to set it to $CXX (=$BUILD_PREFIX/bin/x86_64-conda-linux-gnu-c++)
-# This can be removed once we switch to use gcc-9
-# : https://docs.rapids.ai/notices/rdn0002/
-export CUDAHOSTCXX=${CXX}
-
 # CUDA needs to include $PREFIX/include as system include path
 export CUDAFLAGS="-isystem $BUILD_PREFIX/include -isystem $PREFIX/include "
 export LD_LIBRARY_PATH="$BUILD_PREFIX/lib:$PREFIX/lib:$LD_LIBRARY_PATH"
diff --git a/cpp/plugins/cucim.kit.cumed/VERSION b/cpp/plugins/cucim.kit.cumed/VERSION
index ddc883c42..231fac64b 100644
--- a/cpp/plugins/cucim.kit.cumed/VERSION
+++ b/cpp/plugins/cucim.kit.cumed/VERSION
@@ -1 +1 @@
-22.06.00
+22.08.00
diff --git a/cpp/plugins/cucim.kit.cuslide/VERSION b/cpp/plugins/cucim.kit.cuslide/VERSION
index ddc883c42..231fac64b 100644
--- a/cpp/plugins/cucim.kit.cuslide/VERSION
+++ b/cpp/plugins/cucim.kit.cuslide/VERSION
@@ -1 +1 @@
-22.06.00
+22.08.00
diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/cuslide.cpp b/cpp/plugins/cucim.kit.cuslide/src/cuslide/cuslide.cpp
index b284e0f71..0b7cbeb21 100644
--- a/cpp/plugins/cucim.kit.cuslide/src/cuslide/cuslide.cpp
+++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/cuslide.cpp
@@ -171,16 +171,51 @@ static bool CUCIM_ABI parser_parse(CuCIMFileHandle_ptr handle_ptr, cucim::io::fo
         channel_names.emplace_back(std::string_view{ "A" });
     }
 
-    // TODO: Set correct spacing value
+    // Spacing units (one entry per Y/X axis is added in the switch below; "color" is appended afterwards)
+    std::pmr::vector<std::string_view> spacing_units(&resource);
+    spacing_units.reserve(ndim);
+
     std::pmr::vector<float> spacing(&resource);
     spacing.reserve(ndim);
-    spacing.insert(spacing.end(), ndim, 1.0);
+    const auto resolution_unit = level0_ifd->resolution_unit();
+    const auto x_resolution = level0_ifd->x_resolution();
+    const auto y_resolution = level0_ifd->y_resolution();
+
+    switch (resolution_unit)
+    {
+    case 1: // no absolute unit of measurement: pass the raw resolution values through without a unit
+        spacing.emplace_back(y_resolution);
+        spacing.emplace_back(x_resolution);
+        spacing.emplace_back(1.0f);
+
+        spacing_units.emplace_back(std::string_view{ "" });
+        spacing_units.emplace_back(std::string_view{ "" });
+        break;
+    case 2: // inch: 1 inch = 25400 micrometers; a zero resolution falls back to 1.0
+        spacing.emplace_back(y_resolution != 0 ? 25400 / y_resolution : 1.0f);
+        spacing.emplace_back(x_resolution != 0 ? 25400 / x_resolution : 1.0f);
+        spacing.emplace_back(1.0f);
+
+        spacing_units.emplace_back(std::string_view{ "micrometer" });
+        spacing_units.emplace_back(std::string_view{ "micrometer" });
+        break;
+    case 3: // centimeter: 1 centimeter = 10000 micrometers; a zero resolution falls back to 1.0
+        spacing.emplace_back(y_resolution != 0 ? 10000 / y_resolution : 1.0f);
+        spacing.emplace_back(x_resolution != 0 ? 10000 / x_resolution : 1.0f);
+        spacing.emplace_back(1.0f);
+
+        spacing_units.emplace_back(std::string_view{ "micrometer" });
+        spacing_units.emplace_back(std::string_view{ "micrometer" });
+        break;
+    default: // unrecognized resolution unit: fall back to unit spacing with no physical unit
+        spacing.insert(spacing.end(), ndim, 1.0f);
+
+        // Add the Y and X unit entries here as well, as every other case does, so that
+        // spacing_units does not end up holding only the trailing "color" entry.
+        spacing_units.emplace_back(std::string_view{ "" });
+        spacing_units.emplace_back(std::string_view{ "" });
+        break;
+    }
 
-    // TODO: Set correct spacing units
-    std::pmr::vector<std::string_view> spacing_units(&resource);
-    spacing_units.reserve(ndim);
-    spacing_units.emplace_back(std::string_view{ "micrometer" });
-    spacing_units.emplace_back(std::string_view{ "micrometer" });
     spacing_units.emplace_back(std::string_view{ "color" });
 
     std::pmr::vector<float> origin({ 0.0, 0.0, 0.0 }, &resource);
diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.cpp b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.cpp
index 52a66cd80..9bcacb25d 100644
--- a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.cpp
+++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.cpp
@@ -63,6 +63,9 @@ IFD::IFD(TIFF* tiff, uint16_t index, ifd_offset_t offset) : tiff_(tiff), ifd_ind
     model_ = std::string(model_char_ptr ? model_char_ptr : "");
     TIFFGetField(tif, TIFFTAG_IMAGEDESCRIPTION, &model_char_ptr);
     image_description_ = std::string(model_char_ptr ? model_char_ptr : "");
+    TIFFGetField(tif, TIFFTAG_RESOLUTIONUNIT, &resolution_unit_);
+    TIFFGetField(tif, TIFFTAG_XRESOLUTION, &x_resolution_);
+    TIFFGetField(tif, TIFFTAG_YRESOLUTION, &y_resolution_);
 
     TIFFDirectory& tif_dir = tif->tif_dir;
     flags_ = tif->tif_flags;
@@ -451,6 +454,18 @@ std::string& IFD::image_description()
 {
     return image_description_;
 }
+uint16_t IFD::resolution_unit() const
+{
+    return resolution_unit_;
+}
+float IFD::x_resolution() const
+{
+    return x_resolution_;
+}
+float IFD::y_resolution() const
+{
+    return y_resolution_;
+}
 uint32_t IFD::width() const
 {
     return width_;
diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.h b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.h
index e15724782..5737d82d5 100644
--- a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.h
+++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/ifd.h
@@ -72,6 +72,9 @@ class EXPORT_VISIBLE IFD : public std::enable_shared_from_this<IFD>
     std::string& software();
     std::string& model();
     std::string& image_description();
+    uint16_t resolution_unit() const;
+    float x_resolution() const;
+    float y_resolution() const;
     uint32_t width() const;
     uint32_t height() const;
     uint32_t tile_width() const;
@@ -109,6 +112,10 @@ class EXPORT_VISIBLE IFD : public std::enable_shared_from_this<IFD>
     std::string software_;
     std::string model_;
     std::string image_description_;
+    uint16_t resolution_unit_ = 1; // 1 = No absolute unit of measurement, 2 = Inch, 3 = Centimeter
+    float x_resolution_ = 1.0f;
+    float y_resolution_ = 1.0f;
+
     uint32_t flags_ = 0;
     uint32_t width_ = 0;
     uint32_t height_ = 0;
diff --git a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.cpp b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.cpp
index cf9d38e3e..1f1ac6ab6 100644
--- a/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.cpp
+++ b/cpp/plugins/cucim.kit.cuslide/src/cuslide/tiff/tiff.cpp
@@ -387,6 +387,9 @@ void TIFF::resolve_vendor_format()
     auto& first_ifd = ifds_[0];
     std::string& model = first_ifd->model();
     std::string& software = first_ifd->software();
+    const uint16_t resolution_unit = first_ifd->resolution_unit();
+    const float x_resolution = first_ifd->x_resolution();
+    const float y_resolution = first_ifd->y_resolution();
 
     // Detect Aperio SVS format
     {
@@ -416,6 +419,20 @@ void TIFF::resolve_vendor_format()
 
         tiff_metadata.emplace("model", model);
         tiff_metadata.emplace("software", software);
+        switch (resolution_unit)
+        {
+        case 2:
+            tiff_metadata.emplace("resolution_unit", "inch");
+            break;
+        case 3:
+            tiff_metadata.emplace("resolution_unit", "centimeter");
+            break;
+        default:
+            tiff_metadata.emplace("resolution_unit", "");
+            break;
+        }
+        tiff_metadata.emplace("x_resolution", x_resolution);
+        tiff_metadata.emplace("y_resolution", y_resolution);
 
         (*json_metadata).emplace("tiff", std::move(tiff_metadata));
     }
diff --git a/cpp/src/cuimage.cpp b/cpp/src/cuimage.cpp
index 3ba8fc18b..d9c0ee4dc 100644
--- a/cpp/src/cuimage.cpp
+++ b/cpp/src/cuimage.cpp
@@ -680,10 +680,11 @@ CuImage CuImage::read_region(std::vector<int64_t>&& location,
         location.emplace_back(0);
         location.emplace_back(0);
     }
+
+    const ResolutionInfo& res_info = resolutions();
     // If `size` is not specified, size would be (width, height) of the image at the specified `level`.
     if (size.empty())
     {
-        const ResolutionInfo& res_info = resolutions();
         const auto level_count = res_info.level_count();
         if (level_count == 0)
         {
@@ -853,19 +854,36 @@ CuImage CuImage::read_region(std::vector<int64_t>&& location,
         // The first dimension is for 'batch' ('N')
         spacing_units.emplace_back(std::string_view{ "batch" });
     }
+    const auto& level_downsample = res_info.level_downsample(level);
     for (; index < ndim; ++index)
     {
-        int64_t dim_char = dim_indices_.index(dims[index]);
+        int64_t dim_index = dim_indices_.index(dims[index]);
+        if (dim_index < 0)
+        {
+            throw std::runtime_error(fmt::format("[Error] Invalid dimension name: {}", dims[index]));
+        }
 
-        const char* str_ptr = image_metadata_->spacing_units[dim_char];
-        size_t str_len = strlen(image_metadata_->spacing_units[dim_char]);
+        const char* str_ptr = image_metadata_->spacing_units[dim_index];
+        size_t str_len = strlen(image_metadata_->spacing_units[dim_index]);
 
         char* spacing_unit = static_cast<char*>(resource.allocate(str_len + 1));
         memcpy(spacing_unit, str_ptr, str_len);
         spacing_unit[str_len] = '\0';
-        // std::pmr::string spacing_unit{ image_metadata_->spacing_units[dim_char], &resource };
+        // std::pmr::string spacing_unit{ image_metadata_->spacing_units[dim_index], &resource };
 
         spacing_units.emplace_back(std::string_view{ spacing_unit });
+
+        // Update spacing based on level_downsample
+        char dim_char = image_metadata_->dims[dim_index];
+        switch (dim_char)
+        {
+        case 'X':
+        case 'Y':
+            spacing[index] /= level_downsample;
+            break;
+        default:
+            break;
+        }
     }
 
     std::pmr::vector<float> origin(&resource);
diff --git a/cucim.code-workspace b/cucim.code-workspace
index ac73eea68..a73035b60 100644
--- a/cucim.code-workspace
+++ b/cucim.code-workspace
@@ -33,7 +33,7 @@
 					"CUCIM_TESTDATA_FOLDER": "${workspaceDirectory}/test_data",
 					// Add cuslide plugin's library path to LD_LIBRARY_PATH
 					"LD_LIBRARY_PATH": "${workspaceDirectory}/build-debug/lib:${workspaceDirectory}/cpp/plugins/cucim.kit.cuslide/build-debug/lib:${workspaceDirectory}/temp/cuda/lib64:${os_env:LD_LIBRARY_PATH}",
-					"CUCIM_TEST_PLUGIN_PATH": "cucim.kit.cuslide@22.06.00.so"
+					"CUCIM_TEST_PLUGIN_PATH": "cucim.kit.cuslide@22.08.00.so"
 				},
 				"cwd": "${workspaceDirectory}",
 				"catch2": {
@@ -226,7 +226,7 @@
 					},
 					{
 						"name": "CUCIM_TEST_PLUGIN_PATH",
-						"value": "cucim.kit.cuslide@22.06.00.so"
+						"value": "cucim.kit.cuslide@22.08.00.so"
 					}
 				],
 				"console": "externalTerminal",
@@ -254,7 +254,7 @@
 					},
 					{
 						"name": "CUCIM_TEST_PLUGIN_PATH",
-						"value": "cucim.kit.cuslide@22.06.00.so"
+						"value": "cucim.kit.cuslide@22.08.00.so"
 					}
 				],
 				"console": "externalTerminal",
@@ -286,7 +286,7 @@
 					},
 					{
 						"name": "CUCIM_TEST_PLUGIN_PATH",
-						"value": "cucim.kit.cuslide@22.06.00.so"
+						"value": "cucim.kit.cuslide@22.08.00.so"
 					}
 				],
 				"console": "externalTerminal",
diff --git a/docs/Makefile b/docs/Makefile
index aeb3540aa..2f6cf8a2b 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -4,7 +4,7 @@
 # You can set these variables from the command line.
 SPHINXOPTS    =
 SPHINXBUILD   = sphinx-build
-SPHINXPROJ    = cuImage
+SPHINXPROJ    = cuCIM
 SOURCEDIR     = source
 BUILDDIR      = build
 
diff --git a/docs/source/_static/params.css b/docs/source/_static/params.css
deleted file mode 100644
index c080d3669..000000000
--- a/docs/source/_static/params.css
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Mirrors the change in:
- *   https://github.com/sphinx-doc/sphinx/pull/5976
- * which is not showing up in our theme.
- */
-.classifier:before {
-    font-style: normal;
-    margin: 0.5em;
-    content: ":";
-}
-
-:root {
-
-  --pst-color-active-navigation: 114, 83, 237;
-  --pst-color-navbar-link: 77, 77, 77;
-  --pst-color-navbar-link-hover: var(--pst-color-active-navigation);
-  --pst-color-navbar-link-active: var(--pst-color-active-navigation);
-  --pst-color-sidebar-link: 77, 77, 77;
-  --pst-color-sidebar-link-hover: var(--pst-color-active-navigation);
-  --pst-color-sidebar-link-active: var(--pst-color-active-navigation);
-  --pst-color-sidebar-expander-background-hover: 244, 244, 244;
-  --pst-color-sidebar-caption: 77, 77, 77;
-  --pst-color-toc-link: 119, 117, 122;
-  --pst-color-toc-link-hover: var(--pst-color-active-navigation);
-  --pst-color-toc-link-active: var(--pst-color-active-navigation);
-
-}
-
-.special-table td, .special-table th {
-    border: 1px solid #dee2e6;
-}
diff --git a/docs/source/api.rst b/docs/source/api.rst
index 04cfda78c..059886c7e 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -55,6 +55,13 @@ intensity
     :members:
     :undoc-members:
 
+morphology
+----------
+
+.. automodule:: cucim.core.operations.morphology
+    :members:
+    :undoc-members:
+
 spatial
 -------
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index dfad46b01..e3957382d 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -39,6 +39,7 @@
     'sphinx.ext.intersphinx',
     'sphinx.ext.autodoc',
     'sphinx.ext.autosummary',
+    'sphinx.ext.mathjax',
     'numpydoc',
     'doi_role',
     'IPython.sphinxext.ipython_console_highlighting',
@@ -71,9 +72,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '22.06'
+version = '22.08'
 # The full version, including alpha/beta/rc tags.
-release = '22.06.00'
+release = '22.08.00'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -172,8 +173,14 @@
 ]
 
 
-# Example configuration for intersphinx: refer to the Python standard library.
-intersphinx_mapping = {'https://docs.python.org/': None}
+# Configuration for intersphinx: refer to other projects documentation.
+intersphinx_mapping = {
+    'python': ('https://docs.python.org/', None),
+    'cupy': ('https://docs.cupy.dev/en/stable/', None),
+    'numpy': ('https://numpy.org/doc/stable', None),
+    'scipy': ('https://docs.scipy.org/doc/scipy/', None),
+    'skimage': ('https://scikit-image.org/docs/stable/', None),
+}
 
 
 # Config numpydoc
@@ -182,5 +189,5 @@
 
 
 def setup(app):
-    app.add_css_file('params.css')
     app.add_css_file("https://docs.rapids.ai/assets/css/custom.css")
+    app.add_js_file("https://docs.rapids.ai/assets/js/custom.js", loading_method="defer")
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 5c8dd9fcc..1fd04ad19 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,18 +1,81 @@
 
 Welcome to cuCIM's documentation!
 ====================================
-cuCIM is a an extensible toolkit designed to provide GPU-accelearted I/O,
-computer vision and image processing primitives for N-Dimensional images
-with a focus on biomedical imaging.  Our API mirrors `scikit-image
+cuCIM (Compute Unified Device Architecture Clara IMage) is an open-source, accelerated computer vision and image processing software library for multidimensional images used in biomedical, geospatial, material and life science, and remote sensing use cases.
+
+cuCIM provides GPU-accelerated I/O,
+computer vision and image processing primitives for N-Dimensional images including:
+
+- color conversion
+- exposure
+- feature extraction
+- filters
+- measure
+- metrics
+- morphology
+- registration
+- restoration
+- segmentation
+- transforms
+
+cuCIM supports the following formats:
+
+-  Aperio ScanScope Virtual Slide (SVS)
+-  Philips TIFF
+-  Generic Tiled, Multi-resolution RGB TIFF files with the following
+   compression schemes:
+
+   -  No Compression
+   -  JPEG
+   -  JPEG2000
+   -  Lempel-Ziv-Welch (LZW)
+   -  Deflate
+
+Our API mirrors `scikit-image
 <https://scikit-image.org/>`_ for image manipulation and `OpenSlide
 <https://openslide.org/>`_ for image loading.
 
-cuCIM is fully open sourced under the Apache-2.0 license, and the Clara and RAPIDS teams welcomes new and seasoned
-contributors, users and hobbyists! Thank you for your wonderful support!
+cuCIM is interoperable with the following workflows:
+
+- Albumentations
+- CuPy
+- Data Loading Library (DALI)
+- JFX
+- MONAI
+- Numba
+- NumPy
+- PyTorch
+- TensorFlow
+- Triton
+
+cuCIM is fully open sourced under the Apache-2.0 license, and the Clara
+and RAPIDS teams welcome new and seasoned contributors, users and
+hobbyists! You may download cuCIM via Anaconda `Conda`_ or `PyPI`_. Thank
+you for your wonderful support! Below, we provide some resources to help
+get you started.
+
+**Blogs**
+
+-  `Accelerating Scikit-Image API with cuCIM: n-Dimensional Image
+   Processing and IO on GPUs`_
+-  `Accelerating Digital Pathology Pipelines with NVIDIA Clara™ Deploy`_
+
+**Webinars**
+
+-  `cuCIM: a GPU Image IO and Processing Library`_
+
+.. _Conda: https://anaconda.org/rapidsai-nightly/cucim
+.. _PyPi: https://pypi.org/project/cucim/
+.. _`Accelerating Scikit-Image API with cuCIM: n-Dimensional Image Processing and IO on GPUs`: https://developer.nvidia.com/blog/cucim-rapid-n-dimensional-image-processing-and-i-o-on-gpus/
+.. _Accelerating Digital Pathology Pipelines with NVIDIA Clara™ Deploy: https://developer.nvidia.com/blog/accelerating-digital-pathology-pipelines-with-nvidia-clara-deploy-2/
+.. _`cuCIM: a GPU Image IO and Processing Library`: https://www.youtube.com/watch?v=G46kOOM9xbQ
+
+
+Contents
+==================
 
 .. toctree::
    :maxdepth: 4
-   :caption: Contents:
 
    api.rst
 
diff --git a/python/cucim/CHANGELOG.md b/python/cucim/CHANGELOG.md
index d29d31b41..81d223e82 100644
--- a/python/cucim/CHANGELOG.md
+++ b/python/cucim/CHANGELOG.md
@@ -1,6 +1,30 @@
 
 # Changelog (See [Release Notes](https://github.com/rapidsai/cucim/wiki/Release-Notes))
 
+## [22.06.00](https://github.com/rapidsai/cucim/wiki/release_notes_v22.06.00)
+
+- [Update/Breaking] Promote small integer types to single rather than double precision ([#278](https://github.com/rapidsai/cucim/pull/278)) [@grlee77](https://github.com/grlee77)
+- [Bug] Populate correct channel names for RGBA image ([#294](https://github.com/rapidsai/cucim/pull/294)) [@gigony](https://github.com/gigony)
+- [Bug] Merge branch-22.04 into branch-22.06 ([#258](https://github.com/rapidsai/cucim/pull/258)) [@jakirkham](https://github.com/jakirkham)
+- [New] add missing `cucim.skimage.segmentation.clear_border` function ([#267](https://github.com/rapidsai/cucim/pull/267)) [@grlee77](https://github.com/grlee77)
+- [New] add `cucim.core.operations.color.stain_extraction_pca` and `cucim.core.operations.color.normalize_colors_pca` for digital pathology H&E stain extraction and normalization ([#273](https://github.com/rapidsai/cucim/pull/273)) [@grlee77](https://github.com/grlee77), [@drbeh](https://github.com/drbeh)
+- [Update] Update to use DLPack v0.6 ([#295](https://github.com/rapidsai/cucim/pull/295)) [@gigony](https://github.com/gigony)
+- [Update] Remove plugin-related messages temporarily ([#291](https://github.com/rapidsai/cucim/pull/291)) [@gigony](https://github.com/gigony)
+- [Update] Simplify recipes ([#286](https://github.com/rapidsai/cucim/pull/286)) [@Ethyling](https://github.com/Ethyling)
+- [Update] Use cupy.fuse to improve efficiency hessian_matrix_eigvals ([#280](https://github.com/rapidsai/cucim/pull/280)) [@grlee77](https://github.com/grlee77)
+- [Update] improve efficiency of histogram-based thresholding functions ([#276](https://github.com/rapidsai/cucim/pull/276)) [@grlee77](https://github.com/grlee77)
+- [Update] Remove unused dependencies in GPU tests job ([#268](https://github.com/rapidsai/cucim/pull/268)) [@Ethyling](https://github.com/Ethyling)
+- [Update] Enable footprint decomposition for morphology ([#274](https://github.com/rapidsai/cucim/pull/274)) [@grlee77](https://github.com/grlee77)
+- [Update] Use conda compilers ([#232](https://github.com/rapidsai/cucim/pull/232)) [@Ethyling](https://github.com/Ethyling)
+- [Update] Build packages using mambabuild ([#216](https://github.com/rapidsai/cucim/pull/216)) [@Ethyling](https://github.com/Ethyling)
+- [Doc] update outdated links to example data ([#289](https://github.com/rapidsai/cucim/pull/289)) [@grlee77](https://github.com/grlee77)
+- [Doc] Add missing API docs ([#275](https://github.com/rapidsai/cucim/pull/275)) [@grlee77](https://github.com/grlee77)
+
+## [22.04.00](https://github.com/rapidsai/cucim/wiki/release_notes_v22.04.00)
+
+- [Bug] Fix ImportError from vendored code ([#252](https://github.com/rapidsai/cucim/pull/252)) [@grlee77](https://github.com/grlee77)
+- [Bug] Fix wrong dimension in metadata ([#248](https://github.com/rapidsai/cucim/pull/248)) [@gigony](https://github.com/gigony)
+
 ## [22.02.01](https://github.com/rapidsai/cucim/wiki/release_notes_v22.02.01)
 
 - [Bug] Check nullptr of handler in CuFileDriver::close() ([#229](https://github.com/rapidsai/cucim/pull/229)) [@gigony](https://github.com/gigony)
diff --git a/python/cucim/README.md b/python/cucim/README.md
index 94556f23c..8f1413137 100644
--- a/python/cucim/README.md
+++ b/python/cucim/README.md
@@ -6,10 +6,13 @@
 
 **NOTE:** For the latest stable [README.md](https://github.com/rapidsai/cucim/blob/main/README.md) ensure you are on the `main` branch.
 
-- [GTC 2021 cuCIM: A GPU Image I/O and Processing Toolkit [S32194]](https://www.nvidia.com/en-us/on-demand/search/?facet.mimetype[]=event%20session&layout=list&page=1&q=cucim&sort=date)
-  - [video](https://www.nvidia.com/en-us/on-demand/session/gtcspring21-s32194/)
+- [GTC 2022 Accelerating Storage IO to GPUs with Magnum IO [S41347]](https://events.rainfocus.com/widget/nvidia/gtcspring2022/sessioncatalog/session/1634960000577001Etxp)
+  - cuCIM's GDS API examples: <https://github.com/NVIDIA/MagnumIO/tree/main/gds/readers/cucim-gds>
 - [SciPy 2021 cuCIM - A GPU image I/O and processing library](https://www.scipy2021.scipy.org/)
   - [video](https://youtu.be/G46kOOM9xbQ)
+- [GTC 2021 cuCIM: A GPU Image I/O and Processing Toolkit [S32194]](https://www.nvidia.com/en-us/on-demand/search/?facet.mimetype[]=event%20session&layout=list&page=1&q=cucim&sort=date)
+  - [video](https://www.nvidia.com/en-us/on-demand/session/gtcspring21-s32194/)
+
 
 ## Quick Start
 
@@ -24,7 +27,7 @@ pip install scipy scikit-image cupy-cuda110
 
 ### Jupyter Notebooks
 
-Please check out our [Welcome](https://github.com/rapidsai/cucim/blob/branch-22.06/notebooks/Welcome.ipynb) notebook.
+Please check out our [Welcome](https://github.com/rapidsai/cucim/blob/branch-22.08/notebooks/Welcome.ipynb) notebook.
 
 ### Open Image
 
@@ -97,15 +100,15 @@ visualize(region)
 
 Aperio SVS (.svs) image format is supported since cuCIM [v21.10.01](https://github.com/rapidsai/cucim/wiki/release_notes_v21.10.01#support-aperio-svs-svs).
 
-Please check [this notebook](https://nbviewer.org/github/rapidsai/cucim/blob/branch-22.06/notebooks/Supporting_Aperio_SVS_Format.ipynb) to see how to use the feature.
+Please check [this notebook](https://nbviewer.org/github/rapidsai/cucim/blob/branch-22.08/notebooks/Supporting_Aperio_SVS_Format.ipynb) to see how to use the feature.
 
 ### Using Cache
 
-Please look at this [notebook](https://nbviewer.jupyter.org/github/rapidsai/cucim/blob/branch-22.06/notebooks/Using_Cache.ipynb).
+Please look at this [notebook](https://nbviewer.jupyter.org/github/rapidsai/cucim/blob/branch-22.08/notebooks/Using_Cache.ipynb).
 
 ### Accessing File with GDS
 
-Please look at this [notebook](https://nbviewer.jupyter.org/github/rapidsai/cucim/blob/branch-22.06/notebooks/Accessing_File_with_GDS.ipynb).
+Please look at this [notebook](https://nbviewer.jupyter.org/github/rapidsai/cucim/blob/branch-22.08/notebooks/Accessing_File_with_GDS.ipynb).
 
 ### NVTX Support for Performance Analysis
 
@@ -178,4 +181,4 @@ is used in this project.
 
 Apache-2.0 License (see `LICENSE` file).
 
-Copyright (c) 2020-2021, NVIDIA CORPORATION.
+Copyright (c) 2020-2022, NVIDIA CORPORATION.
diff --git a/python/cucim/VERSION b/python/cucim/VERSION
index ddc883c42..231fac64b 100644
--- a/python/cucim/VERSION
+++ b/python/cucim/VERSION
@@ -1 +1 @@
-22.06.00
+22.08.00
diff --git a/python/cucim/docs/getting_started/index.md b/python/cucim/docs/getting_started/index.md
index 200c9fc3b..cede687cf 100644
--- a/python/cucim/docs/getting_started/index.md
+++ b/python/cucim/docs/getting_started/index.md
@@ -14,15 +14,15 @@
 
 ## Installation
 
-Please download the latest SDK package (`cuCIM-v22.06.00-linux.tar.gz`).
+Please download the latest SDK package (`cuCIM-v22.08.00-linux.tar.gz`).
 
 Untar the downloaded file.
 
 ```bash
-mkdir -p cuCIM-v22.06.00
-tar -xzvf cuCIM-v22.06.00-linux.tar.gz -C cuCIM-v22.06.00
+mkdir -p cuCIM-v22.08.00
+tar -xzvf cuCIM-v22.08.00-linux.tar.gz -C cuCIM-v22.08.00
 
-cd cuCIM-v22.06.00
+cd cuCIM-v22.08.00
 ```
 
 ## Run command
@@ -147,7 +147,7 @@ Its execution would show some metadata information and create two files -- `outp
 ```
 $ ./bin/tiff_image notebooks/input/image.tif .
 [Plugin: cucim.kit.cuslide] Loading...
-[Plugin: cucim.kit.cuslide] Loading the dynamic library from: cucim.kit.cuslide@22.06.00.so
+[Plugin: cucim.kit.cuslide] Loading the dynamic library from: cucim.kit.cuslide@22.08.00.so
 [Plugin: cucim.kit.cuslide] loaded successfully. Version: 0
 Initializing plugin: cucim.kit.cuslide (interfaces: [cucim::io::IImageFormat v0.1]) (impl: cucim.kit.cuslide)
 is_loaded: true
diff --git a/python/cucim/docs/index.md b/python/cucim/docs/index.md
index 4f03922c3..e9c4df2a4 100644
--- a/python/cucim/docs/index.md
+++ b/python/cucim/docs/index.md
@@ -18,7 +18,7 @@ development/index -->
 
 # cuCIM Documentation
 
-Current latest version is [Version 22.06.00](release_notes/v22.06.00.md).
+Current latest version is [Version 22.08.00](release_notes/v22.08.00.md).
 
 **cuCIM** a toolkit to provide GPU accelerated I/O, image  processing & computer vision primitives for N-Dimensional images with a focus on biomedical imaging.
 
diff --git a/python/cucim/requirements-test.txt b/python/cucim/requirements-test.txt
index ffdcf7092..6d62d5a29 100644
--- a/python/cucim/requirements-test.txt
+++ b/python/cucim/requirements-test.txt
@@ -1,8 +1,8 @@
-GPUtil==1.4.0
-imagecodecs==2021.6.8
-openslide-python==1.1.2
-psutil==5.8.0
-pytest==6.2.4
-pytest-cov==2.12.1
-pytest-lazy-fixture==0.6.3
-tifffile==2021.7.2
+GPUtil>=1.4.0
+imagecodecs>=2021.6.8
+openslide-python>=1.1.2
+psutil>=5.8.0
+pytest>=6.2.4
+pytest-cov>=2.12.1
+pytest-lazy-fixture>=0.6.3
+tifffile>=2022.7.28
diff --git a/python/cucim/setup.cfg b/python/cucim/setup.cfg
index 517f33fff..7743540c2 100644
--- a/python/cucim/setup.cfg
+++ b/python/cucim/setup.cfg
@@ -65,6 +65,6 @@ line_length = 80
 known_first_party = cucim
 default_section = THIRDPARTY
 forced_separate = test_cucim
-skip = .tox,.eggs,ci/templates,build,dist,versioneer.py
+skip = .tox,.eggs,ci/templates,build,dist,versioneer.py,ndimage.py
 sections = FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
 multi_line_output = GRID
diff --git a/python/cucim/src/cucim/core/__init__.py b/python/cucim/src/cucim/core/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/cucim/src/cucim/core/operations/color/stain_normalizer.py b/python/cucim/src/cucim/core/operations/color/stain_normalizer.py
index f5141ede2..3e82b26b8 100644
--- a/python/cucim/src/cucim/core/operations/color/stain_normalizer.py
+++ b/python/cucim/src/cucim/core/operations/color/stain_normalizer.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import math
-import warnings
 from typing import Union
 
 import cupy as cp
@@ -56,12 +55,9 @@ def image_to_absorbance(image, source_intensity=255.0, dtype=cp.float32):
     -----
     If `image` has an integer dtype it will be clipped to range
     ``[1, source_intensity]``, while float image inputs are clipped to range
-    ``[source_intensity/255, source_intensity]. The minimum is to avoid log(0).
-    Absorbance is then given by
-
-    .. math::
-
-        absorbance = \\log{\\frac{image}{source_intensity}}.
+    ``[source_intensity/255, source_intensity]``.
+    The minimum is to avoid log(0). Absorbance is then given by
+    ``absorbance = log(image / source_intensity)``.
     """
     dtype = cp.dtype(dtype)
     if dtype.kind != "f":
@@ -209,9 +205,7 @@ def _covariance(a):
     ddof = 1
     fact = X.shape[1] - ddof
     if fact <= 0:
-        warnings.warn("Degrees of freedom <= 0 for slice",
-                      RuntimeWarning, stacklevel=2)
-        fact = 0.0
+        raise RuntimeError("Degrees of freedom <= 0")
 
     X -= X.mean(axis=1, keepdims=True)
     if not X.flags.f_contiguous:
@@ -314,10 +308,10 @@ def stain_extraction_pca(image, source_intensity=240, alpha=1, beta=0.345,
     )
 
     # remove transparent pixels
-    absorbance = absorbance[:, cp.all(absorbance > beta, axis=0)]
-    if absorbance.size == 0:
+    absorbance = absorbance[:, cp.any(absorbance > beta, axis=0)]
+    if absorbance.size == 0 or absorbance.shape[1] <= 1:
         raise ValueError(
-            "All pixels of the input image are below the threshold."
+            "Multiple pixels of the input must be above the `beta` threshold."
         )
 
     # compute eigenvectors (do small 3x3 matrix calculations on the host)
@@ -383,7 +377,10 @@ def _get_raw_concentrations(src_stain_coeff, absorbance):
 def _normalized_from_concentrations(conc_raw, max_percentile, ref_stain_coeff,
                                     ref_max_conc, source_intensity,
                                     original_shape, channel_axis):
-    """Determine normalized image from concentrations."""
+    """Determine normalized image from concentrations.
+
+    Note: This function will also modify conc_raw in-place.
+    """
 
     # verify conc_raw is shape (2, n_pixels)
     if conc_raw.ndim != 2 or conc_raw.shape[0] != 2:
@@ -410,10 +407,10 @@ def _normalized_from_concentrations(conc_raw, max_percentile, ref_stain_coeff,
          for ch_raw in conc_raw]
     )
     normalization_factors = ref_max_conc / max_conc
-    conc_norm = conc_raw * normalization_factors[:, cp.newaxis]
+    conc_raw *= normalization_factors[:, cp.newaxis]
 
     # reconstruct the image based on the reference stain matrix
-    absorbance_norm = ref_stain_coeff.dot(conc_norm)
+    absorbance_norm = ref_stain_coeff.dot(conc_raw)
     image_norm = absorbance_to_image(
         absorbance_norm, source_intensity=source_intensity, dtype=np.uint8
     )
@@ -527,8 +524,8 @@ def normalize_colors_pca(
     image_norm = _normalized_from_concentrations(
         conc_raw=conc_raw,
         max_percentile=100 - alpha,
-        ref_max_conc=cp.asarray(ref_max_conc),
-        ref_stain_coeff=cp.asarray(ref_stain_coeff),
+        ref_max_conc=cp.asarray(ref_max_conc, dtype=conc_raw.dtype),
+        ref_stain_coeff=cp.asarray(ref_stain_coeff, dtype=conc_raw.dtype),
         source_intensity=source_intensity,
         channel_axis=channel_axis,
         original_shape=image.shape,
diff --git a/python/cucim/src/cucim/core/operations/morphology/__init__.py b/python/cucim/src/cucim/core/operations/morphology/__init__.py
new file mode 100644
index 000000000..a9c676edb
--- /dev/null
+++ b/python/cucim/src/cucim/core/operations/morphology/__init__.py
@@ -0,0 +1,3 @@
+from ._distance_transform import distance_transform_edt
+
+__all__ = ["distance_transform_edt"]
diff --git a/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py
new file mode 100644
index 000000000..70523c5bf
--- /dev/null
+++ b/python/cucim/src/cucim/core/operations/morphology/_distance_transform.py
@@ -0,0 +1,183 @@
+import numpy as np
+
+from ._pba_2d import _pba_2d
+from ._pba_3d import _pba_3d
+
+# TODO: support sampling distances
+#       support the distances and indices output arguments
+#       support chamfer, chessboard and l1/manhattan distances too?
+
+
+def distance_transform_edt(image, sampling=None, return_distances=True,
+                           return_indices=False, distances=None, indices=None,
+                           *, block_params=None, float64_distances=False):
+    r"""Exact Euclidean distance transform.
+
+    This function calculates the distance transform of the `image`, by
+    replacing each foreground (non-zero) element, with its shortest distance to
+    the background (any zero-valued element).
+
+    In addition to the distance transform, the feature transform can be
+    calculated. In this case the index of the closest background element to
+    each foreground element is returned in a separate array.
+
+    Parameters
+    ----------
+    image : array_like
+        Input data to transform. Can be any type but will be converted into
+        binary: 1 wherever image equates to True, 0 elsewhere.
+    sampling : float, or sequence of float, optional
+        Spacing of elements along each dimension. If a sequence, must be of
+        length equal to the image rank; if a single number, this is used for
+        all axes. If not specified, a grid spacing of unity is implied.
+    return_distances : bool, optional
+        Whether to calculate the distance transform.
+    return_indices : bool, optional
+        Whether to calculate the feature transform.
+    distances : float32 cupy.ndarray, optional
+        An output array to store the calculated distance transform, instead of
+        returning it. `return_distances` must be True. It must be the same
+        shape as `image`.
+    indices : int32 cupy.ndarray, optional
+        An output array to store the calculated feature transform, instead of
+        returning it. `return_indices` must be True. Its shape must be
+        `(image.ndim,) + image.shape`.
+
+    Other Parameters
+    ----------------
+    block_params : 3-tuple of int
+        The m1, m2, m3 algorithm parameters as described in [2]_. If None,
+        suitable defaults will be chosen. Note: This parameter is specific to
+        cuCIM and does not exist in SciPy.
+    float64_distances : bool, optional
+        If True, use double precision in the distance computation (to match
+        SciPy behavior). Otherwise, single precision will be used for
+        efficiency. Note: This parameter is specific to cuCIM and does not
+        exist in SciPy.
+
+    Returns
+    -------
+    distances : float32 or float64 ndarray, optional
+        The calculated distance transform. Returned only when
+        `return_distances` is True and `distances` is not supplied. It will
+        have the same shape as `image`.
+    indices : int32 ndarray, optional
+        The calculated feature transform. It has an image-shaped array for each
+        dimension of the image. See example below. Returned only when
+        `return_indices` is True and `indices` is not supplied.
+
+    Notes
+    -----
+    The Euclidean distance transform gives values of the Euclidean distance.
+
+    .. math::
+
+      y_i = \sqrt{\sum_{i}^{n} (x[i] - b[i])^2}
+
+    where :math:`b[i]` is the background point (value 0) with the smallest
+    Euclidean distance to input points :math:`x[i]`, and :math:`n` is the
+    number of dimensions.
+
+    Note that the `indices` output may differ from the one given by
+    :func:`scipy.ndimage.distance_transform_edt` in the case of input pixels
+    that are equidistant from multiple background points.
+
+    The parallel banding algorithm implemented here was originally described in
+    [1]_. The kernels used here correspond to the revised PBA+ implementation
+    that is described on the author's website [2]_. The source code of the
+    author's PBA+ implementation is available at [3]_.
+
+    References
+    ----------
+    .. [1] Thanh-Tung Cao, Ke Tang, Anis Mohamed, and Tiow-Seng Tan. 2010.
+        Parallel Banding Algorithm to compute exact distance transform with the
+        GPU. In Proceedings of the 2010 ACM SIGGRAPH symposium on Interactive
+        3D Graphics and Games (I3D ’10). Association for Computing Machinery,
+        New York, NY, USA, 83–90.
+        DOI:https://doi.org/10.1145/1730804.1730818
+    .. [2] https://www.comp.nus.edu.sg/~tants/pba.html
+    .. [3] https://github.com/orzzzjq/Parallel-Banding-Algorithm-plus
+
+    Examples
+    --------
+    >>> import cupy as cp
+    >>> from cucim.core.operations import morphology
+    >>> a = cp.array(([0,1,1,1,1],
+    ...               [0,0,1,1,1],
+    ...               [0,1,1,1,1],
+    ...               [0,1,1,1,0],
+    ...               [0,1,1,0,0]))
+    >>> morphology.distance_transform_edt(a)
+    array([[ 0.    ,  1.    ,  1.4142,  2.2361,  3.    ],
+           [ 0.    ,  0.    ,  1.    ,  2.    ,  2.    ],
+           [ 0.    ,  1.    ,  1.4142,  1.4142,  1.    ],
+           [ 0.    ,  1.    ,  1.4142,  1.    ,  0.    ],
+           [ 0.    ,  1.    ,  1.    ,  0.    ,  0.    ]])
+
+    With a sampling of 2 units along x, 1 along y:
+
+    >>> morphology.distance_transform_edt(a, sampling=[2,1])
+    array([[ 0.    ,  1.    ,  2.    ,  2.8284,  3.6056],
+           [ 0.    ,  0.    ,  1.    ,  2.    ,  3.    ],
+           [ 0.    ,  1.    ,  2.    ,  2.2361,  2.    ],
+           [ 0.    ,  1.    ,  2.    ,  1.    ,  0.    ],
+           [ 0.    ,  1.    ,  1.    ,  0.    ,  0.    ]])
+
+    Asking for indices as well:
+
+    >>> edt, inds = morphology.distance_transform_edt(a, return_indices=True)
+    >>> inds
+    array([[[0, 0, 1, 1, 3],
+            [1, 1, 1, 1, 3],
+            [2, 2, 1, 3, 3],
+            [3, 3, 4, 4, 3],
+            [4, 4, 4, 4, 4]],
+           [[0, 0, 1, 1, 4],
+            [0, 1, 1, 1, 4],
+            [0, 0, 1, 4, 4],
+            [0, 0, 3, 3, 4],
+            [0, 0, 3, 3, 4]]])
+
+    """
+    if distances is not None:
+        raise NotImplementedError(
+            "preallocated distances image is not supported"
+        )
+    if indices is not None:
+        raise NotImplementedError(
+            "preallocated indices image is not supported"
+        )
+    scalar_sampling = None
+    if sampling is not None:
+        sampling = np.unique(np.atleast_1d(sampling))
+        if len(sampling) == 1:
+            scalar_sampling = float(sampling)
+            sampling = None
+        else:
+            raise NotImplementedError(
+                "non-uniform values in sampling is not currently supported"
+            )
+
+    if image.ndim == 3:
+        pba_func = _pba_3d
+    elif image.ndim == 2:
+        pba_func = _pba_2d
+    else:
+        raise NotImplementedError(
+            "Only 2D and 3D distance transforms are supported.")
+
+    vals = pba_func(
+        image,
+        sampling=sampling,
+        return_distances=return_distances,
+        return_indices=return_indices,
+        block_params=block_params
+    )
+
+    if return_distances and scalar_sampling is not None:
+        vals = (vals[0] * scalar_sampling,) + vals[1:]
+
+    if len(vals) == 1:
+        vals = vals[0]
+
+    return vals
diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py
new file mode 100644
index 000000000..edb10a983
--- /dev/null
+++ b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py
@@ -0,0 +1,300 @@
+import math
+import os
+
+import cupy
+
+pba2d_defines_template = """
+
+// MARKER is used to mark blank pixels in the texture.
+// Any uncolored pixels will have x = MARKER.
+// Input texture should have x = MARKER for all pixels other than sites
+#define MARKER      {marker}
+#define BLOCKSIZE   {block_size_2d}
+#define pixel_int2_t {pixel_int2_t}                // typically short2 (int2 for images with > 32k pixels per side)
+#define make_pixel(x, y)  {make_pixel_func}(x, y)  // typically make_short2 (make_int2 images with > 32k pixels per side
+
+"""  # noqa
+
+
+def _init_marker(int_dtype):
+    """use a minimum value that is appropriate to the integer dtype"""
+    if int_dtype == cupy.int16:
+        # marker = cupy.iinfo(int_dtype).min
+        marker = -32768
+    elif int_dtype == cupy.int32:
+        # divide by two so we don't have to promote other intermediate int
+        # variables to 64-bit int
+        marker = -2147483648 // 2
+    else:
+        raise ValueError(
+            "expected int_dtype to be either cupy.int16 or cupy.int32"
+        )
+    return marker
+
+
+@cupy.memoize(True)
+def get_pba2d_src(block_size_2d=64, marker=-32768, pixel_int2_t='short2'):
+    make_pixel_func = 'make_' + pixel_int2_t
+
+    pba2d_code = pba2d_defines_template.format(
+        block_size_2d=block_size_2d,
+        marker=marker,
+        pixel_int2_t=pixel_int2_t,
+        make_pixel_func=make_pixel_func
+    )
+    kernel_directory = os.path.join(os.path.dirname(__file__), 'cuda')
+    with open(os.path.join(kernel_directory, 'pba_kernels_2d.h'), 'rt') as f:
+        pba2d_kernels = '\n'.join(f.readlines())
+
+    pba2d_code += pba2d_kernels
+    return pba2d_code
+
+
+def _get_block_size(check_warp_size=False):
+    if check_warp_size:
+        dev = cupy.cuda.runtime.getDevice()
+        device_properties = cupy.cuda.runtime.getDeviceProperties(dev)
+        return int(device_properties['warpSize'])
+    else:
+        return 32
+
+
+def _pack_int2(arr, marker=-32768, int_dtype=cupy.int16):
+    if arr.ndim != 2:
+        raise ValueError("only 2d arr suppported")
+    input_x = cupy.zeros(arr.shape, dtype=int_dtype)
+    input_y = cupy.zeros(arr.shape, dtype=int_dtype)
+    # TODO: create custom kernel for setting values in input_x, input_y
+    cond = arr == 0
+    y, x = cupy.where(cond)
+    input_x[cond] = x
+    mask = arr != 0
+    input_x[mask] = marker  # 1 << 32
+    input_y[cond] = y
+    input_y[mask] = marker  # 1 << 32
+    int2_dtype = cupy.dtype({'names': ['x', 'y'], 'formats': [int_dtype] * 2})
+    # in C++ code x is the contiguous axis and corresponds to width
+    #             y is the non-contiguous axis and corresponds to height
+    # given that, store input_x as the last axis here
+    return cupy.squeeze(
+        cupy.stack((input_x, input_y), axis=-1).view(int2_dtype)
+    )
+
+
+def _unpack_int2(img, make_copy=False, int_dtype=cupy.int16):
+    temp = img.view(int_dtype).reshape(img.shape + (2,))
+    if make_copy:
+        temp = temp.copy()
+    return temp
+
+
+def _determine_padding(shape, padded_size, block_size):
+    # all kernels assume equal size along both axes, so pad up to equal size if
+    # shape is not isotropic
+    orig_sy, orig_sx = shape
+    if orig_sx != padded_size or orig_sy != padded_size:
+        padding_width = (
+            (0, padded_size - orig_sy), (0, padded_size - orig_sx)
+        )
+    else:
+        padding_width = None
+    return padding_width
+
+
+def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False,
+            block_params=None, check_warp_size=False, *,
+            float64_distances=False):
+
+    # input_arr: a 2D image
+    #    For each site at (x, y), the pixel at coordinate (x, y) should contain
+    #    the pair (x, y). Pixels that are not sites should contain the pair
+    #    (MARKER, MARKER)
+
+    # Note: could query warp size here, but for now just assume 32 to avoid
+    #       overhead of querying properties
+    block_size = _get_block_size(check_warp_size)
+
+    if sampling is not None:
+        raise NotImplementedError("sampling not yet supported")
+        # if len(sampling) != 2:
+        #     raise ValueError("sampling must be a sequence of two values.")
+
+    padded_size = math.ceil(max(arr.shape) / block_size) * block_size
+    if block_params is None:
+        # should be <= size / 64. sy must be a multiple of m1
+        m1 = max(1, min(padded_size // block_size, 32))
+
+        # size must be a multiple of m2
+        m2 = max(1, min(padded_size // block_size, 32))
+        # m2 must also be a power of two
+        m2 = 2**math.floor(math.log2(m2))
+        if padded_size % m2 != 0:
+            raise RuntimeError("error in setting default m2")
+
+        # should be <= 64. texture size must be a multiple of m3
+        m3 = min(min(m1, m2), 2)
+    else:
+        m1, m2, m3 = block_params
+
+    if m1 > padded_size // block_size:
+        raise ValueError("m1 too large. must be <= arr.shape[0] // 32")
+    if m2 > padded_size // block_size:
+        raise ValueError("m2 too large. must be <= arr.shape[1] // 32")
+    for m in (m1, m2, m3):
+        if padded_size % m != 0:
+            raise ValueError(
+                f"Largest dimension of image ({padded_size}) must be evenly "
+                f"disivible by each element of block_params: {(m1, m2, m3)}."
+            )
+
+    shape_max = max(arr.shape)
+    if shape_max <= 32768:
+        int_dtype = cupy.int16
+        pixel_int2_type = 'short2'
+    else:
+        if shape_max > (1 << 24):
+            # limit coordinate range to 2**24 due to use of __mul24 in
+            # coordinate TOID macro
+            raise ValueError(
+                f"maximum axis size of {1 << 24} exceeded, for image with "
+                f"shape {arr.shape}"
+            )
+        int_dtype = cupy.int32
+        pixel_int2_type = 'int2'
+
+    marker = _init_marker(int_dtype)
+
+    orig_sy, orig_sx = arr.shape
+    padding_width = _determine_padding(arr.shape, padded_size, block_size)
+    if padding_width is not None:
+        arr = cupy.pad(arr, padding_width, mode='constant', constant_values=1)
+    size = arr.shape[0]
+
+    input_arr = _pack_int2(arr, marker=marker, int_dtype=int_dtype)
+    output = cupy.zeros_like(input_arr)
+
+    int2_dtype = cupy.dtype({'names': ['x', 'y'], 'formats': [int_dtype] * 2})
+    margin = cupy.empty((2 * m1 * size,), dtype=int2_dtype)
+
+    # phase 1 of PBA. m1 must divide texture size and be <= 64
+    pba2d = cupy.RawModule(
+        code=get_pba2d_src(
+            block_size_2d=block_size,
+            marker=marker,
+            pixel_int2_t=pixel_int2_type,
+        )
+    )
+    kernelFloodDown = pba2d.get_function('kernelFloodDown')
+    kernelFloodUp = pba2d.get_function('kernelFloodUp')
+    kernelPropagateInterband = pba2d.get_function('kernelPropagateInterband')
+    kernelUpdateVertical = pba2d.get_function('kernelUpdateVertical')
+    kernelProximatePoints = pba2d.get_function('kernelProximatePoints')
+    kernelCreateForwardPointers = pba2d.get_function(
+        'kernelCreateForwardPointers'
+    )
+    kernelMergeBands = pba2d.get_function('kernelMergeBands')
+    kernelDoubleToSingleList = pba2d.get_function('kernelDoubleToSingleList')
+    kernelColor = pba2d.get_function('kernelColor')
+
+    block = (block_size, 1, 1)
+    grid = (math.ceil(size / block[0]), m1, 1)
+    bandSize1 = size // m1
+    # kernelFloodDown modifies input_arr in-place
+    kernelFloodDown(
+        grid,
+        block,
+        (input_arr, input_arr, size, bandSize1),
+    )
+    # kernelFloodUp modifies input_arr in-place
+    kernelFloodUp(
+        grid,
+        block,
+        (input_arr, input_arr, size, bandSize1),
+    )
+    # kernelPropagateInterband fills values into margin
+    kernelPropagateInterband(
+        grid,
+        block,
+        (input_arr, margin, size, bandSize1),
+    )
+    # kernelUpdateVertical stores output into an intermediate array of
+    # transposed shape
+    kernelUpdateVertical(
+        grid,
+        block,
+        (input_arr, margin, output, size, bandSize1),
+    )
+
+    # phase 2
+    block = (block_size, 1, 1)
+    grid = (math.ceil(size / block[0]), m2, 1)
+    bandSize2 = size // m2
+    kernelProximatePoints(
+        grid,
+        block,
+        (output, input_arr, size, bandSize2),
+    )
+    kernelCreateForwardPointers(
+        grid,
+        block,
+        (input_arr, input_arr, size, bandSize2),
+    )
+    # Repeatedly merging two bands into one
+    noBand = m2
+    while noBand > 1:
+        grid = (math.ceil(size / block[0]), noBand // 2)
+        kernelMergeBands(
+            grid,
+            block,
+            (output, input_arr, input_arr, size, size // noBand),
+        )
+        noBand //= 2
+    # Replace the forward link with the X coordinate of the seed to remove
+    # the need of looking at the other texture. We need it for coloring.
+    grid = (math.ceil(size / block[0]), size)
+    kernelDoubleToSingleList(
+        grid,
+        block,
+        (output, input_arr, input_arr, size),
+    )
+
+    # Phase 3 of PBA
+    block = (block_size, m3, 1)
+    grid = (math.ceil(size / block[0]), 1, 1)
+    kernelColor(
+        grid,
+        block,
+        (input_arr, output, size),
+    )
+
+    output = _unpack_int2(output, make_copy=False, int_dtype=int_dtype)
+    # make sure to crop any padding that was added here!
+    x = output[:orig_sy, :orig_sx, 0]
+    y = output[:orig_sy, :orig_sx, 1]
+
+    # raise NotImplementedError("TODO")
+    vals = ()
+    if return_distances:
+        # TODO: custom kernel for more efficient distance computation
+        y0, x0 = cupy.meshgrid(
+            *(
+                cupy.arange(s, dtype=cupy.int32)
+                for s in (orig_sy, orig_sx)
+            ),
+            indexing='ij',
+            sparse=True,
+        )
+        tmp = (x - x0)
+        dist = tmp * tmp
+        tmp = (y - y0)
+        dist += tmp * tmp
+        if float64_distances:
+            dist = cupy.sqrt(dist)
+        else:
+            dist = dist.astype(cupy.float32)
+            cupy.sqrt(dist, out=dist)
+        vals = vals + (dist,)
+    if return_indices:
+        indices = cupy.stack((y, x), axis=0)
+        vals = vals + (indices,)
+    return vals
diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
new file mode 100644
index 000000000..dab484e90
--- /dev/null
+++ b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
@@ -0,0 +1,311 @@
+import functools
+import math
+import numbers
+import os
+
+import cupy
+import numpy as np
+
+from ._pba_2d import _get_block_size
+
+try:
+    # math.lcm was introduced in Python 3.9
+    from math import lcm
+except ImportError:
+
+    """Fallback implementation of least common multiple (lcm)
+
+    TODO: remove once minimum Python requirement is >= 3.9
+    """
+
+    def _lcm(a, b):
+        return abs(b * (a // math.gcd(a, b)))
+
+    @functools.lru_cache()
+    def lcm(*args):
+        nargs = len(args)
+        if not all(isinstance(a, numbers.Integral) for a in args):
+            raise TypeError("all arguments must be integers")
+        if nargs == 0:
+            return 1
+        res = int(args[0])
+        if nargs == 1:
+            return abs(res)
+        for i in range(1, nargs):
+            x = int(args[i])
+            res = _lcm(res, x)
+        return res
+
+
+# Preprocessor constants placed at the top of the generated CUDA source. The
+# `{marker}`, `{max_int}` and `{block_size_3d}` placeholders are filled in by
+# `str.format` in `get_pba3d_src`.
+pba3d_defines_template = """
+
+#define MARKER     {marker}
+#define MAX_INT    {max_int}
+#define BLOCKSIZE  {block_size_3d}
+
+"""
+
+# For efficiency, the original PBA+ packs three 10-bit integers and two binary
+# flags into a single 32-bit integer. The defines in
+# `pba3d_defines_encode_32bit` handle this format.
+# Bit layout: x in bits 20-29, y in bits 10-19, z in bits 0-9, flag `a` (read
+# back by NOTSITE) in bit 31 and flag `b` (read back by HASNEXT) in bit 30.
+# NOTE: this is not a raw string, so Python consumes each backslash-newline
+# pair and the DECODE macro reaches the CUDA compiler as a single line.
+pba3d_defines_encode_32bit = """
+// Sites     : ENCODE(x, y, z, 0, 0)
+// Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER
+#define ENCODED_INT_TYPE int
+#define ZERO 0
+#define ONE 1
+#define ENCODE(x, y, z, a, b)  (((x) << 20) | ((y) << 10) | (z) | ((a) << 31) | ((b) << 30))
+#define DECODE(value, x, y, z) \
+    x = ((value) >> 20) & 0x3ff; \
+    y = ((value) >> 10) & 0x3ff; \
+    z = (value) & 0x3ff
+
+#define NOTSITE(value)  (((value) >> 31) & 1)
+#define HASNEXT(value)  (((value) >> 30) & 1)
+
+#define GET_X(value)    (((value) >> 20) & 0x3ff)
+#define GET_Y(value)    (((value) >> 10) & 0x3ff)
+#define GET_Z(value)    ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff))
+
+""" # noqa
+
+
+# 64bit version of ENCODE/DECODE to allow a 20-bit integer per coordinate axis.
+# Bit layout: x in bits 40-59, y in bits 20-39, z in bits 0-19, flag `a`
+# (NOTSITE) in bit 61 and flag `b` (HASNEXT) in bit 60.
+# NOTE(review): ZERO/ONE are `long` literals (0L/1L); shifting them by 60/61
+# is only well defined where `long` is 64 bits wide -- confirm that only such
+# platforms are targeted.
+pba3d_defines_encode_64bit = """
+// Sites     : ENCODE(x, y, z, 0, 0)
+// Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER
+#define ENCODED_INT_TYPE long long
+#define ZERO 0L
+#define ONE 1L
+#define ENCODE(x, y, z, a, b)  (((x) << 40) | ((y) << 20) | (z) | ((a) << 61) | ((b) << 60))
+#define DECODE(value, x, y, z) \
+    x = ((value) >> 40) & 0xfffff; \
+    y = ((value) >> 20) & 0xfffff; \
+    z = (value) & 0xfffff
+
+#define NOTSITE(value)  (((value) >> 61) & 1)
+#define HASNEXT(value)  (((value) >> 60) & 1)
+
+#define GET_X(value)    (((value) >> 40) & 0xfffff)
+#define GET_Y(value)    (((value) >> 20) & 0xfffff)
+#define GET_Z(value)    ((NOTSITE((value))) ? MAX_INT : ((value) & 0xfffff))
+
+""" # noqa
+
+
+@cupy.memoize(True)
+def get_pba3d_src(block_size_3d=32, marker=-2147483648, max_int=2147483647,
+                  size_max=1024):
+    pba3d_code = pba3d_defines_template.format(
+        block_size_3d=block_size_3d, marker=marker, max_int=max_int
+    )
+    if size_max > 1024:
+        pba3d_code += pba3d_defines_encode_64bit
+    else:
+        pba3d_code += pba3d_defines_encode_32bit
+    kernel_directory = os.path.join(os.path.dirname(__file__), 'cuda')
+    with open(os.path.join(kernel_directory, 'pba_kernels_3d.h'), 'rt') as f:
+        pba3d_kernels = '\n'.join(f.readlines())
+    pba3d_code += pba3d_kernels
+    return pba3d_code
+
+
+# TODO: custom kernel for encode3d
+def encode3d(arr, marker=-2147483648, bit_depth=32, size_max=1024):
+    if arr.ndim != 3:
+        raise ValueError("only 3d arr suppported")
+    if bit_depth not in [32, 64]:
+        raise ValueError("only bit_depth of 32 or 64 is supported")
+    if size_max > 1024:
+        dtype = np.int64
+    else:
+        dtype = np.int32
+    image = cupy.zeros(arr.shape, dtype=dtype, order='C')
+    cond = arr == 0
+    z, y, x = cupy.where(cond)
+    # z, y, x so that x is the contiguous axis
+    # (must match TOID macro in the C++ code!)
+    if size_max > 1024:
+        image[cond] = (((x) << 40) | ((y) << 20) | (z))
+    else:
+        image[cond] = (((x) << 20) | ((y) << 10) | (z))
+    image[arr != 0] = marker  # 1 << 32
+    return image
+
+
+# TODO: custom kernel for decode3d
+def decode3d(output, size_max=1024):
+    if size_max > 1024:
+        x = (output >> 40) & 0xfffff
+        y = (output >> 20) & 0xfffff
+        z = output & 0xfffff
+    else:
+        x = (output >> 20) & 0x3ff
+        y = (output >> 10) & 0x3ff
+        z = output & 0x3ff
+    return (x, y, z)
+
+
+def _determine_padding(shape, block_size, m1, m2, m3, blockx, blocky):
+    # TODO: can possibly revise to consider only particular factors for LCM on
+    #       a given axis
+    LCM = lcm(block_size, m1, m2, m3, blockx, blocky)
+    orig_sz, orig_sy, orig_sx = shape
+    round_up = False
+    if orig_sx % LCM != 0:
+        # round up size to a multiple of the band size
+        round_up = True
+        sx = LCM * math.ceil(orig_sx / LCM)
+    else:
+        sx = orig_sx
+    if orig_sy % LCM != 0:
+        # round up size to a multiple of the band size
+        round_up = True
+        sy = LCM * math.ceil(orig_sy / LCM)
+    else:
+        sy = orig_sy
+    if orig_sz % LCM != 0:
+        # round up size to a multiple of the band size
+        round_up = True
+        sz = LCM * math.ceil(orig_sz / LCM)
+    else:
+        sz = orig_sz
+
+    aniso = not (sx == sy == sz)
+    if aniso or round_up:
+        smax = max(sz, sy, sx)
+        padding_width = (
+            (0, smax - orig_sz), (0, smax - orig_sy), (0, smax - orig_sx)
+        )
+    else:
+        padding_width = None
+    return padding_width
+
+
+def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False,
+            block_params=None, check_warp_size=False, *,
+            float64_distances=False):
+    if arr.ndim != 3:
+        raise ValueError(f"expected a 3D array, got {arr.ndim}D")
+
+    if sampling is not None:
+        raise NotImplementedError("sampling not yet supported")
+        # if len(sampling) != 3:
+        #     raise ValueError("sampling must be a sequence of three values.")
+
+    if block_params is None:
+        m1 = 1
+        m2 = 1
+        m3 = 2
+    else:
+        m1, m2, m3 = block_params
+
+    # reduce blockx for small inputs
+    s_min = min(arr.shape)
+    if s_min <= 4:
+        blockx = 4
+    elif s_min <= 8:
+        blockx = 8
+    elif s_min <= 16:
+        blockx = 16
+    else:
+        blockx = 32
+    blocky = 4
+
+    block_size = _get_block_size(check_warp_size)
+
+    orig_sz, orig_sy, orig_sx = arr.shape
+    padding_width = _determine_padding(
+        arr.shape, block_size, m1, m2, m3, blockx, blocky
+    )
+    if padding_width is not None:
+        arr = cupy.pad(arr, padding_width, mode='constant', constant_values=1)
+    size = arr.shape[0]
+
+    # pba algorithm was implemented to use 32-bit integer to store compressed
+    # coordinates. input_arr will be C-contiguous, int32
+    size_max = max(arr.shape)
+    input_arr = encode3d(arr, size_max=size_max)
+    buffer_idx = 0
+    output = cupy.zeros_like(input_arr)
+    pba_images = [input_arr, output]
+
+    block = (blockx, blocky, 1)
+    grid = (size // block[0], size // block[1], 1)
+    pba3d = cupy.RawModule(
+        code=get_pba3d_src(block_size_3d=block_size, size_max=size_max)
+    )
+
+    kernelFloodZ = pba3d.get_function('kernelFloodZ')
+    kernelMaurerAxis = pba3d.get_function('kernelMaurerAxis')
+    kernelColorAxis = pba3d.get_function('kernelColorAxis')
+
+    kernelFloodZ(
+        grid,
+        block,
+        (pba_images[buffer_idx], pba_images[1 - buffer_idx], size)
+    )
+    buffer_idx = 1 - buffer_idx
+
+    block = (blockx, blocky, 1)
+    grid = (size // block[0], size // block[1], 1)
+    kernelMaurerAxis(
+        grid,
+        block,
+        (pba_images[buffer_idx], pba_images[1 - buffer_idx], size),
+    )
+
+    block = (block_size, m3, 1)
+    grid = (size // block[0], size, 1)
+    kernelColorAxis(
+        grid,
+        block,
+        (pba_images[1 - buffer_idx], pba_images[buffer_idx], size),
+    )
+
+    block = (blockx, blocky, 1)
+    grid = (size // block[0], size // block[1], 1)
+    kernelMaurerAxis(
+        grid,
+        block,
+        (pba_images[buffer_idx], pba_images[1 - buffer_idx], size),
+    )
+
+    block = (block_size, m3, 1)
+    grid = (size // block[0], size, 1)
+    kernelColorAxis(
+        grid,
+        block,
+        (pba_images[1 - buffer_idx], pba_images[buffer_idx], size),
+    )
+
+    output = pba_images[buffer_idx]
+    if return_distances or return_indices:
+        x, y, z = decode3d(output[:orig_sz, :orig_sy, :orig_sx],
+                           size_max=size_max)
+
+    vals = ()
+    if return_distances:
+        # TODO: custom kernel for more efficient distance computation
+        orig_shape = (orig_sz, orig_sy, orig_sx)
+        z0, y0, x0 = cupy.meshgrid(
+            *(cupy.arange(s, dtype=cupy.int32) for s in orig_shape),
+            indexing='ij',
+            sparse=True
+        )
+        tmp = (x - x0)
+        dist = tmp * tmp
+        tmp = (y - y0)
+        dist += tmp * tmp
+        tmp = (z - z0)
+        dist += tmp * tmp
+        if float64_distances:
+            dist = cupy.sqrt(dist)
+        else:
+            dist = dist.astype(cupy.float32)
+            cupy.sqrt(dist, out=dist)
+        vals = vals + (dist,)
+    if return_indices:
+        indices = cupy.stack((z, y, x), axis=0)
+        vals = vals + (indices,)
+    return vals
diff --git a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h
new file mode 100644
index 000000000..61677c682
--- /dev/null
+++ b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_2d.h
@@ -0,0 +1,451 @@
+// Euclidean Distance Transform
+// 
+// Kernels for the 2D version of the Parallel Banding Algorithm (PBA+). 
+// 
+// MIT license: see 3rdparty/LICENSE.pba+
+// Copyright: (c) 2019 School of Computing, National University of Singapore
+//
+// Modifications by Gregory Lee (2022) (NVIDIA)
+// - add user-definable pixel_int2_t and make_pixel (default: short2) so the
+//   coordinate pair type can be overridden
+// - replace __mul24 operations with standard multiplication operator
+
+
+// START OF DEFINITIONS OVERRIDDEN BY THE PYTHON SCRIPT
+
+// The values included in this header file are those defined in the original
+// PBA+ implementation
+
+// However, the Python code generation can define MARKER, BLOCKSIZE,
+// pixel_int2_t and make_pixel ahead of this code, in which case the defaults
+// below are skipped by the #ifndef guards.
+
+// Sentinel coordinate value: the kernels below compare pixel components
+// against MARKER to detect entries that do not hold a site.
+#ifndef MARKER
+#define MARKER -32768
+#endif
+
+// Side length of the shared-memory tiles (block[BLOCKSIZE][BLOCKSIZE]) used
+// by kernelUpdateVertical and kernelColor.
+#ifndef BLOCKSIZE
+#define BLOCKSIZE 32
+#endif
+
+// Two-component integer type holding one coordinate pair (or a pair of list
+// pointers) together with its constructor.
+#ifndef pixel_int2_t
+#define pixel_int2_t short2
+#define make_pixel(x, y)  make_short2(x, y)
+#endif
+
+// END OF DEFINITIONS OVERRIDDEN BY THE PYTHON SCRIPT
+
+
+// Linear index of element (x, y) in a row-major image with rows of length
+// `size`.
+#define TOID(x, y, size)  ((y) * (size) + (x))
+
+#define LL long long
+// Compares the two cross-multiplied terms built from the point triple
+// (x1, y1), (x2, y2), (x3, y3) and the column position x0; all arithmetic is
+// carried out in 64-bit integers.
+// NOTE(review): presumably the PBA+ test of whether the middle point is
+// dominated by its two neighbours along column x0 -- confirm against the
+// PBA+ reference implementation.
+__device__ bool dominate(LL x1, LL y1, LL x2, LL y2, LL x3, LL y3, LL x0)
+{
+    const LL two_x0 = x0 << 1;
+    const LL dy12 = y2 - y1;
+    const LL dy23 = y3 - y2;
+
+    const LL lhs = dy12 * (y1 + y2) + (x2 - x1) * ((x1 + x2) - two_x0);
+    const LL rhs = dy23 * (y2 + y3) + (x3 - x2) * ((x2 + x3) - two_x0);
+
+    return lhs * dy23 > rhs * dy12;
+}
+#undef LL
+
+
+extern "C"{
+
+// Top-down sweep over one band of one column: every output entry receives
+// the most recent input entry (within the band) whose x component is not
+// MARKER, or (MARKER, MARKER) if none has been seen yet.
+// One thread handles column blockIdx.x * blockDim.x + threadIdx.x of band
+// blockIdx.y, where a band spans `bandSize` consecutive rows.
+__global__ void kernelFloodDown(pixel_int2_t *input, pixel_int2_t *output, int size, int bandSize)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int firstRow = blockIdx.y * bandSize;
+    int idx = TOID(col, firstRow, size);
+
+    pixel_int2_t carried = make_pixel(MARKER, MARKER);
+
+    for (int row = 0; row < bandSize; ++row) {
+        const pixel_int2_t here = input[idx];
+
+        if (here.x != MARKER) {
+            carried = here;
+        }
+
+        output[idx] = carried;
+        idx += size;  // advance to the next row of the same column
+    }
+}
+
+// Bottom-up sweep over one band of one column. Starting at the last row of
+// the band, the carried pixel is replaced whenever the input pixel of the
+// current row is closer in y, and the carried pixel is written to `output`.
+// One thread handles column blockIdx.x * blockDim.x + threadIdx.x of band
+// blockIdx.y.
+__global__ void kernelFloodUp(pixel_int2_t *input, pixel_int2_t *output, int size, int bandSize)
+{
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    // last row of this band
+    int ty = (blockIdx.y+1) * bandSize - 1;
+    int id = TOID(tx, ty, size);
+
+    pixel_int2_t pixel1, pixel2;
+    int dist1, dist2;
+
+    pixel1 = make_pixel(MARKER, MARKER);
+
+    for (int i = 0; i < bandSize; i++, id -= size) {
+        // the row being processed is (ty - i), hence |pixel.y - (ty - i)|
+        dist1 = abs(pixel1.y - ty + i);
+
+        pixel2 = input[id];
+        dist2 = abs(pixel2.y - ty + i);
+
+        if (dist2 < dist1)
+            pixel1 = pixel2;
+
+        output[id] = pixel1;
+    }
+}
+
+// For the first and the last row of each band, looks into the other bands of
+// the same column for a closer (in y) pixel and stores the result in
+// `margin_out`.
+// Layout of `margin_out`: entry (blockIdx.y * size + tx) holds the result for
+// the top row of band blockIdx.y; the results for the bottom rows follow at
+// an offset of size * size / bandSize entries.
+__global__ void kernelPropagateInterband(pixel_int2_t *input, pixel_int2_t *margin_out, int size, int bandSize)
+{
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    // index distance between the same row of two neighbouring bands
+    int inc = bandSize * size;
+    int ny, nid, nDist;
+    pixel_int2_t pixel;
+
+    // Top row, look backward
+    int ty = blockIdx.y * bandSize;
+    int topId = TOID(tx, ty, size);
+    int bottomId = TOID(tx, ty + bandSize - 1, size);
+    int tid = blockIdx.y * size + tx;
+    int bid = tid + (size * size / bandSize);
+
+    pixel = input[topId];
+    int myDist = abs(pixel.y - ty);
+    margin_out[tid] = pixel;
+
+    // visit the bottom rows of the preceding bands, nearest band first, and
+    // stop at the first one that holds a non-MARKER pixel
+    for (nid = bottomId - inc; nid >= 0; nid -= inc) {
+        pixel = input[nid];
+
+        if (pixel.x != MARKER) {
+            nDist = abs(pixel.y - ty);
+
+            if (nDist < myDist)
+                margin_out[tid] = pixel;
+
+            break;
+        }
+    }
+
+    // Last row, look downward
+    ty = ty + bandSize - 1;
+    pixel = input[bottomId];
+    myDist = abs(pixel.y - ty);
+    margin_out[bid] = pixel;
+
+    // visit the top rows of the following bands, nearest band first, and
+    // stop at the first one that holds a non-MARKER pixel
+    for (ny = ty + 1, nid = topId + inc; ny < size; ny += bandSize, nid += inc) {
+        pixel = input[nid];
+
+        if (pixel.x != MARKER) {
+            nDist = abs(pixel.y - ty);
+
+            if (nDist < myDist)
+                margin_out[bid] = pixel;
+
+            break;
+        }
+    }
+}
+
+// For every pixel of a band, selects whichever of the pixel stored in
+// `color`, the band's top margin entry and the band's bottom margin entry is
+// closest in y, swaps its two components and writes it to the transposed
+// position in `output`. The transposition goes through a shared-memory tile
+// of blockDim.x x blockDim.x entries.
+// NOTE(review): the tile indexing assumes blockDim.x <= BLOCKSIZE and that
+// bandSize is a multiple of blockDim.x -- confirm against the launch code.
+__global__ void kernelUpdateVertical(pixel_int2_t *color, pixel_int2_t *margin, pixel_int2_t *output, int size, int bandSize)
+{
+    __shared__ pixel_int2_t block[BLOCKSIZE][BLOCKSIZE];
+
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * bandSize;
+
+    // margin layout: top-row entries of all bands first, then the bottom-row
+    // entries (size / bandSize rows further down)
+    pixel_int2_t top = margin[blockIdx.y * size + tx];
+    pixel_int2_t bottom = margin[(blockIdx.y + size / bandSize) * size + tx];
+    pixel_int2_t pixel;
+
+    int dist, myDist;
+
+    int id = TOID(tx, ty, size);
+
+    // the band is processed in chunks of blockDim.x rows
+    int n_step = bandSize / blockDim.x;
+    for(int step = 0; step < n_step; ++step) {
+        int y_start = blockIdx.y * bandSize + step * blockDim.x;
+        int y_end = y_start + blockDim.x;
+
+        for (ty = y_start; ty < y_end; ++ty, id += size) {
+            pixel = color[id];
+            myDist = abs(pixel.y - ty);
+
+            dist = abs(top.y - ty);
+            if (dist < myDist) { myDist = dist; pixel = top; }
+
+            dist = abs(bottom.y - ty);
+            if (dist < myDist) pixel = bottom;
+
+            // temporary result is stored in block
+            block[threadIdx.x][ty - y_start] = make_pixel(pixel.y, pixel.x);
+        }
+
+        // all threads must have filled their tile row before it is read
+        __syncthreads();
+
+        // block is written to a transposed location in the output
+
+        int tid = TOID(blockIdx.y * bandSize + step * blockDim.x + threadIdx.x, \
+                        blockIdx.x * blockDim.x, size);
+
+        for(int i = 0; i < blockDim.x; ++i, tid += size) {
+            output[tid] = block[i][threadIdx.x];
+        }
+
+        // the tile is reused by the next step, so wait until it was read
+        __syncthreads();
+    }
+}
+
+// Builds, per column and band, a backward-linked stack of the non-MARKER
+// entries of `input`. Entries for which dominate() returns true are popped
+// before the current entry is pushed.
+// Each stack entry written to `stack` holds the x component of the input
+// pixel in .x and the row of the previous stack entry (or -1) in .y.
+// One thread handles column blockIdx.x * blockDim.x + threadIdx.x of band
+// blockIdx.y.
+__global__ void kernelProximatePoints(pixel_int2_t *input, pixel_int2_t *stack, int size, int bandSize)
+{
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * bandSize;
+    int id = TOID(tx, ty, size);
+    // row of the current top of the stack (-1: stack is empty)
+    int lasty = -1;
+    pixel_int2_t last1, last2, current;
+
+    // a negative back pointer marks "no further entry"
+    last1.y = -1; last2.y = -1;
+
+    for (int i = 0; i < bandSize; i++, id += size) {
+        current = input[id];
+
+        if (current.x != MARKER) {
+            // pop entries while the top of the stack is dominated
+            while (last2.y >= 0) {
+                if (!dominate(last1.x, last2.y, last2.x, \
+                    lasty, current.x, current.y, tx))
+                    break;
+
+                lasty = last2.y; last2 = last1;
+
+                if (last1.y >= 0)
+                    last1 = stack[TOID(tx, last1.y, size)];
+            }
+
+            // push the current entry
+            last1 = last2; last2 = make_pixel(current.x, lasty); lasty = current.y;
+
+            stack[id] = last2;
+        }
+    }
+
+    // Store the pointer to the tail at the last pixel of this band
+    if (lasty != ty + bandSize - 1)
+        stack[TOID(tx, ty + bandSize - 1, size)] = make_pixel(MARKER, lasty);
+}
+
+// Walks the backward-linked list of one band of one column from its tail to
+// its head and writes, for every visited entry, a pair holding the row of the
+// previously visited entry in .x (forward pointer, -1 for the tail) and the
+// backward pointer read from `input` in .y.
+// One thread handles column blockIdx.x * blockDim.x + threadIdx.x of band
+// blockIdx.y.
+__global__ void kernelCreateForwardPointers(pixel_int2_t *input, pixel_int2_t *output, int size, int bandSize)
+{
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    // last row of this band
+    int ty = (blockIdx.y+1) * bandSize - 1;
+    int id = TOID(tx, ty, size);
+    int lasty = -1, nexty;
+    pixel_int2_t current;
+
+    // Get the tail pointer
+    current = input[id];
+
+    // a MARKER in .x means the last row only stores the pointer to the tail
+    if (current.x == MARKER)
+        nexty = current.y;
+    else
+        nexty = ty;
+
+    // only rows that are part of the list (ty - i == nexty) are updated
+    for (int i = 0; i < bandSize; i++, id -= size)
+        if (ty - i == nexty) {
+            current = make_pixel(lasty, input[id].y);
+            output[id] = current;
+
+            lasty = nexty;
+            nexty = current.y;
+        }
+
+    // Store the pointer to the head at the first pixel of this band
+    // (id was decremented once past the first row, hence id + size)
+    if (lasty != ty - bandSize + 1)
+        output[id + size] = make_pixel(lasty, MARKER);
+}
+
+// Merges the linked lists of two neighbouring bands (2 * blockIdx.y and
+// 2 * blockIdx.y + 1) of one column into a single list, removing entries of
+// the first list for which dominate() returns true, and updates the head and
+// tail pointers of the merged band.
+// `color` supplies the .x values of the list entries and `link` the
+// forward (.x) / backward (.y) pointers; updated pointers go to `output`.
+// One thread handles column blockIdx.x * blockDim.x + threadIdx.x.
+__global__ void kernelMergeBands(pixel_int2_t *color, pixel_int2_t *link, pixel_int2_t *output, int size, int bandSize)
+{
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int band1 = blockIdx.y * 2;
+    int band2 = band1 + 1;
+    int firsty, lasty;
+    pixel_int2_t last1, last2, current;
+    // last1 and last2: x component store the x coordinate of the site,
+    // y component store the backward pointer
+    // current: y component store the x coordinate of the site,
+    // x component store the forward pointer
+
+    // Get the two last items of the first list
+    lasty = band2 * bandSize - 1;
+    last2 = make_pixel(color[TOID(tx, lasty, size)].x,
+        link[TOID(tx, lasty, size)].y);
+
+    // MARKER in .x: the last row only stores the pointer to the tail
+    if (last2.x == MARKER) {
+        lasty = last2.y;
+
+        if (lasty >= 0)
+            last2 = make_pixel(color[TOID(tx, lasty, size)].x,
+            link[TOID(tx, lasty, size)].y);
+        else
+            last2 = make_pixel(MARKER, MARKER);
+    }
+
+    if (last2.y >= 0) {
+        // Second item at the top of the stack
+        last1 = make_pixel(color[TOID(tx, last2.y, size)].x,
+            link[TOID(tx, last2.y, size)].y);
+    }
+
+    // Get the first item of the second band
+    firsty = band2 * bandSize;
+    current = make_pixel(link[TOID(tx, firsty, size)].x,
+        color[TOID(tx, firsty, size)].x);
+
+    // MARKER in .y: the first row only stores the pointer to the head
+    if (current.y == MARKER) {
+        firsty = current.x;
+
+        if (firsty >= 0)
+            current = make_pixel(link[TOID(tx, firsty, size)].x,
+            color[TOID(tx, firsty, size)].x);
+        else
+            current = make_pixel(MARKER, MARKER);
+    }
+
+    // Count the number of item in the second band that survive so far.
+    // Once it reaches 2, we can stop.
+    int top = 0;
+
+    while (top < 2 && current.y >= 0) {
+        // While there's still something on the left
+        while (last2.y >= 0) {
+
+            if (!dominate(last1.x, last2.y, last2.x, \
+                lasty, current.y, firsty, tx))
+                break;
+
+            lasty = last2.y; last2 = last1;
+            top--;
+
+            if (last1.y >= 0)
+                last1 = make_pixel(color[TOID(tx, last1.y, size)].x,
+                link[TOID(tx, last1.y, size)].y);
+        }
+
+        // Update the current pointer
+        output[TOID(tx, firsty, size)] = make_pixel(current.x, lasty);
+
+        if (lasty >= 0)
+            output[TOID(tx, lasty, size)] = make_pixel(firsty, last2.y);
+
+        last1 = last2; last2 = make_pixel(current.y, lasty); lasty = firsty;
+        firsty = current.x;
+
+        top = max(1, top + 1);
+
+        // Advance the current pointer to the next one
+        if (firsty >= 0)
+            current = make_pixel(link[TOID(tx, firsty, size)].x,
+            color[TOID(tx, firsty, size)].x);
+        else
+            current = make_pixel(MARKER, MARKER);
+    }
+
+    // Update the head and tail pointer.
+    firsty = band1 * bandSize;
+    lasty = band2 * bandSize;
+    current = link[TOID(tx, firsty, size)];
+
+    if (current.y == MARKER && current.x < 0) { // No head?
+        last1 = link[TOID(tx, lasty, size)];
+
+        // take over the head of the second band
+        if (last1.y == MARKER)
+            current.x = last1.x;
+        else
+            current.x = lasty;
+
+        output[TOID(tx, firsty, size)] = current;
+    }
+
+    firsty = band1 * bandSize + bandSize - 1;
+    lasty = band2 * bandSize + bandSize - 1;
+    current = link[TOID(tx, lasty, size)];
+
+    if (current.x == MARKER && current.y < 0) { // No tail?
+        last1 = link[TOID(tx, firsty, size)];
+
+        // take over the tail of the first band
+        if (last1.x == MARKER)
+            current.y = last1.y;
+        else
+            current.y = firsty;
+
+        output[TOID(tx, lasty, size)] = current;
+    }
+}
+
+// Combines, for every pixel, the x component of `color` with the y component
+// (backward pointer) of `link` into a single entry of `output`.
+// One thread handles the pixel in column blockIdx.x * blockDim.x +
+// threadIdx.x of row blockIdx.y.
+__global__ void kernelDoubleToSingleList(pixel_int2_t *color, pixel_int2_t *link, pixel_int2_t *output, int size)
+{
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y;
+    const int idx = TOID(col, row, size);
+
+    // both sources are read before the store
+    const pixel_int2_t site = color[idx];
+    const pixel_int2_t pointers = link[idx];
+
+    output[idx] = make_pixel(site.x, pointers.y);
+}
+
+// Final pass: for every pixel of a column, walks the column's backward-linked
+// list (starting at its tail) to the entry with the smallest squared
+// distance and writes the pair (row of that entry, its x value) to the
+// transposed position in `output`.
+// Thread (threadIdx.x, threadIdx.y) handles column blockIdx.x * blockDim.x +
+// threadIdx.x and every blockDim.y-th row; the rows are processed in chunks
+// of blockDim.x rows that are transposed through a shared-memory tile.
+// NOTE(review): `lasty` read from a tail pointer is used as a row index
+// without a bounds check -- confirm that every column is guaranteed to
+// contain at least one list entry.
+__global__ void kernelColor(pixel_int2_t *input, pixel_int2_t *output, int size)
+{
+    __shared__ pixel_int2_t block[BLOCKSIZE][BLOCKSIZE];
+
+    int col = threadIdx.x;
+    int tid = threadIdx.y;
+    int tx = blockIdx.x * blockDim.x + col;
+    int dx, dy, lasty;
+    unsigned int best, dist;
+    pixel_int2_t last1, last2;
+
+    lasty = size - 1;
+
+    last2 = input[TOID(tx, lasty, size)];
+
+    // MARKER in .x: the last row only stores the pointer to the tail
+    if (last2.x == MARKER) {
+        lasty = last2.y;
+        last2 = input[TOID(tx, lasty, size)];
+    }
+
+    if (last2.y >= 0)
+        last1 = input[TOID(tx, last2.y, size)];
+
+    int y_start, y_end, n_step = size / blockDim.x;
+    for(int step = 0; step < n_step; ++step) {
+        // rows are visited from the bottom of the image towards the top
+        y_start = size - step * blockDim.x - 1;
+        y_end = size - (step + 1) * blockDim.x;
+
+        for (int ty = y_start - tid; ty >= y_end; ty -= blockDim.y) {
+            dx = last2.x - tx; dy = lasty - ty;
+            best = dist = dx * dx + dy * dy;
+
+            // move up the list while the next entry is not farther away
+            while (last2.y >= 0) {
+                dx = last1.x - tx; dy = last2.y - ty;
+                dist = dx * dx + dy * dy;
+
+                if (dist > best)
+                    break;
+
+                best = dist; lasty = last2.y; last2 = last1;
+
+                if (last2.y >= 0)
+                    last1 = input[TOID(tx, last2.y, size)];
+            }
+
+            block[threadIdx.x][ty - y_end] = make_pixel(lasty, last2.x);
+        }
+
+        // the tile must be complete before it is written out
+        __syncthreads();
+
+        // note: transposes back to original shape here
+        if(!threadIdx.y) {
+            int id = TOID(y_end + threadIdx.x, blockIdx.x * blockDim.x, size);
+            for(int i = 0; i < blockDim.x; ++i, id+=size) {
+                output[id] = block[i][threadIdx.x];
+            }
+        }
+
+        // the tile is reused by the next step, so wait until it was read
+        __syncthreads();
+    }
+}
+} // extern C
diff --git a/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h
new file mode 100644
index 000000000..c09f4b51f
--- /dev/null
+++ b/python/cucim/src/cucim/core/operations/morphology/cuda/pba_kernels_3d.h
@@ -0,0 +1,237 @@
+// Euclidean Distance Transform
+// 
+// Kernels for the 3D version of the Parallel Banding Algorithm (PBA+). 
+// 
+// MIT license: see 3rdparty/LICENSE.pba+
+//
+// Modifications by Gregory Lee (2022) (NVIDIA)
+// - allow user-defined ENCODED_INT_TYPE, ENCODE, DECODE
+
+
+// START OF DEFINITIONS OVERRIDDEN BY THE PYTHON SCRIPT
+
+// The values included in this header file are those defined in the original
+// PBA+ implementation
+
+// However, the Python code generation can potentially generate a different
+// ENCODE/DECODE that use 20 bits per coordinates instead of 10 bits per
+// coordinate with ENCODED_INT_TYPE as `long long`.
+
+
+// Default value of the marker used for voxels that are not sites.
+#ifndef MARKER
+#define MARKER     -2147483648
+#endif  // MARKER
+
+// Value returned by GET_Z for voxels that are not sites.
+#ifndef MAX_INT
+#define MAX_INT    2147483647
+#endif
+
+// Side length of the shared-memory tile used by kernelColorAxis.
+#ifndef BLOCKSIZE
+#define BLOCKSIZE  32
+#endif
+
+// Default (32-bit) encoding: x in bits 20-29, y in bits 10-19, z in bits 0-9,
+// flag `a` (read back by NOTSITE) in bit 31 and flag `b` (read back by
+// HASNEXT) in bit 30. The whole group is skipped when ENCODE is already
+// defined.
+#ifndef ENCODE
+
+// Sites     : ENCODE(x, y, z, 0, 0)
+// Not sites : ENCODE(0, 0, 0, 1, 0) or MARKER
+#define ENCODED_INT_TYPE int
+#define ZERO 0
+#define ONE 1
+#define ENCODE(x, y, z, a, b)  (((x) << 20) | ((y) << 10) | (z) | ((a) << 31) | ((b) << 30))
+#define DECODE(value, x, y, z) \
+    x = ((value) >> 20) & 0x3ff; \
+    y = ((value) >> 10) & 0x3ff; \
+    z = (value) & 0x3ff
+
+#define NOTSITE(value)  (((value) >> 31) & 1)
+#define HASNEXT(value)  (((value) >> 30) & 1)
+
+#define GET_X(value)    (((value) >> 20) & 0x3ff)
+#define GET_Y(value)    (((value) >> 10) & 0x3ff)
+#define GET_Z(value)    ((NOTSITE((value))) ? MAX_INT : ((value) & 0x3ff))
+
+#endif // ENCODE
+
+// END OF DEFINITIONS DEFINED IN THE PYTHON SCRIPT
+
+
+#define LL long long
+// Compares the two cross-multiplied terms built from the point triple
+// (x_1, y_1, z_1), (x_2, y_2, z_2), (x_3, y_3, z_3) and the position
+// (x_0, z_0); all arithmetic is carried out in 64-bit integers.
+// NOTE(review): presumably the PBA+ test of whether the middle point is
+// dominated by its two neighbours along the line (x_0, ., z_0) -- confirm
+// against the PBA+ reference implementation.
+__device__ bool dominate(LL x_1, LL y_1, LL z_1, LL x_2, LL y_2, LL z_2, LL x_3, LL y_3, LL z_3, LL x_0, LL z_0)
+{
+    const LL two_x0 = x_0 << 1;
+    const LL two_z0 = z_0 << 1;
+    const LL k_1 = y_2 - y_1;
+    const LL k_2 = y_3 - y_2;
+
+    const LL lhs = (y_1 + y_2) * k_1
+        + ((x_2 - x_1) * (x_1 + x_2 - two_x0) + (z_2 - z_1) * (z_1 + z_2 - two_z0));
+    const LL rhs = (y_2 + y_3) * k_2
+        + ((x_3 - x_2) * (x_2 + x_3 - two_x0) + (z_3 - z_2) * (z_2 + z_3 - two_z0));
+
+    return lhs * k_2 > rhs * k_1;
+}
+#undef LL
+
+// Linear index of voxel (x, y, z) in a cubic volume of side `size` with x as
+// the fastest varying axis.
+#define TOID(x, y, z, size)    ((((z) * (size)) + (y)) * (size) + (x))
+
+
+extern "C"{
+
+// Sweeps every (x, y) line of the volume along z, first downwards and then
+// upwards, so that each voxel of `output` ends up holding whichever site of
+// its line was kept by the z-distance comparison (or a "not a site" value if
+// the line contains no site).
+// One thread handles the line at x = blockIdx.x * blockDim.x + threadIdx.x,
+// y = blockIdx.y * blockDim.y + threadIdx.y.
+__global__ void kernelFloodZ(ENCODED_INT_TYPE *input, ENCODED_INT_TYPE *output, int size)
+{
+
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int ty = blockIdx.y * blockDim.y + threadIdx.y;
+    int tz = 0;
+
+    // index distance between two consecutive z planes
+    int plane = size * size;
+    int id = TOID(tx, ty, tz, size);
+    ENCODED_INT_TYPE pixel1, pixel2;
+
+    // start with a "not a site" value
+    pixel1 = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO);
+
+    // Sweep down
+    for (int i = 0; i < size; i++, id += plane) {
+        pixel2 = input[id];
+
+        if (!NOTSITE(pixel2))
+            pixel1 = pixel2;
+
+        output[id] = pixel1;
+    }
+
+    ENCODED_INT_TYPE dist1, dist2, nz;
+
+    // id points one plane past the end; step back to plane (size - 2)
+    id -= plane + plane;
+
+    // Sweep up
+    for (int i = size - 2; i >= 0; i--, id -= plane) {
+        // GET_Z yields MAX_INT for non-sites, which makes their distance huge
+        nz = GET_Z(pixel1);
+        dist1 = abs(nz - (tz + i));
+
+        pixel2 = output[id];
+        nz = GET_Z(pixel2);
+        dist2 = abs(nz - (tz + i));
+
+        if (dist2 < dist1)
+            pixel1 = pixel2;
+
+        output[id] = pixel1;
+    }
+}
+
+
+// Builds, for every line along y, a backward-linked stack of the sites found
+// in `input`; entries for which dominate() returns true are popped before
+// the current site is pushed. Each stack entry written to `stack` keeps the
+// x and z coordinate of the site, stores the row of the previous stack entry
+// in the y field and has the HASNEXT flag set if a previous entry exists.
+// One thread handles the line at x = blockIdx.x * blockDim.x + threadIdx.x,
+// z = blockIdx.y * blockDim.y + threadIdx.y.
+__global__ void kernelMaurerAxis(ENCODED_INT_TYPE *input, ENCODED_INT_TYPE *stack, int size)
+{
+    int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int tz = blockIdx.y * blockDim.y + threadIdx.y;
+    int ty = 0;
+
+    int id = TOID(tx, ty, tz, size);
+
+    // row of the current top of the stack
+    ENCODED_INT_TYPE lasty = 0;
+    ENCODED_INT_TYPE x1, y1, z1, x2, y2, z2, nx, ny, nz;
+    ENCODED_INT_TYPE p = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO), s1 = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO), s2 = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO);
+    // becomes ONE once the first site was pushed
+    ENCODED_INT_TYPE flag = 0;
+
+    for (ty = 0; ty < size; ++ty, id += size) {
+        p = input[id];
+
+        if (!NOTSITE(p)) {
+
+            // pop entries while the top of the stack is dominated
+            while (HASNEXT(s2)) {
+                DECODE(s1, x1, y1, z1);
+                DECODE(s2, x2, y2, z2);
+                DECODE(p, nx, ny, nz);
+
+                if (!dominate(x1, y2, z1, x2, lasty, z2, nx, ty, nz, tx, tz))
+                    break;
+
+                lasty = y2; s2 = s1; y2 = y1;
+
+                if (HASNEXT(s2))
+                    s1 = stack[TOID(tx, y2, tz, size)];
+            }
+
+            // push the current site
+            DECODE(p, nx, ny, nz);
+            s1 = s2;
+            s2 = ENCODE(nx, lasty, nz, ZERO, flag);
+            y2 = lasty;
+            lasty = ty;
+
+            stack[id] = s2;
+
+            flag = ONE;
+        }
+    }
+
+    // if the last row holds no site, store a pointer to the tail there
+    if (NOTSITE(p))
+        stack[TOID(tx, ty - 1, tz, size)] = ENCODE(ZERO, lasty, ZERO, ONE, flag);
+}
+
+// For every voxel of a line along y, walks the line's backward-linked stack
+// (starting at its tail) to the entry with the smallest squared distance and
+// writes the result, with the x and y coordinate fields swapped, to the
+// position transposed in x and y in `output`.
+// Thread (threadIdx.x, threadIdx.y) handles the line at x = blockIdx.x *
+// blockDim.x + threadIdx.x, z = blockIdx.y and every blockDim.y-th row; the
+// rows are processed in chunks of blockDim.x rows that are transposed
+// through a shared-memory tile.
+// NOTE(review): the tile indexing assumes blockDim.x <= BLOCKSIZE and that
+// size is a multiple of blockDim.x -- confirm against the launch code.
+__global__ void kernelColorAxis(ENCODED_INT_TYPE *input, ENCODED_INT_TYPE *output, int size)
+{
+    __shared__ ENCODED_INT_TYPE block[BLOCKSIZE][BLOCKSIZE];
+
+    int col = threadIdx.x;
+    int tid = threadIdx.y;
+    int tx = blockIdx.x * blockDim.x + col;
+    int tz = blockIdx.y;
+
+    ENCODED_INT_TYPE x1, y1, z1, x2, y2, z2;
+    ENCODED_INT_TYPE last1 = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO), last2 = ENCODE(ZERO,ZERO,ZERO,ONE,ZERO), lasty;
+    long long dx, dy, dz, best, dist;
+
+    lasty = size - 1;
+
+    last2 = input[TOID(tx, lasty, tz, size)];
+    DECODE(last2, x2, y2, z2);
+
+    // a non-site in the last row only stores the pointer to the tail
+    if (NOTSITE(last2)) {
+        lasty = y2;
+        if(HASNEXT(last2)) {
+            last2 = input[TOID(tx, lasty, tz, size)];
+            DECODE(last2, x2, y2, z2);
+        }
+    }
+
+    if (HASNEXT(last2)) {
+        last1 = input[TOID(tx, y2, tz, size)];
+        DECODE(last1, x1, y1, z1);
+    }
+
+    int y_start, y_end, n_step = size / blockDim.x;
+    for(int step = 0; step < n_step; ++step) {
+        // rows are visited from the end of the line towards its start
+        y_start = size - step * blockDim.x - 1;
+        y_end = size - (step + 1) * blockDim.x;
+
+        for (int ty = y_start - tid; ty >= y_end; ty -= blockDim.y) {
+            dx = x2 - tx; dy = lasty - ty; dz = z2 - tz;
+            best = dx * dx + dy * dy + dz * dz;
+
+            // move up the stack while the next entry is not farther away
+            while (HASNEXT(last2)) {
+                dx = x1 - tx; dy = y2 - ty; dz = z1 - tz;
+                dist = dx * dx + dy * dy + dz * dz;
+
+                if(dist > best) break;
+
+                best = dist; lasty = y2; last2 = last1;
+                DECODE(last2, x2, y2, z2);
+
+                if (HASNEXT(last2)) {
+                    last1 = input[TOID(tx, y2, tz, size)];
+                    DECODE(last1, x1, y1, z1);
+                }
+            }
+
+            // x and y fields are swapped to match the transposed output
+            block[threadIdx.x][ty - y_end] = ENCODE(lasty, x2, z2, NOTSITE(last2), ZERO);
+        }
+
+        // the tile must be complete before it is written out
+        __syncthreads();
+
+        if(!threadIdx.y) {
+            int id = TOID(y_end + threadIdx.x, blockIdx.x * blockDim.x, tz, size);
+            for(int i = 0; i < blockDim.x; i++, id+=size) {
+                output[id] = block[i][threadIdx.x];
+            }
+        }
+
+        // the tile is reused by the next step, so wait until it was read
+        __syncthreads();
+    }
+}
+
+
+} // extern C
diff --git a/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py
new file mode 100644
index 000000000..ab46ae1fb
--- /dev/null
+++ b/python/cucim/src/cucim/core/operations/morphology/tests/test_distance_transform.py
@@ -0,0 +1,145 @@
+from copy import copy
+
+import cupy as cp
+import numpy as np
+import pytest
+import scipy.ndimage as ndi_cpu
+
+from cucim.core.operations.morphology import distance_transform_edt
+
+
+def binary_image(shape, pct_true=50):
+    """Seeded random mask; True where a draw from [0, 100) is >= pct_true."""
+    gen = cp.random.default_rng(123)
+    return gen.integers(0, 100, size=shape, dtype=cp.uint8) >= pct_true
+
+
+def assert_percentile_equal(arr1, arr2, pct=95):
+    """Assert that fewer than (100 - pct)% of the entries differ."""
+    host1, host2 = cp.asnumpy(arr1), cp.asnumpy(arr2)
+    differing_fraction = np.sum(host1 != host2) / host1.size
+    # strict comparison: reaching the allowed fraction exactly is a failure
+    allowed_fraction = (100 - pct) / 100
+    assert differing_fraction < allowed_fraction
+
+
+@pytest.mark.parametrize('return_indices', [False, True])
+@pytest.mark.parametrize('return_distances', [False, True])
+@pytest.mark.parametrize(
+    'shape, sampling',
+    [
+        ((256, 128), None),
+        ((384, 256), (1.5, 1.5)),
+        ((14, 32, 50), None),
+        ((50, 32, 24), (2, 2, 2)),
+    ]
+)
+@pytest.mark.parametrize('density', [5, 50, 95])
+@pytest.mark.parametrize('block_params', [None, (1, 1, 1)])
+def test_distance_transform_edt(
+    shape, sampling, return_distances, return_indices, density, block_params
+):
+    """Validate distances and/or indices against scipy.ndimage."""
+    # nothing is compared when neither output is requested
+    if not return_indices and not return_distances:
+        return
+
+    common = {
+        'sampling': sampling,
+        'return_distances': return_distances,
+        'return_indices': return_indices,
+    }
+    mask = binary_image(shape, pct_true=density)
+    result = distance_transform_edt(mask, block_params=block_params, **common)
+    reference = ndi_cpu.distance_transform_edt(cp.asnumpy(mask), **common)
+
+    if return_distances and return_indices:
+        assert len(result) == 2
+        cp.testing.assert_allclose(result[0], reference[0])
+        # Ties between equidistant points may be resolved differently, so
+        # only 95% of the index entries are required to agree.
+        assert_percentile_equal(result[1], reference[1], pct=95)
+    elif return_distances:
+        cp.testing.assert_allclose(result, reference)
+    else:
+        assert_percentile_equal(result, reference, pct=95)
+
+
+@pytest.mark.parametrize('return_indices', [False, True])
+@pytest.mark.parametrize('return_distances', [False, True])
+@pytest.mark.parametrize(
+    'shape, sampling',
+    [
+        ((384, 256), (1, 3)),
+        ((50, 32, 24), (1, 2, 4)),
+    ]
+)
+@pytest.mark.parametrize('density', [5, 50, 95])
+def test_distance_transform_edt_nonuniform_sampling(
+    shape, sampling, return_distances, return_indices, density
+):
+    """Sampling that differs between axes must raise NotImplementedError."""
+    if not (return_distances or return_indices):
+        return  # no output requested, nothing to exercise
+    mask = binary_image(shape, pct_true=density)
+    cucim_kwargs = copy(
+        dict(
+            sampling=sampling,
+            return_distances=return_distances,
+            return_indices=return_indices,
+        )
+    )
+    is_uniform = sampling is None or len(np.unique(sampling)) == 1
+    if not is_uniform:
+        with pytest.raises(NotImplementedError):
+            distance_transform_edt(mask, **cucim_kwargs)
+
+
+@pytest.mark.parametrize('value', [0, 1, 3])
+@pytest.mark.parametrize('ndim', [2, 3])
+def test_distance_transform_edt_uniform_valued(value, ndim):
+    """Constant-valued image with the centre pixel zeroed matches scipy."""
+    side = 48
+    img = cp.full((side,) * ndim, value, dtype=cp.uint8)
+    # guarantee at least one background (zero) pixel, at the centre index
+    img[(slice(side // 2, side // 2 + 1),) * ndim] = 0
+    reference = ndi_cpu.distance_transform_edt(cp.asnumpy(img))
+    cp.testing.assert_allclose(distance_transform_edt(img), reference)
+
+
+@pytest.mark.parametrize('sx', list(range(16)))
+@pytest.mark.parametrize('sy', list(range(16)))
+def test_distance_transform_edt_2d_aniso(sx, sy):
+    """Default block_params must cope with 2D sides of 128 to 143 pixels."""
+    rows, cols = 128 + sy, 128 + sx
+    img = binary_image((rows, cols), pct_true=80)
+    reference = ndi_cpu.distance_transform_edt(cp.asnumpy(img))
+    # no block_params passed, so the default selection is what is exercised
+    cp.testing.assert_allclose(distance_transform_edt(img), reference)
+
+
+@pytest.mark.parametrize('sx', list(range(4)))
+@pytest.mark.parametrize('sy', list(range(4)))
+@pytest.mark.parametrize('sz', list(range(4)))
+def test_distance_transform_edt_3d_aniso(sx, sy, sz):
+    """Default block_params must cope with unequal 3D axis lengths."""
+    extents = (16 + sz, 32 + sy, 48 + sx)
+    img = binary_image(extents, pct_true=80)
+    reference = ndi_cpu.distance_transform_edt(cp.asnumpy(img))
+    # no block_params passed, so the default selection is what is exercised
+    cp.testing.assert_allclose(distance_transform_edt(img), reference)
+
+
+@pytest.mark.parametrize('ndim', [1, 4, 5])
+def test_distance_transform_edt_unsupported_ndim(ndim):
+    bad_input = cp.zeros((8,) * ndim)  # 1D, 4D or 5D array
+    pytest.raises(NotImplementedError, distance_transform_edt, bad_input)
+
+
+@pytest.mark.skip(reason="excessive memory requirement")
+def test_distance_transform_edt_3d_int64():
+    """Smoke test on a 1280 x 1280 x 1280 volume (skipped by default)."""
+    # Only checks that the call completes; the result is not validated
+    # against scipy.ndimage because that would take too long to run.
+    side = 1280
+    distance_transform_edt(binary_image((side,) * 3, pct_true=80))
diff --git a/python/cucim/src/cucim/skimage/_shared/_gradient.py b/python/cucim/src/cucim/skimage/_shared/_gradient.py
new file mode 100644
index 000000000..2b0b67334
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/_shared/_gradient.py
@@ -0,0 +1,121 @@
+"""
+Simplified version of cupy.gradient
+
+This version doesn't support non-unit spacing or 2nd order edges.
+
+Importantly, this version does not promote all integer dtypes to float64, but
+instead will promote 8 and 16-bit integer types to float32.
+"""
+import cupy
+
+from cucim.skimage._shared.utils import _supported_float_type
+
+
+def gradient(f, axis=None, output_as_array=False):
+    """Return the gradient of an N-dimensional array.
+
+    The gradient is computed using second order accurate central differences
+    in the interior points and first order accurate one-sided (forward or
+    backward) differences at the boundaries, assuming unit sample spacing.
+    The returned gradient hence has the same shape as the input array.
+
+    Args:
+        f (cupy.ndarray): An N-dimensional array containing samples of a scalar
+            function.
+        axis (None or int or tuple of ints, optional): The gradient is
+            calculated only along the given axis or axes. The default
+            (axis = None) is to calculate the gradient for all the axes of the
+            input array. axis may be negative, in which case it counts from the
+            last to the first axis.
+        output_as_array (bool, optional): If True, write the results into one
+            array of shape ``(f.ndim,) + f.shape`` whose entry ``ax`` holds the
+            derivative along axis ``ax`` (other entries stay uninitialized).
+
+    Returns:
+        gradient (cupy.ndarray or list of cupy.ndarray): The derivatives of f
+        along each requested axis, each with the same shape as f: a list by
+        default, a stacked array if ``output_as_array``, or a single array
+        if only one axis was requested.
+
+    Raises:
+        ValueError: If an axis is out of range or shorter than 2 elements.
+    """
+    ndim = f.ndim  # number of dimensions
+    if axis is None:
+        axes = tuple(range(ndim))
+    else:
+        if cupy.isscalar(axis):
+            axis = (axis,)
+        for ax in axis:
+            # valid axes are -ndim, ..., ndim - 1
+            if ax < -ndim or ax >= ndim:
+                raise ValueError(f"invalid axis: {ax}")
+        axes = tuple(ax + ndim if ax < 0 else ax for ax in axis)
+    len_axes = len(axes)
+
+    # create slice objects --- initially all are [:, :, ..., :]
+    slice1 = [slice(None)] * ndim
+    slice2 = [slice(None)] * ndim
+    slice3 = [slice(None)] * ndim
+    slice4 = [slice(None)] * ndim
+
+    otype = f.dtype
+    if not cupy.issubdtype(otype, cupy.inexact):
+        # All other types convert to floating point.
+        float_dtype = _supported_float_type(otype)
+        if cupy.issubdtype(otype, cupy.integer):
+            f = f.astype(float_dtype)
+        otype = float_dtype
+
+    if output_as_array:
+        out = cupy.empty((ndim,) + f.shape, dtype=otype)
+        outvals = out
+    else:
+        outvals = []
+
+    for ax in axes:
+        if f.shape[ax] < 2:
+            raise ValueError(
+                "Shape of array too small to calculate a numerical gradient, "
+                "at least 2 elements are required."
+            )
+        # result allocation
+        if not output_as_array:
+            out = cupy.empty_like(f, dtype=otype)
+
+        # Numerical differentiation: 2nd order interior
+        slice1[ax] = slice(1, -1)
+        slice2[ax] = slice(None, -2)
+        slice4[ax] = slice(2, None)
+
+        out_sl = (ax,) + tuple(slice1) if output_as_array else tuple(slice1)
+        out[out_sl] = (f[tuple(slice4)] - f[tuple(slice2)]) / 2.0
+
+        # Numerical differentiation: 1st order edges
+        slice1[ax] = 0
+        slice2[ax] = 1
+        slice3[ax] = 0
+        # 1D equivalent -- out[0] = (f[1] - f[0]) / (x[1] - x[0])
+        out_sl = (ax,) + tuple(slice1) if output_as_array else tuple(slice1)
+        out[out_sl] = f[tuple(slice2)] - f[tuple(slice3)]
+
+        slice1[ax] = -1
+        slice2[ax] = -1
+        slice3[ax] = -2
+        # 1D equivalent -- out[-1] = (f[-1] - f[-2]) / (x[-1] - x[-2])
+        out_sl = (ax,) + tuple(slice1) if output_as_array else tuple(slice1)
+        out[out_sl] = f[tuple(slice2)] - f[tuple(slice3)]
+        if not output_as_array:
+            outvals.append(out)
+
+        # reset the slice object in this dimension to ":"
+        slice1[ax] = slice(None)
+        slice2[ax] = slice(None)
+        slice3[ax] = slice(None)
+        slice4[ax] = slice(None)
+
+    if len_axes == 1:
+        # a single requested axis yields the bare derivative; in array mode
+        # it is stored at index axes[0], which is not necessarily 0
+        return out[axes[0]] if output_as_array else outvals[0]
+    return outvals
diff --git a/python/cucim/src/cucim/skimage/_shared/filters.py b/python/cucim/src/cucim/skimage/_shared/filters.py
index 2f68221b4..56cee8000 100644
--- a/python/cucim/src/cucim/skimage/_shared/filters.py
+++ b/python/cucim/src/cucim/skimage/_shared/filters.py
@@ -7,7 +7,8 @@
 from collections.abc import Iterable
 
 import cupy as cp
-from cupyx.scipy import ndimage as ndi
+import cucim.skimage._vendored.ndimage as ndi
+
 
 from .._shared import utils
 from .._shared.utils import _supported_float_type, convert_to_float
diff --git a/python/cucim/src/cucim/skimage/_shared/tests/test_utils.py b/python/cucim/src/cucim/skimage/_shared/tests/test_utils.py
index f7135a0e3..c7d817b84 100644
--- a/python/cucim/src/cucim/skimage/_shared/tests/test_utils.py
+++ b/python/cucim/src/cucim/skimage/_shared/tests/test_utils.py
@@ -66,13 +66,13 @@ def bar(arg0, arg1=0, arg2=1):
 
 def test_deprecate_kwarg():
 
-    @deprecate_kwarg({'old_arg1': 'new_arg1'}, '0.19')
+    @deprecate_kwarg({'old_arg1': 'new_arg1'}, '22.02.00')
     def foo(arg0, new_arg1=1, arg2=None):
         """Expected docstring"""
         return arg0, new_arg1, arg2
 
     @deprecate_kwarg({'old_arg1': 'new_arg1'},
-                     deprecated_version='0.19',
+                     deprecated_version='22.02.00',
                      warning_msg="Custom warning message")
     def bar(arg0, new_arg1=1, arg2=None):
         """Expected docstring"""
@@ -116,7 +116,7 @@ def bar(arg0, new_arg1=1, arg2=None):
     old_arg1 : DEPRECATED
         Deprecated in favor of `new_arg1`.
 
-        .. deprecated:: 0.19
+        .. deprecated:: 22.02.00
 """
 
     assert len(recorded) == 0
diff --git a/python/cucim/src/cucim/skimage/_shared/utils.py b/python/cucim/src/cucim/skimage/_shared/utils.py
index 6798628c7..1e014ef3a 100644
--- a/python/cucim/src/cucim/skimage/_shared/utils.py
+++ b/python/cucim/src/cucim/skimage/_shared/utils.py
@@ -95,10 +95,11 @@ def __call__(self, func):
         parameters = inspect.signature(func).parameters
         arg_idx = list(parameters.keys()).index(self.arg_name)
         warning_msg = (
-            f"{self.arg_name} argument is deprecated and will be removed "
-            f"in version {self.changed_version}. To avoid this warning, "
-            f"please do not use the {self.arg_name} argument. Please "
-            f"see {func.__name__} documentation for more details.")
+            f"{self.arg_name} argument is deprecated in upstream scikit-image "
+            f"and will be removed in cuCIM {self.changed_version}. To avoid "
+            f"this warning, please do not use the {self.arg_name} argument. "
+            f"Please see {func.__name__} documentation for more details."
+        )
 
         if self.help_msg is not None:
             warning_msg += f" {self.help_msg}"
@@ -206,7 +207,7 @@ def __init__(self, kwarg_mapping, deprecated_version, warning_msg=None,
             self.warning_msg = ("`{old_arg}` is a deprecated argument name "
                                 "for `{func_name}`. ")
             if removed_version is not None:
-                self.warning_msg += (f'It will be removed in '
+                self.warning_msg += (f'It will be removed in cuCIM '
                                      f'version {removed_version}.')
             self.warning_msg += "Please use `{new_arg}` instead."
         else:
@@ -248,10 +249,11 @@ class deprecate_multichannel_kwarg(deprecate_kwarg):
 
     """
 
-    def __init__(self, removed_version='1.0', multichannel_position=None):
+    def __init__(self, removed_version='2023.02.00',
+                 multichannel_position=None):
         super().__init__(
             kwarg_mapping={'multichannel': 'channel_axis'},
-            deprecated_version='0.19',
+            deprecated_version='22.02.00',
             warning_msg=None,
             removed_version=removed_version)
         self.position = multichannel_position
@@ -293,7 +295,7 @@ def fixed_func(*args, **kwargs):
 
         if func.__doc__ is not None:
             newdoc = docstring_add_deprecated(
-                func, {'multichannel': 'channel_axis'}, '0.19')
+                func, {'multichannel': 'channel_axis'}, '22.02.00')
             fixed_func.__doc__ = newdoc
         return fixed_func
 
diff --git a/python/cucim/src/cucim/skimage/_vendored/_internal.py b/python/cucim/src/cucim/skimage/_vendored/_internal.py
index b6d163a36..196486d94 100644
--- a/python/cucim/src/cucim/skimage/_vendored/_internal.py
+++ b/python/cucim/src/cucim/skimage/_vendored/_internal.py
@@ -1,3 +1,7 @@
+import math
+from functools import reduce
+from operator import mul
+
 import cupy
 import numpy
 
@@ -61,3 +65,11 @@ def _normalize_axis_indices(axes, ndim):  # NOQA
             res.append(axis)
 
         return tuple(sorted(res))
+
+
+try:
+    prod = math.prod
+except AttributeError:  # no math.prod: use a reduce-based fallback
+
+    def prod(iterable, *, start=1):
+        return reduce(mul, iterable, start)
diff --git a/python/cucim/src/cucim/skimage/_vendored/_ndimage_filters.py b/python/cucim/src/cucim/skimage/_vendored/_ndimage_filters.py
index 1ce475371..e4dc1311d 100644
--- a/python/cucim/src/cucim/skimage/_vendored/_ndimage_filters.py
+++ b/python/cucim/src/cucim/skimage/_vendored/_ndimage_filters.py
@@ -1,8 +1,224 @@
 """A vendored subset of cupyx.scipy.ndimage._filters"""
+import warnings
 
 import cupy
+import numpy
 
+from cucim.skimage._vendored import _internal as internal
 from cucim.skimage._vendored import _ndimage_filters_core as _filters_core
+from cucim.skimage._vendored import _ndimage_util as _util
+
+
+def correlate(input, weights, output=None, mode='reflect', cval=0.0, origin=0):
+    """Multi-dimensional correlate.
+
+    The array is correlated with the given kernel. Complex-valued weights
+    are conjugated before the correlation is evaluated.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        weights (cupy.ndarray): Array of weights, same number of dimensions as
+            input
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        origin (scalar or tuple of scalar): The origin parameter controls the
+            placement of the filter, relative to the center of the current
+            element of the input. Default of 0 is equivalent to
+            ``(0,)*input.ndim``.
+
+    Returns:
+        cupy.ndarray: The result of correlate.
+
+    .. seealso:: :func:`scipy.ndimage.correlate`
+
+    .. note::
+        For integral output (given or inferred from integral input), rounding
+        of floating-point intermediate results can cause mismatches to SciPy.
+    """
+    return _correlate_or_convolve(input, weights, output, mode, cval, origin)
+
+
+def convolve(input, weights, output=None, mode='reflect', cval=0.0, origin=0):
+    """Multi-dimensional convolution.
+
+    The array is convolved with the given kernel, i.e. correlated with the
+    kernel reversed along every axis.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        weights (cupy.ndarray): Array of weights, same number of dimensions as
+            input
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        origin (scalar or tuple of scalar): The origin parameter controls the
+            placement of the filter, relative to the center of the current
+            element of the input. Default of 0 is equivalent to
+            ``(0,)*input.ndim``.
+
+    Returns:
+        cupy.ndarray: The result of convolution.
+
+    .. seealso:: :func:`scipy.ndimage.convolve`
+
+    .. note::
+        For integral output (given or inferred from integral input), rounding
+        of floating-point intermediate results can cause mismatches to SciPy.
+    """
+    return _correlate_or_convolve(input, weights, output, mode, cval, origin,
+                                  True)
+
+
+def correlate1d(input, weights, axis=-1, output=None, mode="reflect", cval=0.0,
+                origin=0, *, algorithm=None):
+    """One-dimensional correlate.
+
+    The array is correlated with the given kernel.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        weights (cupy.ndarray): One-dimensional array of weights
+        axis (int): The axis of input along which to calculate. Default is -1.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is same dtype as the input.
+        mode (str): Border handling: ``'reflect'`` (default), ``'constant'``,
+            ``'nearest'``, ``'mirror'`` or ``'wrap'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        origin (int): Placement of the filter relative to the center of the
+            current element of the input. Default is ``0``.
+        algorithm ({'shared_memory', 'elementwise', None}): Implementation to
+            use. None (the default) selects ``'shared_memory'`` for 2D input
+            with at most 256 weights and ``'elementwise'`` otherwise.
+
+    Returns:
+        cupy.ndarray: The result of the 1D correlation.
+
+    .. seealso:: :func:`scipy.ndimage.correlate1d`
+
+    .. note::
+        For integral output (given or inferred from integral input), rounding
+        of floating-point intermediate results can cause mismatches to SciPy.
+    """
+    return _correlate_or_convolve1d(
+        input, weights, axis, output, mode, cval, origin, False, algorithm
+    )
+
+
+def convolve1d(input, weights, axis=-1, output=None, mode="reflect", cval=0.0,
+               origin=0, *, algorithm=None):
+    """One-dimensional convolution.
+
+    The array is convolved with the given kernel.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        weights (cupy.ndarray): One-dimensional array of weights
+        axis (int): The axis of input along which to calculate. Default is -1.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is same dtype as the input.
+        mode (str): Border handling: ``'reflect'`` (default), ``'constant'``,
+            ``'nearest'``, ``'mirror'`` or ``'wrap'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        origin (int): Placement of the filter relative to the center of the
+            current element of the input. Default is ``0``.
+        algorithm ({'shared_memory', 'elementwise', None}): Implementation to
+            use. None (the default) selects ``'shared_memory'`` for 2D input
+            with at most 256 weights and ``'elementwise'`` otherwise.
+    Returns:
+        cupy.ndarray: The result of the 1D convolution.
+
+    .. seealso:: :func:`scipy.ndimage.convolve1d`
+
+    .. note::
+        For integral output (given or inferred from integral input), rounding
+        of floating-point intermediate results can cause mismatches to SciPy.
+    """
+    return _correlate_or_convolve1d(
+        input, weights, axis, output, mode, cval, origin, True, algorithm
+    )
+
+
+def _correlate_or_convolve(input, weights, output, mode, cval, origin,
+                           convolution=False):
+    """Shared n-D implementation behind ``correlate`` and ``convolve``."""
+    origins, index_type = _filters_core._check_nd_args(
+        input, weights, mode, origin)
+    # empty kernel: the result is zeros shaped like the input
+    if weights.size == 0:
+        return cupy.zeros_like(input)
+
+    _util._check_cval(mode, cval, _util._is_integer_output(output, input))
+
+    if convolution:
+        # reverse the kernel on every axis and mirror the origins to match
+        weights = weights[(slice(None, None, -1),) * weights.ndim]
+        flipped = list(origins)
+        for dim, extent in enumerate(weights.shape):
+            flipped[dim] = -flipped[dim] - (1 if extent % 2 == 0 else 0)
+        origins = tuple(flipped)
+    elif weights.dtype.kind == "c":
+        # numpy.correlate conjugates weights rather than input.
+        weights = weights.conj()
+    weights_dtype = _util._get_weights_dtype(input, weights, use_cucim_casting=True)  # noqa
+    offsets = _filters_core._origins_to_offsets(origins, weights.shape)
+    kernel = _get_correlate_kernel(mode, weights.shape, index_type,
+                                   offsets, cval)
+    return _filters_core._call_kernel(kernel, input, weights, output,
+                                      weights_dtype=weights_dtype)
+
+
+def _correlate_or_convolve1d(input, weights, axis, output, mode, cval, origin,
+                             convolution=False, algorithm=None):
+    """Dispatch a 1D correlation/convolution to one of two implementations.
+
+    The fast shared-memory kernel is used when possible, otherwise the
+    vendored elementwise ``_correlate_or_convolve`` is the fallback.
+    """
+    if algorithm is None:
+        # default heuristic: shared memory only for 2D input, <= 256 weights
+        use_shared = input.ndim == 2 and weights.size <= 256
+        algorithm = 'shared_memory' if use_shared else 'elementwise'
+    elif algorithm not in ['shared_memory', 'elementwise']:
+        raise ValueError(
+            "algorithm must be 'shared_memory', 'elementwise' or None"
+        )
+    if mode == 'wrap':
+        mode = 'grid-wrap'
+    if algorithm == 'shared_memory':
+        from cucim.skimage.filters._separable_filtering import (
+            ResourceLimitError, _shmem_convolve1d)
+        if input.ndim not in [2, 3]:
+            raise NotImplementedError(
+                f"shared_memory not implemented for ndim={input.ndim}"
+            )
+        try:
+            return _shmem_convolve1d(input, weights, axis=axis, output=output,
+                                     mode=mode, cval=cval, origin=origin,
+                                     convolution=convolution)
+        except ResourceLimitError:
+            # fallback to elementwise if inadequate shared memory available
+            warnings.warn(
+                "Inadequate resources for algorithm='shared_memory': "
+                "falling back to the elementwise implementation"
+            )
+    # reached for algorithm='elementwise' and after a shared-memory fallback
+    weights, origins = _filters_core._convert_1d_args(
+        input.ndim, weights, origin, axis
+    )
+    return _correlate_or_convolve(
+        input, weights, output, mode, cval, origins, convolution
+    )
 
 
 @cupy.memoize(for_each_device=True)
@@ -13,3 +229,949 @@ def _get_correlate_kernel(mode, w_shape, int_type, offsets, cval):
         'sum += cast<W>({value}) * wval;',
         'y = cast<Y>(sum);',
         mode, w_shape, int_type, offsets, cval, ctype='W')
+
+
+def _run_1d_correlates(input, params, get_weights, output, mode, cval,
+                       origin=0, **filter_kwargs):
+    """
+    Variant of _run_1d_filters that always filters with correlate1d. Each
+    distinct value in params is passed once to the get_weights callable,
+    which must return the 1D weights array to give to correlate1d; duplicate
+    param values reuse the weights from the first call. A weights value of
+    None results in a None filter for that entry.
+    """
+    cache = {}
+    for p in params:
+        if p not in cache:
+            cache[p] = get_weights(p)
+    weights = [cache[p] for p in params]
+    filters = [None if w is None else correlate1d for w in weights]
+    return _filters_core._run_1d_filters(
+        filters, input, weights, output, mode, cval, origin, **filter_kwargs)
+
+
+def uniform_filter1d(input, size, axis=-1, output=None, mode="reflect",
+                     cval=0.0, origin=0, *, algorithm=None):
+    """One-dimensional uniform filter along the given axis.
+
+    The lines of the array along the given axis are filtered with a uniform
+    filter of the given size.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        size (int): Length of the uniform filter.
+        axis (int): The axis of input along which to calculate. Default is -1.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): Border handling mode (``'reflect'``, ``'constant'``,
+            ``'nearest'``, ``'mirror'``, ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        origin (int): The origin parameter controls the placement of the
+            filter, relative to the center of the current element of the
+            input. Default is ``0``.
+        algorithm (str, optional): Forwarded unchanged to ``correlate1d``.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.uniform_filter1d`
+
+    .. note::
+        When the output data type is integral (or when no output is provided
+        and input is integral) the results may not perfectly match the results
+        from SciPy due to floating-point rounding of intermediate results.
+    """
+    weights_dtype = cupy.promote_types(input.dtype, cupy.float32)
+    weights = cupy.full(size, 1 / size, dtype=weights_dtype)
+    return correlate1d(input, weights, axis, output, mode, cval,
+                       origin, algorithm=algorithm)
+
+
+def uniform_filter(input, size=3, output=None, mode="reflect", cval=0.0,
+                   origin=0, *, algorithm=None):
+    """Multi-dimensional uniform filter.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        size (int or sequence of int): Lengths of the uniform filter for each
+            dimension. A single value applies to all axes.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): Border handling mode (``'reflect'``, ``'constant'``,
+            ``'nearest'``, ``'mirror'``, ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        origin (int or sequence of int): The origin parameter controls the
+            placement of the filter, relative to the center of the current
+            element of the input. Default of ``0`` is equivalent to
+            ``(0,)*input.ndim``.
+        algorithm (str, optional): Forwarded to the 1D correlation of each axis.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.uniform_filter`
+
+    .. note::
+        When the output data type is integral (or when no output is provided
+        and input is integral) the results may not perfectly match the results
+        from SciPy due to floating-point rounding of intermediate results.
+    """
+    sizes = _util._fix_sequence_arg(size, input.ndim, 'size', int)
+    weights_dtype = cupy.promote_types(input.dtype, cupy.float32)
+
+    def get(size):
+        return None if size <= 1 else cupy.full(size, 1 / size, dtype=weights_dtype)  # noqa
+
+    return _run_1d_correlates(
+        input, sizes, get, output, mode, cval, origin, algorithm=algorithm
+    )
+
+
+def gaussian_filter1d(input, sigma, axis=-1, order=0, output=None,
+                      mode="reflect", cval=0.0, truncate=4.0, *,
+                      algorithm=None):
+    """One-dimensional Gaussian filter along the given axis.
+
+    The lines of the array along the given axis are filtered with a Gaussian
+    filter of the given standard deviation.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        sigma (scalar): Standard deviation for Gaussian kernel.
+        axis (int): The axis of input along which to calculate. Default is -1.
+        order (int): An order of ``0``, the default, corresponds to convolution
+            with a Gaussian kernel. A positive order corresponds to convolution
+            with that derivative of a Gaussian.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): Border handling mode (``'reflect'``, ``'constant'``,
+            ``'nearest'``, ``'mirror'``, ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        truncate (float): Truncate the filter at this many standard deviations.
+            Default is ``4.0``.
+        algorithm (str, optional): Forwarded unchanged to ``correlate1d``.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.gaussian_filter1d`
+
+    .. note::
+        When the output data type is integral (or when no output is provided
+        and input is integral) the results may not perfectly match the results
+        from SciPy due to floating-point rounding of intermediate results.
+    """
+    radius = int(float(truncate) * float(sigma) + 0.5)
+    weights_dtype = cupy.promote_types(input.dtype, cupy.float32)
+    weights = _gaussian_kernel1d(sigma, int(order), radius, weights_dtype)
+    return correlate1d(
+        input, weights, axis, output, mode, cval, algorithm=algorithm
+    )
+
+
+def gaussian_filter(input, sigma, order=0, output=None, mode="reflect",
+                    cval=0.0, truncate=4.0, *, algorithm=None):
+    """Multi-dimensional Gaussian filter.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        sigma (scalar or sequence of scalar): Standard deviations for each axis
+            of Gaussian kernel. A single value applies to all axes.
+        order (int or sequence of int): An order of ``0``, the default,
+            corresponds to convolution with a Gaussian kernel. A positive order
+            corresponds to convolution with that derivative of a Gaussian. A
+            single value applies to all axes.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): Border handling mode (``'reflect'``, ``'constant'``,
+            ``'nearest'``, ``'mirror'``, ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        truncate (float): Truncate the filter at this many standard deviations.
+            Default is ``4.0``.
+        algorithm (str, optional): Forwarded to the 1D correlation of each axis.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.gaussian_filter`
+
+    .. note::
+        When the output data type is integral (or when no output is provided
+        and input is integral) the results may not perfectly match the results
+        from SciPy due to floating-point rounding of intermediate results.
+    """
+    sigmas = _util._fix_sequence_arg(sigma, input.ndim, 'sigma', float)
+    orders = _util._fix_sequence_arg(order, input.ndim, 'order', int)
+    truncate = float(truncate)
+    weights_dtype = cupy.promote_types(input.dtype, cupy.float32)
+
+    def get(param, dtype=weights_dtype):
+        sigma, order = param
+        radius = int(truncate * float(sigma) + 0.5)
+        if radius <= 0:
+            return None  # None weights: no filter is used for this axis
+        return _gaussian_kernel1d(sigma, order, radius, dtype)
+
+    return _run_1d_correlates(input, list(zip(sigmas, orders)), get, output,
+                              mode, cval, 0, algorithm=algorithm)
+
+
+def _gaussian_kernel1d(sigma, order, radius, dtype=cupy.float64):
+    """
+    Computes a 1-D Gaussian correlation kernel (or its `order`-th derivative).
+    """
+    if order < 0:
+        raise ValueError('order must be non-negative')
+    sigma2 = sigma * sigma
+    x = numpy.arange(-radius, radius + 1)
+    phi_x = numpy.exp(-0.5 / sigma2 * x ** 2)
+    phi_x /= phi_x.sum()
+
+    if order == 0:
+        return cupy.asarray(phi_x, dtype=dtype)
+
+    # f(x) = q(x) * phi(x) = q(x) * exp(p(x))
+    # f'(x) = (q'(x) + q(x) * p'(x)) * phi(x)
+    # p'(x) = -1 / sigma ** 2
+    # Implement q'(x) + q(x) * p'(x) as a matrix operator and apply to the
+    # coefficients of q(x)
+    exponent_range = numpy.arange(order + 1)
+    q = numpy.zeros(order + 1)
+    q[0] = 1
+    D = numpy.diag(exponent_range[1:], 1)  # D @ q(x) = q'(x)
+    P = numpy.diag(numpy.ones(order) / -sigma2, -1)  # P @ q(x) = q(x) * p'(x)
+    Q_deriv = D + P
+    for _ in range(order):
+        q = Q_deriv.dot(q)
+    q = (x[:, None] ** exponent_range).dot(q)
+    return cupy.asarray((q * phi_x)[::-1], order='C', dtype=dtype)
+
+
+def prewitt(input, axis=-1, output=None, mode="reflect", cval=0.0, *,
+            algorithm=None):
+    """Compute a Prewitt filter along the given axis.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        axis (int): The axis of input along which to calculate. Default is -1.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): Border handling mode (``'reflect'``, ``'constant'``,
+            ``'nearest'``, ``'mirror'``, ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        algorithm (str, optional): Forwarded to the 1D correlation of each axis.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.prewitt`
+
+    .. note::
+        When the output data type is integral (or when no output is provided
+        and input is integral) the results may not perfectly match the results
+        from SciPy due to floating-point rounding of intermediate results.
+    """
+    weights_dtype = cupy.promote_types(input.dtype, cupy.float32)
+    smooth = cupy.ones(3, dtype=weights_dtype)
+    return _prewitt_or_sobel(
+        input, axis, output, mode, cval, smooth, algorithm
+    )
+
+
+def sobel(input, axis=-1, output=None, mode="reflect", cval=0.0, *,
+          algorithm=None):
+    """Compute a Sobel filter along the given axis.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        axis (int): The axis of input along which to calculate. Default is -1.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): Border handling mode (``'reflect'``, ``'constant'``,
+            ``'nearest'``, ``'mirror'``, ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        algorithm (str, optional): Forwarded to the 1D correlation of each axis.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.sobel`
+
+    .. note::
+        When the output data type is integral (or when no output is provided
+        and input is integral) the results may not perfectly match the results
+        from SciPy due to floating-point rounding of intermediate results.
+    """
+    weights_dtype = cupy.promote_types(input.dtype, cupy.float32)
+    smooth = cupy.array([1, 2, 1], dtype=weights_dtype)
+    return _prewitt_or_sobel(
+        input, axis, output, mode, cval, smooth, algorithm
+    )
+
+
+def _prewitt_or_sobel(input, axis, output, mode, cval, weights, algorithm):
+    # [-1, 0, 1] difference along `axis`, the given `weights` on other axes.
+    axis = internal._normalize_axis_index(axis, input.ndim)
+    weights_dtype = cupy.promote_types(input.dtype, cupy.float32)
+
+    def get(is_diff, dtype=weights_dtype):
+        return cupy.array([-1, 0, 1], dtype=dtype) if is_diff else weights  # noqa
+
+    return _run_1d_correlates(input, [a == axis for a in range(input.ndim)],
+                              get, output, mode, cval, algorithm=algorithm)
+
+
+def generic_laplace(input, derivative2, output=None, mode="reflect",
+                    cval=0.0, extra_arguments=(), extra_keywords=None):
+    """Multi-dimensional Laplace filter using a provided second derivative
+    function.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        derivative2 (callable): Function or other callable with the following
+            signature that is called once per axis::
+
+                derivative2(input, axis, output, mode, cval,
+                            *extra_arguments, **extra_keywords)
+
+            where ``input`` and ``output`` are ``cupy.ndarray``, ``axis`` is an
+            ``int`` from ``0`` to the number of dimensions, and ``mode``,
+            ``cval``, ``extra_arguments``, ``extra_keywords`` are the values
+            given to this function.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        extra_arguments (sequence, optional):
+            Sequence of extra positional arguments to pass to ``derivative2``.
+        extra_keywords (dict, optional):
+            dict of extra keyword arguments to pass ``derivative2``.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.generic_laplace`
+
+    .. note::
+        When the output data type is integral (or when no output is provided
+        and input is integral) the results may not perfectly match the results
+        from SciPy due to floating-point rounding of intermediate results.
+    """
+    if extra_keywords is None:
+        extra_keywords = {}
+    ndim = input.ndim
+    modes = _util._fix_sequence_arg(mode, ndim, 'mode',
+                                    _util._check_mode)
+    output = _util._get_output(output, input)
+    if ndim == 0:
+        output[:] = input
+        return output
+    derivative2(input, 0, output, modes[0], cval,
+                *extra_arguments, **extra_keywords)
+    if ndim > 1:
+        tmp = _util._get_output(output.dtype, input)
+        for i in range(1, ndim):
+            derivative2(input, i, tmp, modes[i], cval,
+                        *extra_arguments, **extra_keywords)
+            output += tmp  # accumulate the result for axis i
+    return output
+
+
+def laplace(input, output=None, mode="reflect", cval=0.0, *, algorithm=None):
+    """Multi-dimensional Laplace filter using approximate second derivatives.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): Border handling mode (``'reflect'``, ``'constant'``,
+            ``'nearest'``, ``'mirror'``, ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        algorithm (str, optional): Forwarded unchanged to ``correlate1d``.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.laplace`
+
+    .. note::
+        When the output data type is integral (or when no output is provided
+        and input is integral) the results may not perfectly match the results
+        from SciPy due to floating-point rounding of intermediate results.
+    """
+    weights_dtype = cupy.promote_types(input.dtype, cupy.float32)
+    weights = cupy.array([1, -2, 1], dtype=weights_dtype)
+
+    def derivative2(input, axis, output, mode, cval):
+        return correlate1d(
+            input, weights, axis, output, mode, cval, algorithm=algorithm
+        )
+
+    return generic_laplace(input, derivative2, output, mode, cval)
+
+
+def gaussian_laplace(input, sigma, output=None, mode="reflect",
+                     cval=0.0, *, algorithm=None, **kwargs):
+    """Multi-dimensional Laplace filter using Gaussian second derivatives.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        sigma (scalar or sequence of scalar): Standard deviations for each axis
+            of Gaussian kernel. A single value applies to all axes.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): Border handling mode (``'reflect'``, ``'constant'``,
+            ``'nearest'``, ``'mirror'``, ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        algorithm (str, optional): Forwarded to ``gaussian_filter()``.
+        kwargs (dict, optional):
+            dict of extra keyword arguments to pass ``gaussian_filter()``.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.gaussian_laplace`
+
+    .. note::
+        When the output data type is integral (or when no output is provided
+        and input is integral) the results may not perfectly match the results
+        from SciPy due to floating-point rounding of intermediate results.
+    """
+    def derivative2(input, axis, output, mode, cval):
+        order = [0] * input.ndim
+        order[axis] = 2
+        return gaussian_filter(input, sigma, order, output, mode, cval,
+                               algorithm=algorithm, **kwargs)
+    return generic_laplace(input, derivative2, output, mode, cval)
+
+
+def generic_gradient_magnitude(input, derivative, output=None,
+                               mode="reflect", cval=0.0,
+                               extra_arguments=(), extra_keywords=None):
+    """Multi-dimensional gradient magnitude filter using a provided derivative
+    function.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        derivative (callable): Function or other callable with the following
+            signature that is called once per axis::
+
+                derivative(input, axis, output, mode, cval,
+                           *extra_arguments, **extra_keywords)
+
+            where ``input`` and ``output`` are ``cupy.ndarray``, ``axis`` is an
+            ``int`` from ``0`` to the number of dimensions, and ``mode``,
+            ``cval``, ``extra_arguments``, ``extra_keywords`` are the values
+            given to this function.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        extra_arguments (sequence, optional):
+            Sequence of extra positional arguments to pass to ``derivative``.
+        extra_keywords (dict, optional):
+            dict of extra keyword arguments to pass ``derivative``.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.generic_gradient_magnitude`
+
+    .. note::
+        When the output data type is integral (or when no output is provided
+        and input is integral) the results may not perfectly match the results
+        from SciPy due to floating-point rounding of intermediate results.
+    """
+    if extra_keywords is None:
+        extra_keywords = {}
+    ndim = input.ndim
+    modes = _util._fix_sequence_arg(mode, ndim, 'mode',
+                                    _util._check_mode)
+    output = _util._get_output(output, input)
+    if ndim == 0:
+        output[:] = input
+        return output
+    derivative(input, 0, output, modes[0], cval,
+               *extra_arguments, **extra_keywords)
+    output *= output  # square the axis-0 result in place
+    if ndim > 1:
+        tmp = _util._get_output(output.dtype, input)
+        for i in range(1, ndim):
+            derivative(input, i, tmp, modes[i], cval,
+                       *extra_arguments, **extra_keywords)
+            tmp *= tmp
+            output += tmp  # running sum of squared per-axis results
+    return cupy.sqrt(output, output, casting='unsafe')
+
+
+def gaussian_gradient_magnitude(input, sigma, output=None, mode="reflect",
+                                cval=0.0, *, algorithm=None, **kwargs):
+    """Multi-dimensional gradient magnitude using Gaussian derivatives.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        sigma (scalar or sequence of scalar): Standard deviations for each axis
+            of Gaussian kernel. A single value applies to all axes.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): Border handling mode (``'reflect'``, ``'constant'``,
+            ``'nearest'``, ``'mirror'``, ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        algorithm (str, optional): Forwarded to ``gaussian_filter()``.
+        kwargs (dict, optional):
+            dict of extra keyword arguments to pass ``gaussian_filter()``.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.gaussian_gradient_magnitude`
+
+    .. note::
+        When the output data type is integral (or when no output is provided
+        and input is integral) the results may not perfectly match the results
+        from SciPy due to floating-point rounding of intermediate results.
+    """
+    def derivative(input, axis, output, mode, cval):
+        order = [0] * input.ndim
+        order[axis] = 1
+        return gaussian_filter(input, sigma, order, output, mode, cval,
+                               algorithm=algorithm, **kwargs)
+    return generic_gradient_magnitude(input, derivative, output, mode, cval)
+
+
+def minimum_filter(input, size=None, footprint=None, output=None,
+                   mode="reflect", cval=0.0, origin=0):
+    """Multi-dimensional minimum filter.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        size (int or sequence of int): One of ``size`` or ``footprint`` must be
+            provided. If ``footprint`` is given, ``size`` is ignored. Otherwise
+            ``footprint = cupy.ones(size)`` with ``size`` automatically made to
+            match the number of dimensions in ``input``.
+        footprint (cupy.ndarray): a boolean array which specifies which of the
+            elements within this shape will get passed to the filter function.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        origin (int or sequence of int): The origin parameter controls the
+            placement of the filter, relative to the center of the current
+            element of the input. Default of 0 is equivalent to
+            ``(0,)*input.ndim``.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.minimum_filter`
+    """
+    return _min_or_max_filter(input, size, footprint, None, output, mode,
+                              cval, origin, 'min')
+
+
+def maximum_filter(input, size=None, footprint=None, output=None,
+                   mode="reflect", cval=0.0, origin=0):
+    """Multi-dimensional maximum filter.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        size (int or sequence of int): One of ``size`` or ``footprint`` must be
+            provided. If ``footprint`` is given, ``size`` is ignored. Otherwise
+            ``footprint = cupy.ones(size)`` with ``size`` automatically made to
+            match the number of dimensions in ``input``.
+        footprint (cupy.ndarray): a boolean array which specifies which of the
+            elements within this shape will get passed to the filter function.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        origin (int or sequence of int): The origin parameter controls the
+            placement of the filter, relative to the center of the current
+            element of the input. Default of 0 is equivalent to
+            ``(0,)*input.ndim``.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.maximum_filter`
+    """
+    return _min_or_max_filter(input, size, footprint, None, output, mode,
+                              cval, origin, 'max')
+
+
+def _min_or_max_filter(input, size, ftprnt, structure, output, mode, cval,
+                       origin, func):
+    # structure is used by morphology.grey_erosion() and grey_dilation()
+    # and not by the regular min/max filters
+    sizes, ftprnt, structure = _filters_core._check_size_footprint_structure(
+        input.ndim, size, ftprnt, structure)
+    # NOTE(review): identity test; NaN objects other than cupy.nan pass it.
+    if cval is cupy.nan:
+        raise NotImplementedError("NaN cval is unsupported")
+
+    if sizes is not None:
+        # Separable filter, run as a series of 1D filters
+        fltr = minimum_filter1d if func == 'min' else maximum_filter1d
+        return _filters_core._run_1d_filters(
+            [fltr if size > 1 else None for size in sizes],
+            input, sizes, output, mode, cval, origin)
+
+    origins, int_type = _filters_core._check_nd_args(input, ftprnt,
+                                                     mode, origin, 'footprint')
+    if structure is not None and structure.ndim != input.ndim:
+        raise RuntimeError('structure array has incorrect shape')
+
+    if ftprnt.size == 0:
+        return cupy.zeros_like(input)
+    offsets = _filters_core._origins_to_offsets(origins, ftprnt.shape)
+    kernel = _get_min_or_max_kernel(mode, ftprnt.shape, func,
+                                    offsets, float(cval), int_type,
+                                    has_structure=structure is not None,
+                                    has_central_value=bool(ftprnt[offsets]))
+    return _filters_core._call_kernel(kernel, input, ftprnt, output,
+                                      structure, weights_dtype=bool)
+
+
+def minimum_filter1d(input, size, axis=-1, output=None, mode="reflect",
+                     cval=0.0, origin=0):
+    """Compute the minimum filter along a single axis.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        size (int): Length of the minimum filter.
+        axis (int): The axis of input along which to calculate. Default is -1.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        origin (int): The origin parameter controls the placement of the
+            filter, relative to the center of the current element of the
+            input. Default is ``0``.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.minimum_filter1d`
+    """
+    return _min_or_max_1d(input, size, axis, output, mode, cval, origin, 'min')
+
+
+def maximum_filter1d(input, size, axis=-1, output=None, mode="reflect",
+                     cval=0.0, origin=0):
+    """Compute the maximum filter along a single axis.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        size (int): Length of the maximum filter.
+        axis (int): The axis of input along which to calculate. Default is -1.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        origin (int): The origin parameter controls the placement of the
+            filter, relative to the center of the current element of the
+            input. Default is ``0``.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.maximum_filter1d`
+    """
+    return _min_or_max_1d(input, size, axis, output, mode, cval, origin, 'max')
+
+
+def _min_or_max_1d(input, size, axis=-1, output=None, mode="reflect", cval=0.0,
+                   origin=0, func='min'):
+    ftprnt = cupy.ones(size, dtype=bool)  # all-True footprint, length `size`
+    ftprnt, origin = _filters_core._convert_1d_args(input.ndim, ftprnt,
+                                                    origin, axis)
+    origins, int_type = _filters_core._check_nd_args(input, ftprnt,
+                                                     mode, origin, 'footprint')
+    offsets = _filters_core._origins_to_offsets(origins, ftprnt.shape)
+    kernel = _get_min_or_max_kernel(mode, ftprnt.shape, func, offsets,
+                                    float(cval), int_type, has_weights=False)
+    return _filters_core._call_kernel(kernel, input, None, output,
+                                      weights_dtype=bool)
+
+
+@cupy._util.memoize(for_each_device=True)
+def _get_min_or_max_kernel(mode, w_shape, func, offsets, cval, int_type,
+                           has_weights=True, has_structure=False,
+                           has_central_value=True):
+    # When there are no 'weights' (the footprint, for the 1D variants) then
+    # we need to make sure intermediate results are stored as doubles for
+    # consistent results with scipy.
+    ctype = 'X' if has_weights else 'double'
+    value = '{value}'
+    if not has_weights:
+        value = 'cast<double>({})'.format(value)
+
+    # Non-flat structure biases the values: '-' for min, '+' for max
+    if has_structure:
+        value += ('-' if func == 'min' else '+') + 'cast<X>(sval)'
+
+    if has_central_value:
+        pre = '{} value = x[i];'
+        found = 'value = {func}({value}, value);'
+    else:
+        # If the central pixel is not included in the footprint we cannot
+        # assume `x[i]` is not below the min or above the max and thus cannot
+        # seed with that value. Instead we keep track of having set `value`.
+        pre = '{} value; bool set = false;'
+        found = 'value = set ? {func}({value}, value) : {value}; set=true;'
+
+    return _filters_core._generate_nd_kernel(
+        func, pre.format(ctype),
+        found.format(func=func, value=value), 'y = cast<Y>(value);',
+        mode, w_shape, int_type, offsets, cval, ctype=ctype,
+        has_weights=has_weights, has_structure=has_structure)
+
+
+def rank_filter(input, rank, size=None, footprint=None, output=None,
+                mode="reflect", cval=0.0, origin=0):
+    """Multi-dimensional rank filter.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        rank (int): The rank of the element to get. Can be negative to count
+            from the largest value, e.g. ``-1`` indicates the largest value.
+        size (int or sequence of int): One of ``size`` or ``footprint`` must be
+            provided. If ``footprint`` is given, ``size`` is ignored. Otherwise
+            ``footprint = cupy.ones(size)`` with ``size`` automatically made to
+            match the number of dimensions in ``input``.
+        footprint (cupy.ndarray): a boolean array which specifies which of the
+            elements within this shape will get passed to the filter function.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        origin (int or sequence of int): The origin parameter controls the
+            placement of the filter, relative to the center of the current
+            element of the input. Default of 0 is equivalent to
+            ``(0,)*input.ndim``.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.rank_filter`
+    """
+    rank = int(rank)  # negative ranks get the footprint size added below
+    return _rank_filter(input, lambda fs: rank + fs if rank < 0 else rank,
+                        size, footprint, output, mode, cval, origin)
+
+
+def median_filter(input, size=None, footprint=None, output=None,
+                  mode="reflect", cval=0.0, origin=0):
+    """Multi-dimensional median filter.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        size (int or sequence of int): One of ``size`` or ``footprint`` must be
+            provided. If ``footprint`` is given, ``size`` is ignored. Otherwise
+            ``footprint = cupy.ones(size)`` with ``size`` automatically made to
+            match the number of dimensions in ``input``.
+        footprint (cupy.ndarray): a boolean array which specifies which of the
+            elements within this shape will get passed to the filter function.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        origin (int or sequence of int): The origin parameter controls the
+            placement of the filter, relative to the center of the current
+            element of the input. Default of 0 is equivalent to
+            ``(0,)*input.ndim``.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.median_filter`
+    """
+    return _rank_filter(input, lambda fs: fs // 2,  # middle rank
+                        size, footprint, output, mode, cval, origin)
+
+
+def percentile_filter(input, percentile, size=None, footprint=None,
+                      output=None, mode="reflect", cval=0.0, origin=0):
+    """Multi-dimensional percentile filter.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        percentile (scalar): The percentile of the element to get (from ``0``
+            to ``100``). Can be negative, thus ``-20`` equals ``80``.
+        size (int or sequence of int): One of ``size`` or ``footprint`` must be
+            provided. If ``footprint`` is given, ``size`` is ignored. Otherwise
+            ``footprint = cupy.ones(size)`` with ``size`` automatically made to
+            match the number of dimensions in ``input``.
+        footprint (cupy.ndarray): a boolean array which specifies which of the
+            elements within this shape will get passed to the filter function.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output. Default is the same dtype as the input.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``'constant'``. Default is ``0.0``.
+        origin (int or sequence of int): The origin parameter controls the
+            placement of the filter, relative to the center of the current
+            element of the input. Default of 0 is equivalent to
+            ``(0,)*input.ndim``.
+
+    Returns:
+        cupy.ndarray: The result of the filtering.
+
+    .. seealso:: :func:`scipy.ndimage.percentile_filter`
+    """
+    percentile = float(percentile)
+    if percentile < 0.0:  # negative values count down from 100
+        percentile += 100.0
+    if percentile < 0.0 or percentile > 100.0:
+        raise RuntimeError('invalid percentile')
+    if percentile == 100.0:  # special-cased to the last rank, fs - 1
+        def get_rank(fs):
+            return fs - 1
+    else:
+        def get_rank(fs):
+            return int(float(fs) * percentile / 100.0)
+    return _rank_filter(input, get_rank,
+                        size, footprint, output, mode, cval, origin)
+
+
+def _rank_filter(input, get_rank, size=None, footprint=None, output=None,
+                 mode="reflect", cval=0.0, origin=0):
+    _, footprint, _ = _filters_core._check_size_footprint_structure(
+        input.ndim, size, footprint, None, force_footprint=True)
+    if cval is cupy.nan:  # NOTE(review): identity test; other NaNs not caught
+        raise NotImplementedError("NaN cval is unsupported")
+    origins, int_type = _filters_core._check_nd_args(input, footprint,
+                                                     mode, origin, 'footprint')
+    if footprint.size == 0:
+        return cupy.zeros_like(input)
+    filter_size = int(footprint.sum())  # count of True footprint entries
+    rank = get_rank(filter_size)  # callback maps that count to a rank
+    if rank < 0 or rank >= filter_size:
+        raise RuntimeError('rank not within filter footprint size')
+    if rank == 0:  # lowest rank: use the minimum filter
+        return _min_or_max_filter(input, None, footprint, None, output, mode,
+                                  cval, origins, 'min')
+    if rank == filter_size - 1:  # highest rank: use the maximum filter
+        return _min_or_max_filter(input, None, footprint, None, output, mode,
+                                  cval, origins, 'max')
+    offsets = _filters_core._origins_to_offsets(origins, footprint.shape)
+    kernel = _get_rank_kernel(filter_size, rank, mode, footprint.shape,
+                              offsets, float(cval), int_type)
+    return _filters_core._call_kernel(kernel, input, footprint, output,
+                                      weights_dtype=bool)
+
+
+__SHELL_SORT = '''
+__device__ void sort(X *array, int size) {{
+    int gap = {gap};
+    while (gap > 1) {{
+        gap /= 3;
+        for (int i = gap; i < size; ++i) {{
+            X value = array[i];
+            int j = i - gap;
+            while (j >= 0 && value < array[j]) {{
+                array[j + gap] = array[j];
+                j -= gap;
+            }}
+            array[j + gap] = value;
+        }}
+    }}
+}}'''
+
+
+@cupy._util.memoize()
+def _get_shell_gap(filter_size):  # first of 1, 4, 13, 40, ... >= filter_size
+    step = 1
+    while not step >= filter_size:
+        step = step * 3 + 1
+    return step
+
+
+@cupy._util.memoize(for_each_device=True)
+def _get_rank_kernel(filter_size, rank, mode, w_shape, offsets, cval,
+                     int_type):
+    s_rank = min(rank, filter_size - rank - 1)  # rank from the nearer end
+    # The threshold was set based on the measurements on a V100
+    # TODO(leofang, anaruse): Use Optuna to automatically tune the threshold,
+    # as it may vary depending on the GPU in use, compiler version, dtype,
+    # filter size, etc.
+    if s_rank <= 80:
+        # When s_rank is small and register usage is low, this partial
+        # selection sort approach is faster than general sorting approach
+        # using shell sort.
+        if s_rank == rank:
+            comp_op = '<'  # the largest buffered value gets discarded
+        else:
+            comp_op = '>'  # the smallest buffered value gets discarded
+        array_size = s_rank + 2  # s_rank + 1 kept values plus one incoming
+        found_post = '''
+            if (iv > {rank} + 1) {{{{
+                int target_iv = 0;
+                X target_val = values[0];
+                for (int jv = 1; jv <= {rank} + 1; jv++) {{{{
+                    if (target_val {comp_op} values[jv]) {{{{
+                        target_val = values[jv];
+                        target_iv = jv;
+                    }}}}
+                }}}}
+                if (target_iv <= {rank}) {{{{
+                    values[target_iv] = values[{rank} + 1];
+                }}}}
+                iv = {rank} + 1;
+            }}}}'''.format(rank=s_rank, comp_op=comp_op)
+        post = '''
+            X target_val = values[0];
+            for (int jv = 1; jv <= {rank}; jv++) {{
+                if (target_val {comp_op} values[jv]) {{
+                    target_val = values[jv];
+                }}
+            }}
+            y=cast<Y>(target_val);'''.format(rank=s_rank, comp_op=comp_op)
+        sorter = ''
+    else:
+        array_size = filter_size  # buffer every value for a full sort
+        found_post = ''
+        post = 'sort(values,{});\ny=cast<Y>(values[{}]);'.format(
+            filter_size, rank)
+        sorter = __SHELL_SORT.format(gap=_get_shell_gap(filter_size))
+
+    return _filters_core._generate_nd_kernel(
+        'rank_{}_{}'.format(filter_size, rank),
+        'int iv = 0;\nX values[{}];'.format(array_size),
+        'values[iv++] = {value};' + found_post, post,
+        mode, w_shape, int_type, offsets, cval, preamble=sorter)
diff --git a/python/cucim/src/cucim/skimage/_vendored/_ndimage_filters_core.py b/python/cucim/src/cucim/skimage/_vendored/_ndimage_filters_core.py
index 60aec68cf..469dff9d9 100644
--- a/python/cucim/src/cucim/skimage/_vendored/_ndimage_filters_core.py
+++ b/python/cucim/src/cucim/skimage/_vendored/_ndimage_filters_core.py
@@ -1,24 +1,172 @@
 """A vendored subset of cupyx.scipy.ndimage._filters_core"""
+import warnings
 
 import cupy
 import numpy
 
+from cucim.skimage._vendored import _internal as internal
 from cucim.skimage._vendored import _ndimage_util as _util
 
-includes = r'''
-// workaround for HIP: line begins with #include
+
+def _origins_to_offsets(origins, w_shape):  # per axis: centre index + origin
+    return tuple(map(lambda w, o: w // 2 + o, w_shape, origins))
+
+
+def _check_size_footprint_structure(ndim, size, footprint, structure,
+                                    stacklevel=3, force_footprint=False):
+    if structure is None and footprint is None:  # only `size` is left to use
+        if size is None:
+            raise RuntimeError("no footprint or filter size provided")
+        sizes = _util._fix_sequence_arg(size, ndim, 'size', int)
+        if force_footprint:
+            return None, cupy.ones(sizes, bool), None  # all-True footprint
+        return sizes, None, None  # order: (sizes, footprint, structure)
+    if size is not None:
+        warnings.warn("ignoring size because {} is set".format(
+            'structure' if footprint is None else 'footprint'),
+            UserWarning, stacklevel=stacklevel + 1)
+
+    if footprint is not None:
+        footprint = cupy.array(footprint, bool, True, 'C')
+        if not footprint.any():
+            raise ValueError("all-zero footprint is not supported")
+
+    if structure is None:
+        if not force_footprint and footprint.all():  # all True: shape suffices
+            if footprint.ndim != ndim:
+                raise RuntimeError("size must have length equal to input rank")
+            return footprint.shape, None, None
+        return None, footprint, None
+
+    structure = cupy.ascontiguousarray(structure)
+    if footprint is None:
+        footprint = cupy.ones(structure.shape, bool)  # cover all of structure
+    return None, footprint, structure
+
+
+def _convert_1d_args(ndim, weights, origin, axis):
+    """Expand 1D ``weights`` and ``origin`` to ``ndim`` dimensions."""
+    if not (weights.ndim == 1 and weights.size >= 1):
+        raise RuntimeError('incorrect filter size')
+    axis = internal._normalize_axis_index(axis, ndim)
+    n = weights.size
+    shape = tuple(n if d == axis else 1 for d in range(ndim))
+    checked = _util._check_origin(origin, n)
+    origins = tuple(checked if d == axis else 0 for d in range(ndim))
+    return weights.reshape(shape), origins
+
+
+def _check_nd_args(input, weights, mode, origin, wghts_name='filter weights'):
+    _util._check_mode(mode)
+    # Refuse weights that occupy 2**31 bytes (2 GiB) or more
+    if (1 << 31) <= weights.nbytes:
+        raise RuntimeError('weights must be 2 GiB or less, use FFTs instead')
+    widths = [extent for extent in weights.shape if extent != 0]
+    if input.ndim != len(widths):
+        raise RuntimeError('{} array has incorrect shape'.format(wghts_name))
+    origins = _util._fix_sequence_arg(origin, len(widths), 'origin', int)
+    for shift, width in zip(origins, widths):
+        _util._check_origin(shift, width)
+    return tuple(origins), _util._get_inttype(input)
+
+
+def _run_1d_filters(filters, input, args, output, mode, cval, origin=0,
+                    **filter_kwargs):
+    """
+    Runs a series of 1D filters forming an nd filter. The filters must be a
+    list of callables that take input, arg, axis, output, mode, cval, origin.
+    The args is a list of values that are passed for the arg value to the
+    filter. Individual filters can be None causing that axis to be skipped.
+    """
+    output = _util._get_output(output, input)
+    modes = _util._fix_sequence_arg(mode, input.ndim, 'mode',
+                                    _util._check_mode)
+    # for filters, "wrap" is a synonym for "grid-wrap".
+    modes = ['grid-wrap' if m == 'wrap' else m for m in modes]
+    origins = _util._fix_sequence_arg(origin, input.ndim, 'origin', int)
+    n_filters = sum(filter is not None for filter in filters)  # active axes
+    if n_filters == 0:
+        output[:] = input
+        return output
+    # We can't operate in-place efficiently, so use a 2-buffer system
+    temp = _util._get_output(output.dtype, input) if n_filters > 1 else None   # noqa
+    iterator = zip(filters, args, modes, origins)  # consumed by both loops
+    for axis, (fltr, arg, mode, origin) in enumerate(iterator):
+        if fltr is None:
+            continue
+        else:
+            break  # first active filter found
+    if n_filters % 2 == 0:  # even count: begin in temp, finish in output
+        fltr(input, arg, axis, temp, mode, cval, origin, **filter_kwargs)
+        input = temp
+    else:
+        fltr(input, arg, axis, output, mode, cval, origin, **filter_kwargs)
+        if n_filters == 1:
+            return output
+        input, output = output, temp
+    for axis, (fltr, arg, mode, origin) in enumerate(iterator, start=axis + 1):
+        if fltr is None:
+            continue
+        fltr(input, arg, axis, output, mode, cval, origin, **filter_kwargs)
+        input, output = output, input  # swap source and destination buffers
+    return input
+
+
+def _call_kernel(kernel, input, weights, output, structure=None,
+                 weights_dtype=numpy.float64, structure_dtype=numpy.float64):
+    """
+    Calls a constructed ElementwiseKernel. The kernel must take an input image,
+    an optional array of weights, an optional array for the structure, and an
+    output array.
+
+    weights and structure can be given as None (structure defaults to None) in
+    which case they are not passed to the kernel at all. If the output is given
+    as None then it will be allocated in this function.
+
+    This function deals with making sure that the weights and structure are
+    contiguous and float64 (or bool for weights that are footprints)*, that the
+    output is allocated and appropriately shaped. This also deals with the
+    situation that the input and output arrays overlap in memory.
+
+    * weights is always cast to float64 or bool in order to get an output
+    compatible with SciPy, though float32 might be sufficient when input dtype
+    is low precision. If weights_dtype is passed as weights.dtype then no
+    dtype conversion will occur. The input and output are never converted.
+    """
+    args = [input]
+    complex_output = input.dtype.kind == 'c'
+    if weights is not None:
+        weights = cupy.ascontiguousarray(weights, weights_dtype)
+        complex_output = complex_output or weights.dtype.kind == 'c'
+        args.append(weights)
+    if structure is not None:
+        structure = cupy.ascontiguousarray(structure, structure_dtype)
+        args.append(structure)
+    output = _util._get_output(output, input, None, complex_output)  # noqa
+    needs_temp = cupy.shares_memory(output, input, 'MAY_SHARE_BOUNDS')
+    if needs_temp:
+        output, temp = _util._get_output(output.dtype, input, None, complex_output), output  # noqa
+    args.append(output)  # scratch array when the requested output aliases input
+    kernel(*args)
+    if needs_temp:
+        temp[:] = output  # copy the result back into the requested output
+        output = temp
+    return output
+
+
+_ndimage_includes = r'''
 #include <type_traits>  // let Jitify handle this
 #include <cupy/math_constants.h>
-'''
 
-
-_CAST_FUNCTION = """
-// Implements a casting function to make it compatible with scipy
-// Use like cast<to_type>(value)
 template<> struct std::is_floating_point<float16> : std::true_type {};
 template<> struct std::is_signed<float16> : std::true_type {};
 template<class T> struct std::is_signed<complex<T>> : std::is_signed<T> {};
+'''
 
+
+_ndimage_CAST_FUNCTION = """
+// Implements a casting function to make it compatible with scipy
+// Use like cast<to_type>(value)
 template <class B, class A>
 __device__ __forceinline__
 typename std::enable_if<(!std::is_floating_point<A>::value
@@ -148,7 +296,7 @@ def _generate_nd_kernel(name, pre, found, post, mode, w_shape, int_type,
         name += '_with_structure'
     if has_mask:
         name += '_with_mask'
-    preamble = includes + _CAST_FUNCTION + preamble
+    preamble = _ndimage_includes + _ndimage_CAST_FUNCTION + preamble
     options += ('--std=c++11', '-DCUPY_USE_JITIFY')
     return cupy.ElementwiseKernel(in_params, out_params, operation, name,
                                   reduce_dims=False, preamble=preamble,
diff --git a/python/cucim/src/cucim/skimage/_vendored/_ndimage_interp_kernels.py b/python/cucim/src/cucim/skimage/_vendored/_ndimage_interp_kernels.py
new file mode 100644
index 000000000..fa65eed36
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/_vendored/_ndimage_interp_kernels.py
@@ -0,0 +1,598 @@
+import cupy
+import numpy
+
+from cucim.skimage._vendored import \
+    _ndimage_spline_kernel_weights as _spline_kernel_weights
+from cucim.skimage._vendored import \
+    _ndimage_spline_prefilter_core as _spline_prefilter_core
+from cucim.skimage._vendored import _ndimage_util as _util
+
+math_constants_preamble = r'''
+// workaround for HIP: line begins with #include
+#include <cupy/math_constants.h>
+'''
+
+spline_weights_inline = _spline_kernel_weights.spline_weights_inline
+
+
+def _get_coord_map(ndim, nprepad=0):
+    """Generate code that reads target coordinates from coords.
+
+    Used for map_coordinates.
+
+    Notes
+    -----
+    The generated code expects these to exist on the device::
+
+        coords (ndarray): the target coordinates, read as a flat buffer
+            in which axis j starts at element j * ncoords.
+        _ind: the CIndexer that indexes the output array, y
+
+    and emits::
+
+        ncoords = _ind.size();
+        c_j = coords[i + j * ncoords];   (once per axis j)
+
+    so ncoords is determined by the size of the output array.
+    """
+    # a positive nprepad is added to every coordinate read
+    offset = f" + (W){nprepad}" if nprepad > 0 else ''
+    # ncoords is declared once, ahead of the per-axis reads
+    ops = ['ptrdiff_t ncoords = _ind.size();']
+    ops.extend(f'''
+    W c_{j} = coords[i + {j} * ncoords]{offset};''' for j in range(ndim))
+    return ops
+
+
+def _get_coord_zoom_and_shift(ndim, nprepad=0):
+    """Generate code for target coordinates: shift first, then zoom.
+
+    Zooming is measured from the centers of the edge pixels.
+
+    Notes
+    -----
+    The generated code expects these device variables to exist::
+
+        in_coord[ndim]: source coordinate along each axis
+        zoom[ndim]: zoom factor for each axis
+        shift[ndim]: shift for each axis
+
+    and evaluates, for every axis j::
+
+        c_j = zoom[j] * (in_coord[j] - shift[j])   (+ nprepad if positive)
+
+    """
+    # a positive nprepad is added to every generated coordinate
+    offset = f" + (W){nprepad}" if nprepad > 0 else ''
+    # one declaration per axis, in axis order
+    return [f'''
+    W c_{j} = zoom[{j}] * ((W)in_coord[{j}] - shift[{j}]){offset};'''
+            for j in range(ndim)]
+
+
+def _get_coord_zoom_and_shift_grid(ndim, nprepad=0):
+    """Compute target coordinate based on a shift followed by a zoom.
+
+    This version zooms from the outer edges of the grid pixels.
+
+    Notes
+    -----
+    Assumes the following variables have been initialized on the device::
+
+        in_coord[ndim]: array containing the source coordinate
+        zoom[ndim]: array containing the zoom for each axis
+        shift[ndim]: array containing the shift for each axis
+
+    computes (nprepad, when positive, is added to the result)::
+
+        c_j = zoom[j] * (in_coord[j] - shift[j] + 0.5) - 0.5
+
+    """
+    ops = []
+    pre = f" + (W){nprepad}" if nprepad > 0 else ''
+    for j in range(ndim):  # every index must be the substituted axis number
+        ops.append(f'''
+    W c_{j} = zoom[{j}] * ((W)in_coord[{j}] - shift[{j}] + 0.5) - 0.5{pre};''')
+    return ops
+
+
+def _get_coord_zoom(ndim, nprepad=0):
+    """Generate code for target coordinates obtained by zooming.
+
+    Zooming is measured from the centers of the edge pixels.
+
+    Notes
+    -----
+    The generated code expects these device variables to exist::
+
+        in_coord[ndim]: source coordinate along each axis
+        zoom[ndim]: zoom factor for each axis
+
+    and evaluates, for every axis j::
+
+        c_j = zoom[j] * in_coord[j]   (+ nprepad if positive)
+
+    """
+    # a positive nprepad is added to every generated coordinate
+    offset = f" + (W){nprepad}" if nprepad > 0 else ''
+    # one declaration per axis, in axis order
+    return [f'''
+    W c_{j} = zoom[{j}] * (W)in_coord[{j}]{offset};'''
+            for j in range(ndim)]
+
+
+def _get_coord_zoom_grid(ndim, nprepad=0):
+    """Generate code for zoomed target coordinates (grid_mode=True version).
+
+    Zooming is measured from the outer edges of the grid pixels.
+
+    Notes
+    -----
+    The generated code expects these device variables to exist::
+
+        in_coord[ndim]: source coordinate along each axis
+        zoom[ndim]: zoom factor for each axis
+
+    and evaluates, for every axis j::
+
+        c_j = zoom[j] * (in_coord[j] + 0.5) - 0.5   (+ nprepad if positive)
+
+    """
+    # a positive nprepad is added to every generated coordinate
+    offset = f" + (W){nprepad}" if nprepad > 0 else ''
+    # one declaration per axis, in axis order
+    return [f'''
+    W c_{j} = zoom[{j}] * ((W)in_coord[{j}] + 0.5) - 0.5{offset};'''
+            for j in range(ndim)]
+
+
+def _get_coord_shift(ndim, nprepad=0):
+    """Generate code for target coordinates obtained by shifting.
+
+    Notes
+    -----
+    The generated code expects these device variables to exist::
+
+        in_coord[ndim]: source coordinate along each axis
+        shift[ndim]: shift for each axis
+
+    and evaluates, for every axis j::
+
+        c_j = in_coord[j] - shift[j]   (+ nprepad if positive)
+
+    """
+    # a positive nprepad is added to every generated coordinate
+    offset = f" + (W){nprepad}" if nprepad > 0 else ''
+    # one declaration per axis, in axis order
+    return [f'''
+    W c_{j} = (W)in_coord[{j}] - shift[{j}]{offset};'''
+            for j in range(ndim)]
+
+
+def _get_coord_affine(ndim, nprepad=0):
+    """Generate code applying a homogeneous transformation matrix.
+
+    The matrix has shape (ndim, ndim + 1), i.e. it is an affine matrix
+    whose omitted last row is taken to be
+    ``[0] * ndim + [1]``.
+
+    Notes
+    -----
+    The generated code expects these device variables to exist::
+
+        mat(array): the (ndim, ndim + 1) transform matrix, indexed flat.
+        in_coord(array): coordinates of the input
+
+    For example, in 2D (nprepad, when positive, is added to each c_j):
+
+        c_0 = mat[0] * in_coord[0] + mat[1] * in_coord[1] + mat[2];
+        c_1 = mat[3] * in_coord[0] + mat[4] * in_coord[1] + mat[5];
+
+    """
+    offset = f" + (W){nprepad}" if nprepad > 0 else ''
+    ops = []
+    for j in range(ndim):
+        row = j * (ndim + 1)  # flat index where row j of mat begins
+        ops.append(f'''
+            W c_{j} = (W)0.0;''')
+        ops.extend(f'''
+            c_{j} += mat[{row + k}] * (W)in_coord[{k}];'''
+                   for k in range(ndim))
+        ops.append(f'''
+            c_{j} += mat[{row + ndim}]{offset};''')
+    return ops
+
+
+def _unravel_loop_index(shape, uint_t='unsigned int'):
+    """
+    Emit code that declares the multi-index array in_coord and unravels the
+    1D loop index i into it, treating the array as C-ordered.
+    """
+    ndim = len(shape)
+    pieces = [f'''
+        {uint_t} in_coord[{ndim}];
+        {uint_t} s, t, idx = i;''']
+    # peel off the last axis first; whatever remains belongs to axis 0
+    pieces.extend(f'''
+        s = {shape[j]};
+        t = idx / s;
+        in_coord[{j}] = idx - t * s;
+        idx = t;''' for j in reversed(range(1, ndim)))
+    pieces.append('''
+        in_coord[0] = idx;''')
+    return '\n'.join(pieces)
+
+
+def _generate_interp_custom(coord_func, ndim, large_int, yshape, mode, cval,
+                            order, name='', integer_output=False, nprepad=0,
+                            omit_in_coord=False):
+    """
+    Args:
+        coord_func (function): generates code to do the coordinate
+            transformation. See for example, `_get_coord_shift`.
+        ndim (int): The number of dimensions.
+        large_int (bool): If true use Py_ssize_t instead of int for indexing.
+        yshape (tuple): Shape of the output array.
+        mode (str): Signal extension mode to use at the array boundaries
+        cval (float): constant value used when `mode == 'constant'`.
+        name (str): base name for the interpolation kernel
+        integer_output (bool): boolean indicating whether the output has an
+            integer type.
+        nprepad (int): integer indicating the amount of prepadding at the
+            boundaries.
+
+    Returns:
+        operation (str): code body for the ElementwiseKernel
+        name (str): name for the ElementwiseKernel
+    """
+
+    ops = []
+    internal_dtype = 'double' if integer_output else 'Y'
+    ops.append(f'{internal_dtype} out = 0.0;')
+
+    if large_int:
+        uint_t = 'size_t'
+        int_t = 'ptrdiff_t'
+    else:
+        uint_t = 'unsigned int'
+        int_t = 'int'
+
+    # determine strides for x along each axis
+    for j in range(ndim):
+        ops.append(f'const {int_t} xsize_{j} = x.shape()[{j}];')
+    ops.append(f'const {uint_t} sx_{ndim - 1} = 1;')
+    for j in range(ndim - 1, 0, -1):
+        ops.append(f'const {uint_t} sx_{j - 1} = sx_{j} * xsize_{j};')
+
+    if not omit_in_coord:
+        # create in_coords array to store the unraveled indices
+        ops.append(_unravel_loop_index(yshape, uint_t))
+
+    # compute the transformed (target) coordinates, c_j
+    ops = ops + coord_func(ndim, nprepad)
+
+    if cval is numpy.nan:
+        cval = '(Y)CUDART_NAN'
+    elif cval == numpy.inf:
+        cval = '(Y)CUDART_INF'
+    elif cval == -numpy.inf:
+        cval = '(Y)(-CUDART_INF)'
+    else:
+        cval = f'({internal_dtype}){cval}'
+
+    if mode == 'constant':
+        # use cval if coordinate is outside the bounds of x
+        _cond = ' || '.join(
+            [f'(c_{j} < 0) || (c_{j} > xsize_{j} - 1)' for j in range(ndim)])
+        ops.append(f'''
+        if ({_cond})
+        {{
+            out = {cval};
+        }}
+        else
+        {{''')
+
+    if order == 0:
+        if mode == 'wrap':
+            ops.append('double dcoord;')  # mode 'wrap' requires this to work
+        for j in range(ndim):
+            # determine nearest neighbor
+            if mode == 'wrap':
+                ops.append(f'''
+                dcoord = c_{j};''')
+            else:
+                ops.append(f'''
+                {int_t} cf_{j} = ({int_t})floor((double)c_{j} + 0.5);''')
+
+            # handle boundary
+            if mode != 'constant':
+                if mode == 'wrap':
+                    ixvar = 'dcoord'
+                    float_ix = True
+                else:
+                    ixvar = f'cf_{j}'
+                    float_ix = False
+                ops.append(
+                    _util._generate_boundary_condition_ops(
+                        mode, ixvar, f'xsize_{j}', int_t, float_ix))
+                if mode == 'wrap':
+                    ops.append(f'''
+                {int_t} cf_{j} = ({int_t})floor(dcoord + 0.5);''')
+
+            # sum over ic_j will give the raveled coordinate in the input
+            ops.append(f'''
+            {int_t} ic_{j} = cf_{j} * sx_{j};''')
+        _coord_idx = ' + '.join([f'ic_{j}' for j in range(ndim)])
+        if mode == 'grid-constant':
+            _cond = ' || '.join([f'(ic_{j} < 0)' for j in range(ndim)])
+            ops.append(f'''
+            if ({_cond}) {{
+                out = {cval};
+            }} else {{
+                out = ({internal_dtype})x[{_coord_idx}];
+            }}''')
+        else:
+            ops.append(f'''
+            out = ({internal_dtype})x[{_coord_idx}];''')
+
+    elif order == 1:
+        for j in range(ndim):
+            # get coordinates for linear interpolation along axis j
+            ops.append(f'''
+            {int_t} cf_{j} = ({int_t})floor((double)c_{j});
+            {int_t} cc_{j} = cf_{j} + 1;
+            {int_t} n_{j} = (c_{j} == cf_{j}) ? 1 : 2;  // points needed
+            ''')
+
+            if mode == 'wrap':
+                ops.append(f'''
+                double dcoordf = c_{j};
+                double dcoordc = c_{j} + 1;''')
+            else:
+                # handle boundaries for extension modes.
+                ops.append(f'''
+                {int_t} cf_bounded_{j} = cf_{j};
+                {int_t} cc_bounded_{j} = cc_{j};''')
+
+            if mode != 'constant':
+                if mode == 'wrap':
+                    ixvar = 'dcoordf'
+                    float_ix = True
+                else:
+                    ixvar = f'cf_bounded_{j}'
+                    float_ix = False
+                ops.append(
+                    _util._generate_boundary_condition_ops(
+                        mode, ixvar, f'xsize_{j}', int_t, float_ix))
+
+                ixvar = 'dcoordc' if mode == 'wrap' else f'cc_bounded_{j}'
+                ops.append(
+                    _util._generate_boundary_condition_ops(
+                        mode, ixvar, f'xsize_{j}', int_t, float_ix))
+                if mode == 'wrap':
+                    ops.append(
+                        f'''
+                    {int_t} cf_bounded_{j} = ({int_t})floor(dcoordf);;
+                    {int_t} cc_bounded_{j} = ({int_t})floor(dcoordf + 1);;
+                    '''
+                    )
+
+            ops.append(f'''
+            for (int s_{j} = 0; s_{j} < n_{j}; s_{j}++)
+                {{
+                    W w_{j};
+                    {int_t} ic_{j};
+                    if (s_{j} == 0)
+                    {{
+                        w_{j} = (W)cc_{j} - c_{j};
+                        ic_{j} = cf_bounded_{j} * sx_{j};
+                    }} else
+                    {{
+                        w_{j} = c_{j} - (W)cf_{j};
+                        ic_{j} = cc_bounded_{j} * sx_{j};
+                    }}''')
+    elif order > 1:
+        if mode == 'grid-constant':
+            spline_mode = 'constant'
+        elif mode == 'nearest':
+            spline_mode = 'nearest'
+        else:
+            spline_mode = _spline_prefilter_core._get_spline_mode(mode)
+
+        # wx, wy are temporary variables used during spline weight computation
+        ops.append(f'''
+            W wx, wy;
+            {int_t} start;''')
+        for j in range(ndim):
+            # determine weights along the current axis
+            ops.append(f'''
+            W weights_{j}[{order + 1}];''')
+            ops.append(spline_weights_inline[order].format(j=j, order=order))
+
+            # get starting coordinate for spline interpolation along axis j
+            if mode in ['wrap']:
+                ops.append(f'double dcoord = c_{j};')
+                coord_var = 'dcoord'
+                ops.append(
+                    _util._generate_boundary_condition_ops(
+                        mode, coord_var, f'xsize_{j}', int_t, True))
+            else:
+                coord_var = f'(double)c_{j}'
+
+            if order & 1:
+                op_str = '''
+                start = ({int_t})floor({coord_var}) - {order_2};'''
+            else:
+                op_str = '''
+                start = ({int_t})floor({coord_var} + 0.5) - {order_2};'''
+            ops.append(
+                op_str.format(
+                    int_t=int_t, coord_var=coord_var, order_2=order // 2
+                ))
+
+            # set of coordinate values within spline footprint along axis j
+            ops.append(f'''{int_t} ci_{j}[{order + 1}];''')
+            for k in range(order + 1):
+                ixvar = f'ci_{j}[{k}]'
+                ops.append(f'''
+                {ixvar} = start + {k};''')
+                ops.append(
+                    _util._generate_boundary_condition_ops(
+                        spline_mode, ixvar, f'xsize_{j}', int_t))
+
+            # loop over the order + 1 values in the spline filter
+            ops.append(f'''
+            W w_{j};
+            {int_t} ic_{j};
+            for (int k_{j} = 0; k_{j} <= {order}; k_{j}++)
+                {{
+                    w_{j} = weights_{j}[k_{j}];
+                    ic_{j} = ci_{j}[k_{j}] * sx_{j};
+            ''')
+
+    if order > 0:
+
+        _weight = ' * '.join([f'w_{j}' for j in range(ndim)])
+        _coord_idx = ' + '.join([f'ic_{j}' for j in range(ndim)])
+        if mode == 'grid-constant' or (order > 1 and mode == 'constant'):
+            _cond = ' || '.join([f'(ic_{j} < 0)' for j in range(ndim)])
+            ops.append(f'''
+            if ({_cond}) {{
+                out += {cval} * ({internal_dtype})({_weight});
+            }} else {{
+                {internal_dtype} val = ({internal_dtype})x[{_coord_idx}];
+                out += val * ({internal_dtype})({_weight});
+            }}''')
+        else:
+            ops.append(f'''
+            {internal_dtype} val = ({internal_dtype})x[{_coord_idx}];
+            out += val * ({internal_dtype})({_weight});''')
+
+        ops.append('}' * ndim)
+
+    if mode == 'constant':
+        ops.append('}')
+
+    if integer_output:
+        ops.append('y = (Y)rint((double)out);')
+    else:
+        ops.append('y = (Y)out;')
+    operation = '\n'.join(ops)
+
+    mode_str = mode.replace('-', '_')  # avoid hyphen in kernel name
+    name = 'cupyx_scipy_ndimage_interpolate_{}_order{}_{}_{}d_y{}'.format(
+        name, order, mode_str, ndim, '_'.join([f'{j}' for j in yshape]),
+    )
+    if uint_t == 'size_t':
+        name += '_i64'
+    return operation, name
+
+
+@cupy._util.memoize(for_each_device=True)  # cached per arguments and device
+def _get_map_kernel(ndim, large_int, yshape, mode, cval=0.0, order=1,
+                    integer_output=False, nprepad=0):
+    in_params = 'raw X x, raw W coords'  # x: input, coords: sample points
+    out_params = 'Y y'  # y receives the interpolated value
+    operation, name = _generate_interp_custom(
+        coord_func=_get_coord_map,
+        ndim=ndim,
+        large_int=large_int,
+        yshape=yshape,
+        mode=mode,
+        cval=cval,
+        order=order,
+        name='map',  # becomes part of the generated kernel name
+        integer_output=integer_output,
+        nprepad=nprepad,
+        omit_in_coord=True,  # input image coordinates are not needed
+    )
+    return cupy.ElementwiseKernel(in_params, out_params, operation, name,
+                                  preamble=math_constants_preamble)
+
+
+@cupy._util.memoize(for_each_device=True)  # cached per arguments and device
+def _get_shift_kernel(ndim, large_int, yshape, mode, cval=0.0, order=1,
+                      integer_output=False, nprepad=0):
+    in_params = 'raw X x, raw W shift'
+    out_params = 'Y y'  # y receives the interpolated value
+    operation, name = _generate_interp_custom(
+        coord_func=_get_coord_shift,
+        ndim=ndim,
+        large_int=large_int,
+        yshape=yshape,
+        mode=mode,
+        cval=cval,
+        order=order,
+        name='shift',  # becomes part of the generated kernel name
+        integer_output=integer_output,
+        nprepad=nprepad,
+    )
+    return cupy.ElementwiseKernel(in_params, out_params, operation, name,
+                                  preamble=math_constants_preamble)
+
+
+@cupy._util.memoize(for_each_device=True)  # cached per arguments and device
+def _get_zoom_shift_kernel(ndim, large_int, yshape, mode, cval=0.0, order=1,
+                           integer_output=False, grid_mode=False, nprepad=0):
+    in_params = 'raw X x, raw W shift, raw W zoom'  # input, shift, zoom
+    out_params = 'Y y'  # y receives the interpolated value
+    if grid_mode:  # choose the coordinate-generating function
+        zoom_shift_func = _get_coord_zoom_and_shift_grid
+    else:
+        zoom_shift_func = _get_coord_zoom_and_shift
+    operation, name = _generate_interp_custom(
+        coord_func=zoom_shift_func,
+        ndim=ndim,
+        large_int=large_int,
+        yshape=yshape,
+        mode=mode,
+        cval=cval,
+        order=order,
+        name="zoom_shift_grid" if grid_mode else "zoom_shift",
+        integer_output=integer_output,
+        nprepad=nprepad,
+    )
+    return cupy.ElementwiseKernel(in_params, out_params, operation, name,
+                                  preamble=math_constants_preamble)
+
+
+@cupy._util.memoize(for_each_device=True)  # cached per arguments and device
+def _get_zoom_kernel(ndim, large_int, yshape, mode, cval=0.0, order=1,
+                     integer_output=False, grid_mode=False, nprepad=0):
+    in_params = 'raw X x, raw W zoom'
+    out_params = 'Y y'  # y receives the interpolated value
+    operation, name = _generate_interp_custom(
+        coord_func=_get_coord_zoom_grid if grid_mode else _get_coord_zoom,
+        ndim=ndim,
+        large_int=large_int,
+        yshape=yshape,
+        mode=mode,
+        cval=cval,
+        order=order,
+        name="zoom_grid" if grid_mode else "zoom",  # part of the kernel name
+        integer_output=integer_output,
+        nprepad=nprepad,
+    )
+    return cupy.ElementwiseKernel(in_params, out_params, operation, name,
+                                  preamble=math_constants_preamble)
+
+
+@cupy._util.memoize(for_each_device=True)  # cached per arguments and device
+def _get_affine_kernel(ndim, large_int, yshape, mode, cval=0.0, order=1,
+                       integer_output=False, nprepad=0):
+    in_params = 'raw X x, raw W mat'  # mat: [matrix | offset], (ndim, ndim+1)
+    out_params = 'Y y'  # y receives the interpolated value
+    operation, name = _generate_interp_custom(
+        coord_func=_get_coord_affine,
+        ndim=ndim,
+        large_int=large_int,
+        yshape=yshape,
+        mode=mode,
+        cval=cval,
+        order=order,
+        name='affine',  # becomes part of the generated kernel name
+        integer_output=integer_output,
+        nprepad=nprepad,
+    )
+    return cupy.ElementwiseKernel(in_params, out_params, operation, name,
+                                  preamble=math_constants_preamble)
diff --git a/python/cucim/src/cucim/skimage/_vendored/_ndimage_interpolation.py b/python/cucim/src/cucim/skimage/_vendored/_ndimage_interpolation.py
new file mode 100644
index 000000000..18275213b
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/_vendored/_ndimage_interpolation.py
@@ -0,0 +1,757 @@
+import cmath
+import math
+import warnings
+
+import cupy
+import numpy
+from cupy import _core
+from cupy.cuda import runtime
+
+from cucim.skimage._vendored import _ndimage_interp_kernels as _interp_kernels
+from cucim.skimage._vendored import \
+    _ndimage_spline_prefilter_core as _spline_prefilter_core
+from cucim.skimage._vendored import _ndimage_util as _util
+from cucim.skimage._vendored._internal import _normalize_axis_index, prod
+
+
+def _check_parameter(func_name, order, mode):
+    if order is None:  # None is accepted here but triggers a warning
+        warnings.warn(f'Currently the default order of {func_name} is 1. In a '
+                      'future release this may change to 3 to match '
+                      'scipy.ndimage ')
+    elif order < 0 or 5 < order:  # only spline orders 0 through 5 pass
+        raise ValueError('spline order is not supported')
+    # validate the boundary mode name (includes the 'opencv' extensions)
+    if mode not in ('constant', 'grid-constant', 'nearest', 'mirror',
+                    'reflect', 'grid-mirror', 'wrap', 'grid-wrap', 'opencv',
+                    '_opencv_edge'):
+        raise ValueError('boundary mode ({}) is not supported'.format(mode))
+
+
+def _get_spline_output(input, output):
+    """Return ``(temp, float_dtype, output_dtype)`` for spline filtering.
+
+    ``temp`` is a C-contiguous workspace holding the values of ``input``.
+    Unlike SciPy, the internal float dtype is not forced to double precision.
+    """
+    complex_data = input.dtype.kind == 'c'
+    if complex_data:
+        min_float_dtype = cupy.complex64
+    else:
+        min_float_dtype = cupy.float32
+    if isinstance(output, cupy.ndarray):
+        if complex_data and output.dtype.kind != 'c':
+            raise ValueError(
+                'output must have complex dtype for complex inputs'
+            )
+        float_dtype = cupy.promote_types(output.dtype, min_float_dtype)
+        output_dtype = output.dtype
+    else:
+        if output is None:  # default: keep the dtype of the input
+            output = output_dtype = input.dtype
+        else:
+            output_dtype = cupy.dtype(output)
+        float_dtype = cupy.promote_types(output, min_float_dtype)
+
+    if (isinstance(output, cupy.ndarray)
+            and output.dtype == float_dtype == output_dtype
+            and output.flags.c_contiguous):
+        if output is not input:
+            _core.elementwise_copy(input, output)
+        temp = output  # the caller's array doubles as the workspace
+    else:
+        temp = input.astype(float_dtype, copy=False)
+        temp = cupy.ascontiguousarray(temp)
+        if cupy.shares_memory(temp, input, 'MAY_SHARE_BOUNDS'):
+            temp = temp.copy()  # do not let the workspace alias the input
+    return temp, float_dtype, output_dtype
+
+
+def spline_filter1d(input, order=3, axis=-1, output=cupy.float64,
+                    mode='mirror'):
+    """
+    Calculate a 1-D spline filter along the given axis.
+
+    The lines of the array along the given axis are filtered by a
+    spline filter. For orders 0 and 1 the input is copied unfiltered.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        order (int): The order of the spline interpolation, default is 3. Must
+            be in the range 0-5 (a ``RuntimeError`` is raised otherwise).
+        axis (int): The axis along which the spline filter is applied. Default
+            is the last axis.
+        output (cupy.ndarray or dtype, optional): The array in which to place
+            the output, or the dtype of the returned array. Default is
+            ``cupy.float64``.
+        mode (str): Points outside the boundaries of the input are filled
+            according to the given mode (``'constant'``, ``'nearest'``,
+            ``'mirror'``, ``'reflect'``, ``'wrap'``, ``'grid-mirror'``,
+            ``'grid-wrap'``, ``'grid-constant'`` or ``'opencv'``).
+
+    Returns:
+        cupy.ndarray: The result of prefiltering the input.
+
+    .. seealso:: :func:`scipy.ndimage.spline_filter1d`
+    """
+    if order < 0 or order > 5:
+        raise RuntimeError('spline order not supported')
+    x = input
+    ndim = x.ndim
+    axis = _normalize_axis_index(axis, ndim)
+
+    # order 0, 1 don't require reshaping as no CUDA kernel will be called
+    # scalar or size 1 arrays also don't need to be filtered
+    run_kernel = not (order < 2 or x.ndim == 0 or x.shape[axis] == 1)
+    if not run_kernel:
+        output = _util._get_output(output, input)
+        _core.elementwise_copy(x, output)
+        return output
+
+    temp, data_dtype, output_dtype = _get_spline_output(x, output)
+    data_type = cupy._core._scalar.get_typename(temp.dtype)
+    pole_type = cupy._core._scalar.get_typename(temp.real.dtype)
+
+    index_type = _util._get_inttype(input)
+    index_dtype = cupy.int32 if index_type == 'int' else cupy.int64
+
+    n_samples = x.shape[axis]  # length of each line along ``axis``
+    n_signals = x.size // n_samples  # number of lines to filter
+    info = cupy.array((n_signals, n_samples) + x.shape, dtype=index_dtype)
+
+    # empirical choice of block size that seemed to work well
+    block_size = max(2 ** math.ceil(numpy.log2(n_samples / 32)), 8)
+    kern = _spline_prefilter_core.get_raw_spline1d_kernel(
+        axis,
+        ndim,
+        mode,
+        order=order,
+        index_type=index_type,
+        data_type=data_type,
+        pole_type=pole_type,
+        block_size=block_size,
+    )
+
+    # Due to recursive nature, a given line of data must be processed by a
+    # single thread. n_signals lines will be processed in total.
+    block = (block_size,)
+    grid = ((n_signals + block[0] - 1) // block[0],)
+
+    # apply prefilter gain
+    poles = _spline_prefilter_core.get_poles(order=order)
+    temp *= _spline_prefilter_core.get_gain(poles)
+
+    # apply causal + anti-causal IIR spline filters
+    kern(grid, block, (temp, info))
+
+    if isinstance(output, cupy.ndarray) and temp is not output:
+        # copy kernel output into the user-provided output array
+        _core.elementwise_copy(temp, output)
+        return output
+    return temp.astype(output_dtype, copy=False)
+
+
+def spline_filter(input, order=3, output=cupy.float64, mode='mirror'):
+    """Multidimensional spline filter.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        order (int): The order of the spline interpolation, default is 3. Must
+            be in the range 2-5 (a ``RuntimeError`` is raised otherwise).
+        output (cupy.ndarray or dtype, optional): The array in which to place
+            the output, or the dtype of the returned array. Default is
+            ``cupy.float64``.
+        mode (str): Points outside the boundaries of the input are filled
+            according to the given mode (``'constant'``, ``'nearest'``,
+            ``'mirror'``, ``'reflect'``, ``'wrap'``, ``'grid-mirror'``,
+            ``'grid-wrap'``, ``'grid-constant'`` or ``'opencv'``).
+
+    Returns:
+        cupy.ndarray: The result of prefiltering the input.
+
+    .. seealso:: :func:`scipy.ndimage.spline_filter`
+    """
+    if order < 2 or order > 5:
+        raise RuntimeError('spline order not supported')
+
+    x = input
+    temp, data_dtype, output_dtype = _get_spline_output(x, output)
+    if order not in [0, 1] and input.ndim > 0:  # order >= 2 holds here
+        for axis in range(x.ndim):
+            spline_filter1d(x, order, axis, output=temp, mode=mode)
+            x = temp  # later axes filter the partially filtered data
+    if isinstance(output, cupy.ndarray):
+        _core.elementwise_copy(temp, output)
+    else:
+        output = temp
+    if output.dtype != output_dtype:
+        output = output.astype(output_dtype)
+    return output
+
+
+def _check_coordinates(coordinates, order, allow_float32=True):
+    if coordinates.dtype.kind == 'f':  # floats: promote to a minimum width
+        if allow_float32:
+            coord_dtype = cupy.promote_types(coordinates.dtype, cupy.float32)
+        else:
+            coord_dtype = cupy.promote_types(coordinates.dtype, cupy.float64)
+        coordinates = coordinates.astype(coord_dtype, copy=False)
+    elif coordinates.dtype.kind in 'iu':  # integers stay as-is for order <= 1
+        if order > 1:
+            # order > 1 (spline) kernels require floating-point coordinates
+            if allow_float32:
+                coord_dtype = cupy.promote_types(
+                    coordinates.dtype, cupy.float32
+                )
+            else:
+                coord_dtype = cupy.promote_types(
+                    coordinates.dtype, cupy.float64
+                )
+            coordinates = coordinates.astype(coord_dtype)
+    else:  # any other dtype kind (e.g. bool, complex) is rejected
+        raise ValueError('coordinates should have floating point dtype')
+    if not coordinates.flags.c_contiguous:
+        coordinates = cupy.ascontiguousarray(coordinates)
+    return coordinates  # C-contiguous on return
+
+
+def _prepad_for_spline_filter(input, mode, cval):
+    if mode in ['nearest', 'grid-constant']:
+        # these modes need padding to get accurate boundary values
+        npad = 12  # empirical factor chosen by SciPy
+        if mode == 'grid-constant':
+            kwargs = dict(mode='constant', constant_values=cval)
+        else:
+            kwargs = dict(mode='edge')  # 'nearest': replicate edge values
+        padded = cupy.pad(input, npad, **kwargs)  # npad on every axis/side
+    else:
+        npad = 0  # all other modes are passed through unpadded
+        padded = input
+    return padded, npad
+
+
+def _filter_input(image, prefilter, mode, cval, order):
+    """Perform spline prefiltering when needed.
+
+    Spline orders > 1 need a prefiltering stage to preserve resolution.
+
+    For boundary modes without analytical spline boundary conditions, some
+    prepadding of the input with cupy.pad is used to maintain accuracy.
+    Returns a C-contiguous array and ``npad``, the integer amount of padding
+    added at each edge of the array (0 if the input was not padded).
+    """
+    if not prefilter or order < 2:
+        return (cupy.ascontiguousarray(image), 0)  # no filtering, no padding
+    padded, npad = _prepad_for_spline_filter(image, mode, cval)
+    float_dtype = cupy.promote_types(image.dtype, cupy.float32)
+    filtered = spline_filter(padded, order, output=float_dtype, mode=mode)
+    return cupy.ascontiguousarray(filtered), npad
+
+
+def map_coordinates(input, coordinates, output=None, order=3,
+                    mode='constant', cval=0.0, prefilter=True):
+    """Map the input array to new coordinates by interpolation.
+
+    The array of coordinates is used to find, for each point in the output, the
+    corresponding coordinates in the input. The value of the input at those
+    coordinates is determined by spline interpolation of the requested order.
+
+    The shape of the output is derived from that of the coordinate array by
+    dropping the first axis. The values of the array along the first axis are
+    the coordinates in the input array at which the output value is found.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        coordinates (array_like): The coordinates at which ``input`` is
+            evaluated.
+        output (cupy.ndarray or ~cupy.dtype): The array in which to place the
+            output, or the dtype of the returned array.
+        order (int): The order of the spline interpolation, default is 3. Must
+            be in the range 0-5.
+        mode (str): Points outside the boundaries of the input are filled
+            according to the given mode (``'constant'``, ``'nearest'``,
+            ``'mirror'``, ``'reflect'``, ``'wrap'``, ``'grid-mirror'``,
+            ``'grid-wrap'``, ``'grid-constant'`` or ``'opencv'``).
+        cval (scalar): Value used for points outside the boundaries of
+            the input if ``mode='constant'`` or ``mode='opencv'``. Default is
+            0.0
+        prefilter (bool): If True, the input is spline-prefiltered before
+            interpolation. Has no effect when ``order < 2``.
+
+    Returns:
+        cupy.ndarray:
+            The result of transforming the input. The shape of the output is
+            derived from that of ``coordinates`` by dropping the first axis.
+
+    .. seealso:: :func:`scipy.ndimage.map_coordinates`
+    """
+
+    _check_parameter('map_coordinates', order, mode)
+
+    if mode == 'opencv' or mode == '_opencv_edge':
+        input = cupy.pad(input, [(1, 1)] * input.ndim, 'constant',
+                         constant_values=cval)
+        coordinates = cupy.add(coordinates, 1)  # compensate the 1-pixel pad
+        mode = 'constant'  # the padded input is then sampled in this mode
+
+    ret = _util._get_output(output, input, coordinates.shape[1:])
+    integer_output = ret.dtype.kind in 'iu'
+    _util._check_cval(mode, cval, integer_output)
+
+    if input.dtype.kind in 'iu':
+        input = input.astype(cupy.float32)  # integer input becomes float32
+    coordinates = _check_coordinates(coordinates, order)
+    filtered, nprepad = _filter_input(input, prefilter, mode, cval, order)
+    large_int = max(prod(input.shape), coordinates.shape[0]) > 1 << 31
+    kern = _interp_kernels._get_map_kernel(
+        input.ndim, large_int, yshape=coordinates.shape, mode=mode, cval=cval,
+        order=order, integer_output=integer_output, nprepad=nprepad)
+    kern(filtered, coordinates, ret)
+    return ret
+
+
+def affine_transform(input, matrix, offset=0.0, output_shape=None, output=None,
+                     order=3, mode='constant', cval=0.0, prefilter=True, *,
+                     texture_memory=False):
+    """Apply an affine transformation.
+
+    Given an output image pixel index vector ``o``, the pixel value is
+    determined from the input image at position
+    ``cupy.dot(matrix, o) + offset``.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        matrix (cupy.ndarray): The inverse coordinate transformation matrix,
+            mapping output coordinates to input coordinates. If ``ndim`` is the
+            number of dimensions of ``input``, the given matrix must have one
+            of the following shapes:
+
+                - ``(ndim, ndim)``: the linear transformation matrix for each
+                  output coordinate.
+                - ``(ndim,)``: assume that the 2D transformation matrix is
+                  diagonal, with the diagonal specified by the given value.
+                - ``(ndim + 1, ndim + 1)``: assume that the transformation is
+                  specified using homogeneous coordinates. In this case, any
+                  value passed to ``offset`` is ignored.
+                - ``(ndim, ndim + 1)``: as above, but the bottom row of a
+                  homogeneous transformation matrix is always
+                  ``[0, 0, ..., 1]``, and may be omitted.
+
+        offset (float or sequence): The offset into the array where the
+            transform is applied. If a float, ``offset`` is the same for each
+            axis. If a sequence, ``offset`` should contain one value for each
+            axis.
+        output_shape (tuple of ints): Shape tuple.
+        output (cupy.ndarray or ~cupy.dtype): The array in which to place the
+            output, or the dtype of the returned array.
+        order (int): The order of the spline interpolation, default is 3. Must
+            be in the range 0-5.
+        mode (str): Points outside the boundaries of the input are filled
+            according to the given mode (``'constant'``, ``'nearest'``,
+            ``'mirror'``, ``'reflect'``, ``'wrap'``, ``'grid-mirror'``,
+            ``'grid-wrap'``, ``'grid-constant'`` or ``'opencv'``).
+        cval (scalar): Value used for points outside the boundaries of
+            the input if ``mode='constant'`` or ``mode='opencv'``. Default is
+            0.0
+        prefilter (bool): If True, the input is spline-prefiltered before
+            interpolation. Has no effect when ``order < 2``.
+        texture_memory (bool): If True, uses GPU texture memory. Supports only:
+
+            - 2D and 3D float32 arrays as input
+            - ``(ndim + 1, ndim + 1)`` homogeneous float32 transformation
+                matrix
+            - ``mode='constant'`` and ``mode='nearest'``
+            - ``order=0`` (nearest neighbor) and ``order=1`` (linear
+                interpolation)
+            - NVIDIA CUDA GPUs
+
+    Returns:
+        cupy.ndarray:
+            The transformed input. Without ``texture_memory`` the result
+            array is returned even when ``output`` is given as an array.
+
+    .. seealso:: :func:`scipy.ndimage.affine_transform`
+    """
+
+    if texture_memory:
+        # _texture only available in CuPy 10.x so delay the import
+        # We do not use this texture-based implementation in cuCIM.
+        from cucim.skimage._vendored import _texture
+
+        if runtime.is_hip:
+            raise RuntimeError(
+                'HIP currently does not support texture acceleration')
+        tm_interp = 'linear' if order > 0 else 'nearest'
+        return _texture.affine_transformation(data=input,
+                                              transformation_matrix=matrix,
+                                              output_shape=output_shape,
+                                              output=output,
+                                              interpolation=tm_interp,
+                                              mode=mode,
+                                              border_value=cval)
+
+    _check_parameter('affine_transform', order, mode)
+
+    offset = _util._fix_sequence_arg(offset, input.ndim, 'offset', float)
+
+    if matrix.ndim not in [1, 2] or matrix.shape[0] < 1:
+        raise RuntimeError('no proper affine matrix provided')
+    if matrix.ndim == 2:
+        if matrix.shape[0] == matrix.shape[1] - 1:
+            offset = matrix[:, -1]  # last column holds the offset
+            matrix = matrix[:, :-1]
+        elif matrix.shape[0] == input.ndim + 1:
+            offset = matrix[:-1, -1]  # homogeneous: bottom row is dropped
+            matrix = matrix[:-1, :-1]
+        if matrix.shape != (input.ndim, input.ndim):
+            raise RuntimeError('improper affine shape')
+
+    if mode == 'opencv':
+        m = cupy.zeros((input.ndim + 1, input.ndim + 1))
+        m[:-1, :-1] = matrix
+        m[:-1, -1] = offset
+        m[-1, -1] = 1
+        m = cupy.linalg.inv(m)
+        m[:2] = cupy.roll(m[:2], 1, axis=0)  # swap the first two rows
+        m[:2, :2] = cupy.roll(m[:2, :2], 1, axis=1)  # swap their 2x2 columns
+        matrix = m[:-1, :-1]
+        offset = m[:-1, -1]
+
+    if output_shape is None:
+        output_shape = input.shape
+
+    if mode == 'opencv' or mode == '_opencv_edge':
+        if matrix.ndim == 1:
+            matrix = cupy.diag(matrix)
+        coordinates = cupy.indices(output_shape, dtype=cupy.float64)
+        coordinates = cupy.dot(matrix, coordinates.reshape((input.ndim, -1)))
+        coordinates += cupy.expand_dims(cupy.asarray(offset), -1)
+        ret = _util._get_output(output, input, shape=output_shape)
+        ret[:] = map_coordinates(input, coordinates, ret.dtype, order, mode,
+                                 cval, prefilter).reshape(output_shape)
+        return ret
+
+    matrix = matrix.astype(cupy.float64, copy=False)
+    ndim = input.ndim
+    output = _util._get_output(output, input, shape=output_shape)
+    if input.dtype.kind in 'iu':
+        input = input.astype(cupy.float32)
+    filtered, nprepad = _filter_input(input, prefilter, mode, cval, order)
+
+    integer_output = output.dtype.kind in 'iu'
+    _util._check_cval(mode, cval, integer_output)
+    large_int = max(prod(input.shape), prod(output_shape)) > 1 << 31
+    if matrix.ndim == 1:  # diagonal matrix: use the zoom + shift kernel
+        offset = cupy.asarray(offset, dtype=cupy.float64)
+        offset = -offset / matrix
+        kern = _interp_kernels._get_zoom_shift_kernel(
+            ndim, large_int, output_shape, mode, cval=cval, order=order,
+            integer_output=integer_output, nprepad=nprepad)
+        kern(filtered, offset, matrix, output)
+    else:
+        kern = _interp_kernels._get_affine_kernel(
+            ndim, large_int, output_shape, mode, cval=cval, order=order,
+            integer_output=integer_output, nprepad=nprepad)
+        m = cupy.zeros((ndim, ndim + 1), dtype=cupy.float64)
+        m[:, :-1] = matrix
+        m[:, -1] = cupy.asarray(offset, dtype=cupy.float64)
+        kern(filtered, m, output)
+    return output
+
+
+def _minmax(coor, minc, maxc):  # widen the 2-D bounds to include ``coor``
+    if coor[0] < minc[0]:
+        minc[0] = coor[0]
+    elif coor[0] > maxc[0]:
+        maxc[0] = coor[0]
+    if coor[1] < minc[1]:
+        minc[1] = coor[1]
+    elif coor[1] > maxc[1]:
+        maxc[1] = coor[1]
+    return minc, maxc  # the same objects, updated in place
+
+
+def rotate(input, angle, axes=(1, 0), reshape=True, output=None, order=3,
+           mode='constant', cval=0.0, prefilter=True):
+    """Rotate an array.
+
+    The array is rotated in the plane defined by the two axes given by the
+    ``axes`` parameter using spline interpolation of the requested order.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        angle (float): The rotation angle in degrees.
+        axes (tuple of 2 ints): The two axes that define the plane of rotation.
+            Default is the first two axes.
+        reshape (bool): If ``reshape`` is True, the output shape is adapted so
+            that the input array is contained completely in the output. Default
+            is True.
+        output (cupy.ndarray or ~cupy.dtype): The array in which to place the
+            output, or the dtype of the returned array.
+        order (int): The order of the spline interpolation, default is 3. Must
+            be in the range 0-5.
+        mode (str): Points outside the boundaries of the input are filled
+            according to the given mode (``'constant'``, ``'nearest'``,
+            ``'mirror'``, ``'reflect'``, ``'wrap'``, ``'grid-mirror'``,
+            ``'grid-wrap'``, ``'grid-constant'`` or ``'opencv'``).
+        cval (scalar): Value used for points outside the boundaries of
+            the input if ``mode='constant'`` or ``mode='opencv'``. Default is
+            0.0
+        prefilter (bool): Forwarded to :func:`affine_transform`, which
+            receives it as its ``prefilter`` argument.
+
+    Returns:
+        cupy.ndarray:
+            The rotated input.
+
+    .. seealso:: :func:`scipy.ndimage.rotate`
+    """
+
+    _check_parameter('rotate', order, mode)
+
+    if mode == 'opencv':
+        mode = '_opencv_edge'
+
+    input_arr = input
+    axes = list(axes)
+    if axes[0] < 0:
+        axes[0] += input_arr.ndim
+    if axes[1] < 0:
+        axes[1] += input_arr.ndim
+    if axes[0] > axes[1]:
+        axes = [axes[1], axes[0]]
+    if axes[0] < 0 or input_arr.ndim <= axes[1]:
+        raise ValueError('invalid rotation plane specified')
+
+    ndim = input_arr.ndim
+    rad = math.radians(angle)
+    sincos = cmath.rect(1, rad)
+    cos, sin = sincos.real, sincos.imag
+
+    # determine offsets and output shape as in scipy.ndimage.rotate
+    rot_matrix = numpy.array([[cos, sin],
+                              [-sin, cos]])
+
+    img_shape = numpy.asarray(input_arr.shape)
+    in_plane_shape = img_shape[axes]
+    if reshape:
+        # Compute transformed input bounds
+        iy, ix = in_plane_shape
+        out_bounds = rot_matrix @ [[0, 0, iy, iy], [0, ix, 0, ix]]
+        # Plane shape; numpy.ptp() since ndarray.ptp() is gone in NumPy 2.0
+        extent = numpy.ptp(out_bounds, axis=1)
+        out_plane_shape = (extent + 0.5).astype(cupy.int64)
+    else:
+        out_plane_shape = img_shape[axes]
+
+    out_center = rot_matrix @ ((out_plane_shape - 1) / 2)
+    in_center = (in_plane_shape - 1) / 2
+
+    output_shape = img_shape
+    output_shape[axes] = out_plane_shape
+    output_shape = tuple(output_shape)
+
+    matrix = numpy.identity(ndim)
+    matrix[axes[0], axes[0]] = cos
+    matrix[axes[0], axes[1]] = sin
+    matrix[axes[1], axes[0]] = -sin
+    matrix[axes[1], axes[1]] = cos
+
+    offset = numpy.zeros(ndim, dtype=cupy.float64)
+    offset[axes] = in_center - out_center
+
+    matrix = cupy.asarray(matrix)
+    offset = cupy.asarray(offset)
+
+    return affine_transform(input, matrix, offset, output_shape, output, order,
+                            mode, cval, prefilter)
+
+
+def shift(input, shift, output=None, order=3, mode='constant', cval=0.0,
+          prefilter=True):
+    """Shift an array.
+
+    The array is shifted using spline interpolation of the requested order.
+    Points outside the boundaries of the input are filled according to the
+    given mode.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        shift (float or sequence): The shift along the axes. If a float,
+            ``shift`` is the same for each axis. If a sequence, ``shift``
+            should contain one value for each axis.
+        output (cupy.ndarray or ~cupy.dtype): The array in which to place the
+            output, or the dtype of the returned array.
+        order (int): The order of the spline interpolation, default is 3. Must
+            be in the range 0-5.
+        mode (str): Points outside the boundaries of the input are filled
+            according to the given mode (``'constant'``, ``'nearest'``,
+            ``'mirror'``, ``'reflect'``, ``'wrap'``, ``'grid-mirror'``,
+            ``'grid-wrap'``, ``'grid-constant'`` or ``'opencv'``).
+        cval (scalar): Value used for points outside the boundaries of
+            the input if ``mode='constant'`` or ``mode='opencv'``. Default is
+            0.0
+        prefilter (bool): Forwarded to ``affine_transform`` (``'opencv'``
+            mode) or to the ``_filter_input`` step run before interpolation.
+
+    Returns:
+        cupy.ndarray or None:
+            The shifted input.
+
+    .. seealso:: :func:`scipy.ndimage.shift`
+    """
+
+    _check_parameter('shift', order, mode)
+
+    shift = _util._fix_sequence_arg(shift, input.ndim, 'shift', float)
+    # 'opencv' mode is delegated to affine_transform with offset = -shift.
+    if mode == 'opencv':
+        mode = '_opencv_edge'  # internal name handed to affine_transform
+
+        output = affine_transform(
+            input,
+            cupy.ones(input.ndim, input.dtype),
+            cupy.negative(cupy.asarray(shift)),
+            None,
+            output,
+            order,
+            mode,
+            cval,
+            prefilter,
+        )
+    else:
+        output = _util._get_output(output, input)
+        if input.dtype.kind in 'iu':  # integer input is cast to float32
+            input = input.astype(cupy.float32)
+        filtered, nprepad = _filter_input(input, prefilter, mode, cval, order)
+        integer_output = output.dtype.kind in 'iu'
+        _util._check_cval(mode, cval, integer_output)
+        large_int = prod(input.shape) > 1 << 31  # more than 2**31 elements
+        kern = _interp_kernels._get_shift_kernel(
+            input.ndim, large_int, input.shape, mode, cval=cval, order=order,
+            integer_output=integer_output, nprepad=nprepad)
+        shift = cupy.asarray(shift, dtype=cupy.float64, order='C')
+        if shift.ndim != 1:
+            raise ValueError('shift must be 1d')
+        if shift.size != filtered.ndim:
+            raise ValueError('len(shift) must equal input.ndim')
+        kern(filtered, shift, output)  # kernel fills ``output``
+    return output
+
+
+def zoom(input, zoom, output=None, order=3, mode='constant', cval=0.0,
+         prefilter=True, *, grid_mode=False):
+    """Zoom an array.
+
+    The array is zoomed using spline interpolation of the requested order.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        zoom (float or sequence): The zoom factor along the axes. If a float,
+            ``zoom`` is the same for each axis. If a sequence, ``zoom`` should
+            contain one value for each axis.
+        output (cupy.ndarray or ~cupy.dtype): The array in which to place the
+            output, or the dtype of the returned array.
+        order (int): The order of the spline interpolation, default is 3. Must
+            be in the range 0-5.
+        mode (str): Points outside the boundaries of the input are filled
+            according to the given mode (``'constant'``, ``'nearest'``,
+            ``'mirror'``, ``'reflect'``, ``'wrap'``, ``'grid-mirror'``,
+            ``'grid-wrap'``, ``'grid-constant'`` or ``'opencv'``).
+        cval (scalar): Value used for points outside the boundaries of
+            the input if ``mode='constant'`` or ``mode='opencv'``. Default is
+            0.0
+        prefilter (bool): Forwarded to ``affine_transform`` (``'opencv'``
+            mode) or to the ``_filter_input`` step run before interpolation.
+        grid_mode (bool, optional): If False, the distance from the pixel
+            centers is zoomed. Otherwise, the distance including the full pixel
+            extent is used. For example, a 1d signal of length 5 is considered
+            to have length 4 when ``grid_mode`` is False, but length 5 when
+            ``grid_mode`` is True. See the following visual illustration:
+
+            .. code-block:: text
+
+                    | pixel 1 | pixel 2 | pixel 3 | pixel 4 | pixel 5 |
+                         |<-------------------------------------->|
+                                            vs.
+                    |<----------------------------------------------->|
+
+            The starting point of the arrow in the diagram above corresponds to
+            coordinate location 0 in each mode.
+
+    Returns:
+        cupy.ndarray or None:
+            The zoomed input.
+
+    .. seealso:: :func:`scipy.ndimage.zoom`
+    """
+
+    _check_parameter('zoom', order, mode)
+
+    zoom = _util._fix_sequence_arg(zoom, input.ndim, 'zoom', float)
+    # Each output extent is the rounded product of input extent and factor.
+    output_shape = []
+    for s, z in zip(input.shape, zoom):
+        output_shape.append(int(round(s * z)))
+    output_shape = tuple(output_shape)
+    # 'opencv' mode: factor = in/out size, offset = (factor - 1) / 2.
+    if mode == 'opencv':
+        zoom = []
+        offset = []
+        for in_size, out_size in zip(input.shape, output_shape):
+            if out_size > 1:
+                zoom.append(float(in_size) / out_size)
+                offset.append((zoom[-1] - 1) / 2.0)
+            else:
+                zoom.append(0)  # output extent <= 1
+                offset.append(0)
+        mode = 'nearest'  # edge handling used for 'opencv'
+
+        output = affine_transform(
+            input,
+            cupy.asarray(zoom),
+            offset,
+            output_shape,
+            output,
+            order,
+            mode,
+            cval,
+            prefilter,
+        )
+    else:
+        if grid_mode:
+
+            # warn about modes that may have surprising behavior
+            suggest_mode = None
+            if mode == 'constant':
+                suggest_mode = 'grid-constant'
+            elif mode == 'wrap':
+                suggest_mode = 'grid-wrap'
+            if suggest_mode is not None:
+                warnings.warn(
+                    f'It is recommended to use mode = {suggest_mode} instead '
+                    f'of {mode} when grid_mode is True.')
+        # Recompute the factors from the rounded output shape.
+        zoom = []
+        for in_size, out_size in zip(input.shape, output_shape):
+            if grid_mode and out_size > 0:  # ratio of full pixel extents
+                zoom.append(in_size / out_size)
+            elif out_size > 1:  # ratio of center-to-center spans
+                zoom.append((in_size - 1) / (out_size - 1))
+            else:
+                zoom.append(0)
+
+        output = _util._get_output(output, input, shape=output_shape)
+        if input.dtype.kind in 'iu':  # integer input is cast to float32
+            input = input.astype(cupy.float32)
+        filtered, nprepad = _filter_input(input, prefilter, mode, cval, order)
+        integer_output = output.dtype.kind in 'iu'
+        _util._check_cval(mode, cval, integer_output)
+        large_int = max(prod(input.shape), prod(output_shape)) > 1 << 31
+        kern = _interp_kernels._get_zoom_kernel(
+            input.ndim, large_int, output_shape, mode, order=order,
+            integer_output=integer_output, grid_mode=grid_mode,
+            nprepad=nprepad)
+        zoom = cupy.asarray(zoom, dtype=cupy.float64)
+        kern(filtered, zoom, output)  # kernel fills ``output``
+    return output
diff --git a/python/cucim/src/cucim/skimage/_vendored/_ndimage_morphology.py b/python/cucim/src/cucim/skimage/_vendored/_ndimage_morphology.py
new file mode 100644
index 000000000..5c6462cff
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/_vendored/_ndimage_morphology.py
@@ -0,0 +1,1016 @@
+import operator
+import warnings
+
+import cupy
+import numpy
+from cupy import _core
+
+from cucim.skimage._vendored import _ndimage_filters as _filters
+from cucim.skimage._vendored import _ndimage_filters_core as _filters_core
+from cucim.skimage._vendored import _ndimage_util as _util
+
+
+@cupy.memoize(for_each_device=True)  # cache kernels per args and device
+def _get_binary_erosion_kernel(
+    w_shape, int_type, offsets, center_is_true, border_value, invert, masked,
+    all_weights_nonzero
+):
+    if invert:  # swap the true/false encoding (used for dilation)
+        border_value = int(not border_value)
+        true_val = 0
+        false_val = 1
+    else:
+        true_val = 1
+        false_val = 0
+    # ``pre`` sets the default result and may return early for the element.
+    if masked:
+        pre = """
+            bool mv = (bool)mask[i];
+            bool _in = (bool)x[i];
+            if (!mv) {{
+                y = cast<Y>(_in);
+                return;
+            }} else if ({center_is_true} && _in == {false_val}) {{
+                y = cast<Y>(_in);
+                return;
+            }}""".format(center_is_true=int(center_is_true),
+                         false_val=false_val)
+    else:
+        pre = """
+            bool _in = (bool)x[i];
+            if ({center_is_true} && _in == {false_val}) {{
+                y = cast<Y>(_in);
+                return;
+            }}""".format(center_is_true=int(center_is_true),
+                         false_val=false_val)
+    pre = pre + """
+            y = cast<Y>({true_val});""".format(true_val=true_val)
+
+    # {{{{ required because format is called again within _generate_nd_kernel
+    found = """
+        if ({{cond}}) {{{{
+            if (!{border_value}) {{{{
+                y = cast<Y>({false_val});
+                return;
+            }}}}
+        }}}} else {{{{
+            bool nn = {{value}} ? {true_val} : {false_val};
+            if (!nn) {{{{
+                y = cast<Y>({false_val});
+                return;
+            }}}}
+        }}}}""".format(true_val=int(true_val),
+                       false_val=int(false_val),
+                       border_value=int(border_value),)
+    # Inverted kernels get their own name.
+    name = 'binary_erosion'
+    if false_val:
+        name += '_invert'
+    return _filters_core._generate_nd_kernel(
+        name,
+        pre,
+        found,
+        '',
+        'constant', w_shape, int_type, offsets, 0, ctype='Y', has_weights=True,
+        has_structure=False, has_mask=masked, binary_morphology=True,
+        all_weights_nonzero=all_weights_nonzero)
+
+
+def _center_is_true(structure, origin):  # is structure[center + origin] set?
+    coor = tuple([oo + ss // 2 for ss, oo in zip(structure.shape, origin)])
+    return bool(structure[coor])  # device synchronization
+
+
+def iterate_structure(structure, iterations, origin=None):
+    """Iterate a structure by dilating it with itself.
+
+    Args:
+        structure(array_like): Structuring element (an array of bools,
+            for example), to be dilated with itself.
+        iterations(int): The number of dilations performed on the structure
+            with itself.
+        origin(int or tuple of int, optional): If origin is None, only the
+            iterated structure is returned. If not, a tuple of the iterated
+            structure and the modified origin is returned.
+
+    Returns:
+        cupy.ndarray: ``structure`` dilated (``iterations`` - 1) times with
+        itself; ``(result, origin)`` is returned when ``origin`` is given.
+
+    .. seealso:: :func:`scipy.ndimage.iterate_structure`
+    """
+    if iterations < 2:  # NOTE: returns a bare copy even if origin is given
+        return structure.copy()
+    ni = iterations - 1  # number of dilations to apply
+    shape = [ii + ni * (ii - 1) for ii in structure.shape]  # grown extents
+    pos = [ni * (structure.shape[ii] // 2) for ii in range(len(shape))]
+    slc = tuple(
+        slice(pos[ii], pos[ii] + structure.shape[ii], None)
+        for ii in range(len(shape))
+    )
+    out = cupy.zeros(shape, bool)
+    out[slc] = structure != 0  # paste the structure at offset ``pos``
+    out = binary_dilation(out, structure, iterations=ni)
+    if origin is None:
+        return out
+    else:
+        origin = _util._fix_sequence_arg(origin, structure.ndim, 'origin', int)
+        origin = [iterations * o for o in origin]  # scale by iterations
+        return out, origin
+
+
+def generate_binary_structure(rank, connectivity):
+    """Generate a binary structure for binary morphological operations.
+
+    Args:
+        rank(int): Number of dimensions of the array to which the structuring
+            element will be applied, as returned by ``np.ndim``.
+        connectivity(int): ``connectivity`` determines which elements of the
+            output array belong to the structure, i.e., are considered as
+            neighbors of the central element. Elements up to a squared distance
+            of ``connectivity`` from the center are considered neighbors.
+            ``connectivity`` may range from 1 (no diagonal elements are
+            neighbors) to ``rank`` (all elements are neighbors).
+
+    Returns:
+        cupy.ndarray: Structuring element which may be used for binary
+        morphological operations, with ``rank`` dimensions and all
+        dimensions equal to 3.
+
+    .. seealso:: :func:`scipy.ndimage.generate_binary_structure`
+    """
+    if connectivity < 1:  # clamp to the minimum of 1
+        connectivity = 1
+    if rank < 1:  # 0-d case: a single True element
+        return cupy.asarray(True, dtype=bool)
+    output = numpy.fabs(numpy.indices([3] * rank) - 1)  # |offset| per axis
+    output = numpy.add.reduce(output, 0)  # summed over the axes
+    output = output <= connectivity
+    return cupy.asarray(output)  # NumPy result converted to a CuPy array
+
+
+def _binary_erosion(input, structure, iterations, mask, output, border_value,
+                    origin, invert, brute_force=True):  # shared erosion core
+    try:
+        iterations = operator.index(iterations)
+    except TypeError:
+        raise TypeError('iterations parameter should be an integer')
+
+    if input.dtype.kind == 'c':
+        raise TypeError('Complex type not supported')
+    if structure is None:
+        structure = generate_binary_structure(input.ndim, 1)
+        all_weights_nonzero = input.ndim == 1
+        center_is_true = True
+        default_structure = True
+    else:
+        structure = structure.astype(dtype=bool, copy=False)
+        # Whether a user-supplied structure is fully dense is determined
+        # further down via cupy.count_nonzero (after the 0-d special case).
+        default_structure = False
+    if structure.ndim != input.ndim:
+        raise RuntimeError('structure and input must have same dimensionality')
+    if not structure.flags.c_contiguous:
+        structure = cupy.ascontiguousarray(structure)
+    if structure.size < 1:
+        raise RuntimeError('structure must not be empty')
+
+    if mask is not None:
+        if mask.shape != input.shape:
+            raise RuntimeError('mask and input must have equal sizes')
+        if not mask.flags.c_contiguous:
+            mask = cupy.ascontiguousarray(mask)
+        masked = True
+    else:
+        masked = False
+    origin = _util._fix_sequence_arg(origin, input.ndim, 'origin', int)
+    # Any non-array ``output`` (None, a dtype, ...) is replaced by bool.
+    if isinstance(output, cupy.ndarray):
+        if output.dtype.kind == 'c':
+            raise TypeError('Complex output type not supported')
+    else:
+        output = bool
+    output = _util._get_output(output, input)
+    temp_needed = cupy.shares_memory(output, input, 'MAY_SHARE_BOUNDS')
+    if temp_needed:
+        # no in-place kernel: compute into a new array, copy back at the end
+        temp = output
+        output = _util._get_output(output.dtype, input)
+    if structure.ndim == 0:
+        # kernel doesn't handle ndim=0, so special case it here
+        if float(structure):
+            output[...] = cupy.asarray(input, dtype=bool)
+        else:
+            output[...] = ~cupy.asarray(input, dtype=bool)
+        return output  # NOTE(review): skips the temp copy-back below
+    origin = tuple(origin)
+    int_type = _util._get_inttype(input)
+    offsets = _filters_core._origins_to_offsets(origin, structure.shape)
+    if not default_structure:
+        # synchronize required to determine if all weights are non-zero
+        nnz = int(cupy.count_nonzero(structure))
+        all_weights_nonzero = nnz == structure.size
+        if all_weights_nonzero:
+            center_is_true = True
+        else:
+            center_is_true = _center_is_true(structure, origin)
+
+    erode_kernel = _get_binary_erosion_kernel(
+        structure.shape, int_type, offsets, center_is_true, border_value,
+        invert, masked, all_weights_nonzero,
+    )
+    # One pass, or repeated passes ping-ponging between two buffers.
+    if iterations == 1:
+        if masked:
+            output = erode_kernel(input, structure, mask, output)
+        else:
+            output = erode_kernel(input, structure, output)
+    elif center_is_true and not brute_force:
+        raise NotImplementedError(
+            'only brute_force iteration has been implemented'
+        )
+    else:
+        if cupy.shares_memory(output, input, 'MAY_SHARE_BOUNDS'):
+            raise ValueError('output and input may not overlap in memory')
+        tmp_in = cupy.empty_like(input, dtype=output.dtype)
+        tmp_out = output
+        if iterations >= 1 and not iterations & 1:  # even count: pre-swap
+            tmp_in, tmp_out = tmp_out, tmp_in
+        if masked:
+            tmp_out = erode_kernel(input, structure, mask, tmp_out)
+        else:
+            tmp_out = erode_kernel(input, structure, tmp_out)
+        # TODO: kernel doesn't return the changed status, so determine it here
+        changed = not (input == tmp_out).all()  # synchronize!
+        ii = 1
+        while ii < iterations or ((iterations < 1) and changed):
+            tmp_in, tmp_out = tmp_out, tmp_in
+            if masked:
+                tmp_out = erode_kernel(tmp_in, structure, mask, tmp_out)
+            else:
+                tmp_out = erode_kernel(tmp_in, structure, tmp_out)
+            changed = not (tmp_in == tmp_out).all()
+            ii += 1
+            if not changed and (not ii & 1):  # synchronize!
+                # can exit early if nothing changed
+                # (only do this after even number of tmp_in/out swaps)
+                break
+        output = tmp_out  # can be the scratch buffer after an early break
+    if temp_needed:
+        _core.elementwise_copy(output, temp)
+        output = temp
+    return output
+
+
+def binary_erosion(input, structure=None, iterations=1, mask=None, output=None,
+                   border_value=0, origin=0, brute_force=False):
+    """Multidimensional binary erosion with a given structuring element.
+
+    Binary erosion is a mathematical morphology operation used for image
+    processing.
+
+    Args:
+        input(cupy.ndarray): The input binary array_like to be eroded.
+            Non-zero (True) elements form the subset to be eroded.
+        structure(cupy.ndarray, optional): The structuring element used for the
+            erosion. Non-zero elements are considered True. If no structuring
+            element is provided an element is generated with a square
+            connectivity equal to one. (Default value = None).
+        iterations(int, optional): The erosion is repeated ``iterations`` times
+            (one, by default). If iterations is less than 1, the erosion is
+            repeated until the result does not change anymore. Only an integer
+            of iterations is accepted.
+        mask(cupy.ndarray or None, optional): If a mask is given, only those
+            elements with a True value at the corresponding mask element are
+            modified at each iteration. (Default value = None)
+        output(cupy.ndarray, optional): Array of the same shape as input, into
+            which the output is placed. By default, a new array is created.
+        border_value(int (cast to 0 or 1), optional): Value at the
+            border in the output array. (Default value = 0)
+        origin(int or tuple of ints, optional): Placement of the filter, by
+            default 0.
+        brute_force(boolean, optional): Memory condition: if True, all pixels
+            are candidates for erosion in every iteration. Tracking only the
+            pixels changed by the previous iteration (False) is not
+            implemented: for ``iterations != 1`` it raises
+            ``NotImplementedError`` when the structure center is True.
+
+    Returns:
+        cupy.ndarray: The result of binary erosion.
+
+    .. warning::
+
+        This function may synchronize the device.
+
+    .. seealso:: :func:`scipy.ndimage.binary_erosion`
+    """
+    return _binary_erosion(input, structure, iterations, mask, output,
+                           border_value, origin, 0, brute_force)
+
+
+def binary_dilation(input, structure=None, iterations=1, mask=None,
+                    output=None, border_value=0, origin=0, brute_force=False):
+    """Multidimensional binary dilation with the given structuring element.
+
+    Args:
+        input(cupy.ndarray): The input binary array_like to be dilated.
+            Non-zero (True) elements form the subset to be dilated.
+        structure(cupy.ndarray, optional): The structuring element used for the
+            dilation. Non-zero elements are considered True. If no structuring
+            element is provided an element is generated with a square
+            connectivity equal to one. (Default value = None).
+        iterations(int, optional): The dilation is repeated ``iterations``
+            times (one, by default). If iterations is less than 1, the dilation
+            is repeated until the result does not change anymore. Only an
+            integer of iterations is accepted.
+        mask(cupy.ndarray or None, optional): If a mask is given, only those
+            elements with a True value at the corresponding mask element are
+            modified at each iteration. (Default value = None)
+        output(cupy.ndarray, optional): Array of the same shape as input, into
+            which the output is placed. By default, a new array is created.
+        border_value(int (cast to 0 or 1), optional): Value at the
+            border in the output array. (Default value = 0)
+        origin(int or tuple of ints, optional): Placement of the filter, by
+            default 0.
+        brute_force(boolean, optional): Memory condition: if True, all pixels
+            are candidates for dilation in every iteration. Tracking only the
+            pixels changed by the previous iteration (False) is not
+            implemented: for ``iterations != 1`` it raises
+            ``NotImplementedError`` when the structure center is True.
+
+    Returns:
+        cupy.ndarray: The result of binary dilation.
+
+    .. warning::
+
+        This function may synchronize the device.
+
+    .. seealso:: :func:`scipy.ndimage.binary_dilation`
+    """
+    if structure is None:
+        structure = generate_binary_structure(input.ndim, 1)
+    origin = _util._fix_sequence_arg(origin, input.ndim, 'origin', int)
+    structure = structure[tuple([slice(None, None, -1)] * structure.ndim)]
+    for ii in range(len(origin)):  # origin follows the flipped structure
+        origin[ii] = -origin[ii]
+        if not structure.shape[ii] & 1:  # even length along this axis
+            origin[ii] -= 1
+    return _binary_erosion(input, structure, iterations, mask, output,
+                           border_value, origin, 1, brute_force)  # invert=1
+
+
+def binary_opening(input, structure=None, iterations=1, output=None, origin=0,
+                   mask=None, border_value=0, brute_force=False):
+    """
+    Multidimensional binary opening with the given structuring element.
+
+    The *opening* of an input image by a structuring element is the
+    *dilation* of the *erosion* of the image by the structuring element.
+
+    Args:
+        input(cupy.ndarray): The input binary array to be opened.
+            Non-zero (True) elements form the subset to be opened.
+        structure(cupy.ndarray, optional): The structuring element used for the
+            opening. Non-zero elements are considered True. If no structuring
+            element is provided an element is generated with a square
+            connectivity equal to one. (Default value = None).
+        iterations(int, optional): The opening is repeated ``iterations`` times
+            (one, by default). If iterations is less than 1, the opening is
+            repeated until the result does not change anymore. Only an integer
+            of iterations is accepted.
+        output(cupy.ndarray, optional): Array of the same shape as input, into
+            which the output is placed. By default, a new array is created.
+        origin(int or tuple of ints, optional): Placement of the filter, by
+            default 0.
+        mask(cupy.ndarray or None, optional): If a mask is given, only those
+            elements with a True value at the corresponding mask element are
+            modified at each iteration. (Default value = None)
+        border_value(int (cast to 0 or 1), optional): Value at the
+            border in the output array. (Default value = 0)
+        brute_force(boolean, optional): Memory condition: if True, all pixels
+            are candidates for opening in every iteration. Tracking only the
+            pixels changed by the previous iteration (False) is not
+            implemented: for ``iterations != 1`` it raises
+            ``NotImplementedError`` when the structure center is True.
+
+    Returns:
+        cupy.ndarray: The result of binary opening.
+
+    .. warning::
+
+        This function may synchronize the device.
+
+    .. seealso:: :func:`scipy.ndimage.binary_opening`
+    """
+    if structure is None:
+        rank = input.ndim
+        structure = generate_binary_structure(rank, 1)
+    tmp = binary_erosion(input, structure, iterations, mask, None,
+                         border_value, origin, brute_force)  # erode first
+    return binary_dilation(tmp, structure, iterations, mask, output,
+                           border_value, origin, brute_force)  # then dilate
+
+
+def binary_closing(input, structure=None, iterations=1, output=None, origin=0,
+                   mask=None, border_value=0, brute_force=False):
+    """
+    Multidimensional binary closing with the given structuring element.
+
+    The *closing* of an input image by a structuring element is the
+    *erosion* of the *dilation* of the image by the structuring element.
+
+    Args:
+        input(cupy.ndarray): The input binary array to be closed.
+            Non-zero (True) elements form the subset to be closed.
+        structure(cupy.ndarray, optional): The structuring element used for the
+            closing. Non-zero elements are considered True. If no structuring
+            element is provided an element is generated with a square
+            connectivity equal to one. (Default value = None).
+        iterations(int, optional): The closing is repeated ``iterations`` times
+            (one, by default). If iterations is less than 1, the closing is
+            repeated until the result does not change anymore. Only an integer
+            of iterations is accepted.
+        output(cupy.ndarray, optional): Array of the same shape as input, into
+            which the output is placed. By default, a new array is created.
+        origin(int or tuple of ints, optional): Placement of the filter, by
+            default 0.
+        mask(cupy.ndarray or None, optional): If a mask is given, only those
+            elements with a True value at the corresponding mask element are
+            modified at each iteration. (Default value = None)
+        border_value(int (cast to 0 or 1), optional): Value at the
+            border in the output array. (Default value = 0)
+        brute_force(boolean, optional): Memory condition: if True, all pixels
+            are candidates for closing in every iteration. Tracking only the
+            pixels changed by the previous iteration (False) is not
+            implemented: for ``iterations != 1`` it raises
+            ``NotImplementedError`` when the structure center is True.
+
+    Returns:
+        cupy.ndarray: The result of binary closing.
+
+    .. warning::
+
+        This function may synchronize the device.
+
+    .. seealso:: :func:`scipy.ndimage.binary_closing`
+    """
+    if structure is None:
+        rank = input.ndim
+        structure = generate_binary_structure(rank, 1)
+    tmp = binary_dilation(input, structure, iterations, mask, None,
+                          border_value, origin, brute_force)  # dilate first
+    return binary_erosion(tmp, structure, iterations, mask, output,
+                          border_value, origin, brute_force)  # then erode
+
+
+def binary_hit_or_miss(input, structure1=None, structure2=None, output=None,
+                       origin1=0, origin2=None):
+    """
+    Multidimensional binary hit-or-miss transform.
+
+    The hit-or-miss transform finds the locations of a given pattern
+    inside the input image.
+
+    Args:
+        input (cupy.ndarray): Binary image where a pattern is to be detected.
+        structure1 (cupy.ndarray, optional): Part of the structuring element to
+            be fitted to the foreground (non-zero elements) of ``input``. If no
+            value is provided, a structure of square connectivity 1 is chosen.
+        structure2 (cupy.ndarray, optional): Second part of the structuring
+            element that has to miss completely the foreground. If no value is
+            provided, the complementary of ``structure1`` is taken.
+        output (cupy.ndarray, dtype or None, optional): Array of the same shape
+            as input, into which the output is placed. By default, a new array
+            is created.
+        origin1 (int or tuple of ints, optional): Placement of the first part
+            of the structuring element ``structure1``, by default 0 for a
+            centered structure.
+        origin2 (int or tuple of ints or None, optional): Placement of the
+            second part of the structuring element ``structure2``, by default 0
+            for a centered structure. If a value is provided for ``origin1``
+            and not for ``origin2``, then ``origin2`` is set to ``origin1``.
+
+    Returns:
+        cupy.ndarray or None: Hit-or-miss transform of ``input`` for
+        (``structure1``, ``structure2``); ``None`` if ``output`` is an array.
+
+    .. warning::
+
+        This function may synchronize the device.
+
+    .. seealso:: :func:`scipy.ndimage.binary_hit_or_miss`
+    """
+    if structure1 is None:
+        structure1 = generate_binary_structure(input.ndim, 1)
+    if structure2 is None:
+        structure2 = cupy.logical_not(structure1)
+    origin1 = _util._fix_sequence_arg(origin1, input.ndim, 'origin1', int)
+    if origin2 is None:
+        origin2 = origin1
+    else:
+        origin2 = _util._fix_sequence_arg(origin2, input.ndim, 'origin2', int)
+
+    tmp1 = _binary_erosion(input, structure1, 1, None, None, 0, origin1, 0,
+                           False)  # plain erosion by structure1
+    inplace = isinstance(output, cupy.ndarray)
+    result = _binary_erosion(input, structure2, 1, None, output, 0, origin2, 1,
+                             False)  # invert=1 pass with structure2
+    if inplace:  # result stays in ``output``; nothing is returned
+        cupy.logical_not(output, output)
+        cupy.logical_and(tmp1, output, output)
+    else:
+        cupy.logical_not(result, result)
+        return cupy.logical_and(tmp1, result)
+
+
+def binary_propagation(input, structure=None, mask=None, output=None,
+                       border_value=0, origin=0):
+    """
+    Multidimensional binary propagation with the given structuring element.
+
+    Args:
+        input (cupy.ndarray): Binary image to be propagated inside ``mask``.
+        structure (cupy.ndarray, optional): Structuring element used in the
+            successive dilations. The output may depend on the structuring
+            element, especially if ``mask`` has several connex components. If
+            no structuring element is provided, an element is generated with a
+            squared connectivity equal to one.
+        mask (cupy.ndarray, optional): Binary mask defining the region into
+            which ``input`` is allowed to propagate.
+        output (cupy.ndarray, optional): Array of the same shape as input, into
+            which the output is placed. By default, a new array is created.
+        border_value (int, optional): Value at the border in the output array.
+            The value is cast to 0 or 1.
+        origin (int or tuple of ints, optional): Placement of the filter.
+
+    Returns:
+        cupy.ndarray : Binary propagation of ``input`` inside ``mask``.
+
+    .. warning::
+
+        This function may synchronize the device.
+
+    .. seealso:: :func:`scipy.ndimage.binary_propagation`
+    """
+    return binary_dilation(input, structure, -1, mask, output, border_value,
+                           origin, brute_force=True)  # -1 = until no change
+
+
+def binary_fill_holes(input, structure=None, output=None, origin=0):
+    """Fill the holes in binary objects.
+
+    Args:
+        input (cupy.ndarray): N-D binary array with holes to be filled.
+        structure (cupy.ndarray, optional):  Structuring element used in the
+            computation; large-size elements make computations faster but may
+            miss holes separated from the background by thin regions. The
+            default element (with a square connectivity equal to one) yields
+            the intuitive result where all holes in the input have been filled.
+        output (cupy.ndarray, dtype or None, optional): Array of the same shape
+            as input, into which the output is placed. By default, a new array
+            is created.
+        origin (int, tuple of ints, optional): Position of the structuring
+            element.
+
+    Returns:
+        cupy.ndarray: Transformation of the initial image ``input`` where holes
+        have been filled.
+
+    .. warning::
+
+        This function may synchronize the device.
+
+    .. seealso:: :func:`scipy.ndimage.binary_fill_holes`
+    """
+    mask = cupy.logical_not(input)
+    tmp = cupy.zeros(mask.shape, bool)
+    inplace = isinstance(output, cupy.ndarray)
+    # TODO (grlee77): set brute_force=False below once implemented
+    if inplace:
+        binary_dilation(tmp, structure, -1, mask, output, 1, origin,
+                        brute_force=True)
+        cupy.logical_not(output, output)
+    else:
+        output = binary_dilation(tmp, structure, -1, mask, None, 1, origin,
+                                 brute_force=True)
+        cupy.logical_not(output, output)
+        return output
+
+
+def grey_erosion(input, size=None, footprint=None, structure=None, output=None,
+                 mode='reflect', cval=0.0, origin=0):
+    """Calculates a greyscale erosion.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        size (tuple of ints): Shape of a flat and full structuring element used
+            for the greyscale erosion. Optional if ``footprint`` or
+            ``structure`` is provided.
+        footprint (array of ints): Positions of non-infinite elements of a flat
+            structuring element used for greyscale erosion. Non-zero values
+            give the set of neighbors of the center over which minimum is
+            chosen.
+        structure (array of ints): Structuring element used for the greyscale
+            erosion. ``structure`` may be a non-flat structuring element.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``constant``. Default is ``0.0``.
+        origin (scalar or tuple of scalar): The origin parameter controls the
+            placement of the filter, relative to the center of the current
+            element of the input. Default of 0 is equivalent to
+            ``(0,)*input.ndim``.
+
+    Returns:
+        cupy.ndarray: The result of greyscale erosion.
+
+    .. seealso:: :func:`scipy.ndimage.grey_erosion`
+    """
+
+    if size is None and footprint is None and structure is None:
+        raise ValueError('size, footprint or structure must be specified')
+
+    return _filters._min_or_max_filter(input, size, footprint, structure,
+                                       output, mode, cval, origin, 'min')
+
+
+def grey_dilation(input, size=None, footprint=None, structure=None,
+                  output=None, mode='reflect', cval=0.0, origin=0):
+    """Calculates a greyscale dilation.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        size (tuple of ints): Shape of a flat and full structuring element used
+            for the greyscale dilation. Optional if ``footprint`` or
+            ``structure`` is provided.
+        footprint (array of ints): Positions of non-infinite elements of a flat
+            structuring element used for greyscale dilation. Non-zero values
+            give the set of neighbors of the center over which maximum is
+            chosen.
+        structure (array of ints): Structuring element used for the greyscale
+            dilation. ``structure`` may be a non-flat structuring element.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``constant``. Default is ``0.0``.
+        origin (scalar or tuple of scalar): The origin parameter controls the
+            placement of the filter, relative to the center of the current
+            element of the input. Default of 0 is equivalent to
+            ``(0,)*input.ndim``.
+
+    Returns:
+        cupy.ndarray: The result of greyscale dilation.
+
+    .. seealso:: :func:`scipy.ndimage.grey_dilation`
+    """
+
+    if size is None and footprint is None and structure is None:
+        raise ValueError('size, footprint or structure must be specified')
+    if structure is not None:
+        structure = cupy.array(structure)
+        structure = structure[tuple([slice(None, None, -1)] * structure.ndim)]
+    if footprint is not None:
+        footprint = cupy.array(footprint)
+        footprint = footprint[tuple([slice(None, None, -1)] * footprint.ndim)]
+
+    origin = _util._fix_sequence_arg(origin, input.ndim, 'origin', int)
+    for i in range(len(origin)):
+        origin[i] = -origin[i]
+        if footprint is not None:
+            sz = footprint.shape[i]
+        elif structure is not None:
+            sz = structure.shape[i]
+        elif numpy.isscalar(size):
+            sz = size
+        else:
+            sz = size[i]
+        if sz % 2 == 0:
+            origin[i] -= 1
+
+    return _filters._min_or_max_filter(input, size, footprint, structure,
+                                       output, mode, cval, origin, 'max')
+
+
+def grey_closing(input, size=None, footprint=None, structure=None,
+                 output=None, mode='reflect', cval=0.0, origin=0):
+    """Calculates a multi-dimensional greyscale closing.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        size (tuple of ints): Shape of a flat and full structuring element used
+            for the greyscale closing. Optional if ``footprint`` or
+            ``structure`` is provided.
+        footprint (array of ints): Positions of non-infinite elements of a flat
+            structuring element used for greyscale closing. Non-zero values
+            give the set of neighbors of the center over which closing is
+            chosen.
+        structure (array of ints): Structuring element used for the greyscale
+            closing. ``structure`` may be a non-flat structuring element.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``constant``. Default is ``0.0``.
+        origin (scalar or tuple of scalar): The origin parameter controls the
+            placement of the filter, relative to the center of the current
+            element of the input. Default of 0 is equivalent to
+            ``(0,)*input.ndim``.
+
+    Returns:
+        cupy.ndarray: The result of greyscale closing.
+
+    .. seealso:: :func:`scipy.ndimage.grey_closing`
+    """
+    if (size is not None) and (footprint is not None):
+        warnings.warn('ignoring size because footprint is set', UserWarning,
+                      stacklevel=2)
+    tmp = grey_dilation(input, size, footprint, structure, None, mode, cval,
+                        origin)
+    return grey_erosion(tmp, size, footprint, structure, output, mode, cval,
+                        origin)
+
+
+def grey_opening(input, size=None, footprint=None, structure=None,
+                 output=None, mode='reflect', cval=0.0, origin=0):
+    """Calculates a multi-dimensional greyscale opening.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        size (tuple of ints): Shape of a flat and full structuring element used
+            for the greyscale opening. Optional if ``footprint`` or
+            ``structure`` is provided.
+        footprint (array of ints): Positions of non-infinite elements of a flat
+            structuring element used for greyscale opening. Non-zero values
+            give the set of neighbors of the center over which opening is
+            chosen.
+        structure (array of ints): Structuring element used for the greyscale
+            opening. ``structure`` may be a non-flat structuring element.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``constant``. Default is ``0.0``.
+        origin (scalar or tuple of scalar): The origin parameter controls the
+            placement of the filter, relative to the center of the current
+            element of the input. Default of 0 is equivalent to
+            ``(0,)*input.ndim``.
+
+    Returns:
+        cupy.ndarray: The result of greyscale opening.
+
+    .. seealso:: :func:`scipy.ndimage.grey_opening`
+    """
+    if (size is not None) and (footprint is not None):
+        warnings.warn('ignoring size because footprint is set', UserWarning,
+                      stacklevel=2)
+    tmp = grey_erosion(input, size, footprint, structure, None, mode, cval,
+                       origin)
+    return grey_dilation(tmp, size, footprint, structure, output, mode, cval,
+                         origin)
+
+
+def morphological_gradient(
+    input,
+    size=None,
+    footprint=None,
+    structure=None,
+    output=None,
+    mode='reflect',
+    cval=0.0,
+    origin=0,
+):
+    """
+    Multidimensional morphological gradient.
+
+    The morphological gradient is calculated as the difference between a
+    dilation and an erosion of the input with a given structuring element.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        size (tuple of ints): Shape of a flat and full structuring element used
+            for the morphological gradient. Optional if ``footprint`` or
+            ``structure`` is provided.
+        footprint (array of ints): Positions of non-infinite elements of a flat
+            structuring element used for morphological gradient. Non-zero
+            values give the set of neighbors of the center used by the
+            dilation and erosion.
+        structure (array of ints): Structuring element used for the
+            morphological gradient. ``structure`` may be a non-flat
+            structuring element.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``constant``. Default is ``0.0``.
+        origin (scalar or tuple of scalar): The origin parameter controls the
+            placement of the filter, relative to the center of the current
+            element of the input. Default of 0 is equivalent to
+            ``(0,)*input.ndim``.
+
+    Returns:
+        cupy.ndarray: The morphological gradient of the input.
+
+    .. seealso:: :func:`scipy.ndimage.morphological_gradient`
+    """
+    tmp = grey_dilation(
+        input, size, footprint, structure, None, mode, cval, origin
+    )
+    if isinstance(output, cupy.ndarray):
+        grey_erosion(
+            input, size, footprint, structure, output, mode, cval, origin
+        )
+        return cupy.subtract(tmp, output, output)
+    else:
+        return tmp - grey_erosion(
+            input, size, footprint, structure, None, mode, cval, origin
+        )
+
+
+def morphological_laplace(
+    input,
+    size=None,
+    footprint=None,
+    structure=None,
+    output=None,
+    mode='reflect',
+    cval=0.0,
+    origin=0,
+):
+    """
+    Multidimensional morphological laplace.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        size (tuple of ints): Shape of a flat and full structuring element used
+            for the morphological laplace. Optional if ``footprint`` or
+            ``structure`` is provided.
+        footprint (array of ints): Positions of non-infinite elements of a flat
+            structuring element used for morphological laplace. Non-zero
+            values give the set of neighbors of the center used by the
+            dilation and erosion.
+        structure (array of ints): Structuring element used for the
+            morphological laplace. ``structure`` may be a non-flat
+            structuring element.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``constant``. Default is ``0.0``.
+        origin (scalar or tuple of scalar): The origin parameter controls the
+            placement of the filter, relative to the center of the current
+            element of the input. Default of 0 is equivalent to
+            ``(0,)*input.ndim``.
+
+    Returns:
+        cupy.ndarray: The morphological laplace of the input.
+
+    .. seealso:: :func:`scipy.ndimage.morphological_laplace`
+    """
+    tmp1 = grey_dilation(
+        input, size, footprint, structure, None, mode, cval, origin
+    )
+    if isinstance(output, cupy.ndarray):
+        grey_erosion(
+            input, size, footprint, structure, output, mode, cval, origin
+        )
+        cupy.add(tmp1, output, output)
+        cupy.subtract(output, input, output)
+        return cupy.subtract(output, input, output)
+    else:
+        tmp2 = grey_erosion(
+            input, size, footprint, structure, None, mode, cval, origin
+        )
+        cupy.add(tmp1, tmp2, tmp2)
+        cupy.subtract(tmp2, input, tmp2)
+        cupy.subtract(tmp2, input, tmp2)
+        return tmp2
+
+
+def white_tophat(
+    input,
+    size=None,
+    footprint=None,
+    structure=None,
+    output=None,
+    mode='reflect',
+    cval=0.0,
+    origin=0,
+):
+    """
+    Multidimensional white tophat filter.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        size (tuple of ints): Shape of a flat and full structuring element used
+            for the white tophat. Optional if ``footprint`` or ``structure`` is
+            provided.
+        footprint (array of ints): Positions of non-infinite elements of a flat
+            structuring element used for the white tophat. Non-zero values
+            give the set of neighbors of the center over which opening is
+            chosen.
+        structure (array of ints): Structuring element used for the white
+            tophat. ``structure`` may be a non-flat structuring element.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``constant``. Default is ``0.0``.
+        origin (scalar or tuple of scalar): The origin parameter controls the
+            placement of the filter, relative to the center of the current
+            element of the input. Default of 0 is equivalent to
+            ``(0,)*input.ndim``.
+
+    Returns:
+        cupy.ndarray: Result of the filter of ``input`` with ``structure``.
+
+    .. seealso:: :func:`scipy.ndimage.white_tophat`
+    """
+    if (size is not None) and (footprint is not None):
+        warnings.warn(
+            'ignoring size because footprint is set', UserWarning, stacklevel=2
+        )
+    tmp = grey_erosion(
+        input, size, footprint, structure, None, mode, cval, origin
+    )
+    tmp = grey_dilation(
+        tmp, size, footprint, structure, output, mode, cval, origin
+    )
+    if input.dtype == numpy.bool_ and tmp.dtype == numpy.bool_:
+        cupy.bitwise_xor(input, tmp, out=tmp)
+    else:
+        cupy.subtract(input, tmp, out=tmp)
+    return tmp
+
+
+def black_tophat(
+    input,
+    size=None,
+    footprint=None,
+    structure=None,
+    output=None,
+    mode='reflect',
+    cval=0.0,
+    origin=0,
+):
+    """
+    Multidimensional black tophat filter.
+
+    Args:
+        input (cupy.ndarray): The input array.
+        size (tuple of ints): Shape of a flat and full structuring element used
+            for the black tophat. Optional if ``footprint`` or ``structure`` is
+            provided.
+        footprint (array of ints): Positions of non-infinite elements of a flat
+            structuring element used for the black tophat. Non-zero values
+            give the set of neighbors of the center over which closing is
+            chosen.
+        structure (array of ints): Structuring element used for the black
+            tophat. ``structure`` may be a non-flat structuring element.
+        output (cupy.ndarray, dtype or None): The array in which to place the
+            output.
+        mode (str): The array borders are handled according to the given mode
+            (``'reflect'``, ``'constant'``, ``'nearest'``, ``'mirror'``,
+            ``'wrap'``). Default is ``'reflect'``.
+        cval (scalar): Value to fill past edges of input if mode is
+            ``constant``. Default is ``0.0``.
+        origin (scalar or tuple of scalar): The origin parameter controls the
+            placement of the filter, relative to the center of the current
+            element of the input. Default of 0 is equivalent to
+            ``(0,)*input.ndim``.
+
+    Returns:
+        cupy.ndarray: Result of the filter of ``input`` with ``structure``.
+
+    .. seealso:: :func:`scipy.ndimage.black_tophat`
+    """
+    if (size is not None) and (footprint is not None):
+        warnings.warn(
+            'ignoring size because footprint is set', UserWarning, stacklevel=2
+        )
+    tmp = grey_dilation(
+        input, size, footprint, structure, None, mode, cval, origin
+    )
+    tmp = grey_erosion(
+        tmp, size, footprint, structure, output, mode, cval, origin
+    )
+    if input.dtype == numpy.bool_ and tmp.dtype == numpy.bool_:
+        cupy.bitwise_xor(tmp, input, out=tmp)
+    else:
+        cupy.subtract(tmp, input, out=tmp)
+    return tmp
diff --git a/python/cucim/src/cucim/skimage/_vendored/_ndimage_spline_kernel_weights.py b/python/cucim/src/cucim/skimage/_vendored/_ndimage_spline_kernel_weights.py
new file mode 100644
index 000000000..b2fc84449
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/_vendored/_ndimage_spline_kernel_weights.py
@@ -0,0 +1,73 @@
+"""Determination of spline kernel weights (adapted from SciPy)
+
+See more verbose comments for each case there:
+https://github.com/scipy/scipy/blob/eba29d69846ab1299976ff4af71c106188397ccc/scipy/ndimage/src/ni_splines.c#L7   # NOQA
+
+``spline_weights_inline`` is a dict where the key is the spline order and the
+value is the spline weight initialization code.
+"""
+
+spline_weights_inline = {}
+
+# Note: This order = 1 case is currently unused (order = 1 has a different code
+# path in _interp_kernels.py). I think that existing code is a bit more
+# efficient.
+spline_weights_inline[1] = '''
+wx = c_{j} - floor({order} & 1 ? c_{j} : c_{j} + 0.5);
+weights_{j}[0] = 1.0 - wx;
+weights_{j}[1] = wx;
+'''
+
+spline_weights_inline[2] = '''
+wx = c_{j} - floor({order} & 1 ? c_{j} : c_{j} + 0.5);
+weights_{j}[1] = 0.75 - wx * wx;
+wy = 0.5 - wx;
+weights_{j}[0] = 0.5 * wy * wy;
+weights_{j}[2] = 1.0 - weights_{j}[0] - weights_{j}[1];
+'''
+
+spline_weights_inline[3] = '''
+wx = c_{j} - floor({order} & 1 ? c_{j} : c_{j} + 0.5);
+wy = 1.0 - wx;
+weights_{j}[1] = (wx * wx * (wx - 2.0) * 3.0 + 4.0) / 6.0;
+weights_{j}[2] = (wy * wy * (wy - 2.0) * 3.0 + 4.0) / 6.0;
+weights_{j}[0] = wy * wy * wy / 6.0;
+weights_{j}[3] = 1.0 - weights_{j}[0] - weights_{j}[1] - weights_{j}[2];
+'''
+
+spline_weights_inline[4] = '''
+wx = c_{j} - floor({order} & 1 ? c_{j} : c_{j} + 0.5);
+wy = wx * wx;
+weights_{j}[2] = wy * (wy * 0.25 - 0.625) + 115.0 / 192.0;
+wy = 1.0 + wx;
+weights_{j}[1] = wy * (wy * (wy * (5.0 - wy) / 6.0 - 1.25) + 5.0 / 24.0) +
+             55.0 / 96.0;
+wy = 1.0 - wx;
+weights_{j}[3] = wy * (wy * (wy * (5.0 - wy) / 6.0 - 1.25) + 5.0 / 24.0) +
+             55.0 / 96.0;
+wy = 0.5 - wx;
+wy = wy * wy;
+weights_{j}[0] = wy * wy / 24.0;
+weights_{j}[4] = 1.0 - weights_{j}[0] - weights_{j}[1]
+                     - weights_{j}[2] - weights_{j}[3];
+'''
+
+spline_weights_inline[5] = '''
+wx = c_{j} - floor({order} & 1 ? c_{j} : c_{j} + 0.5);
+wy = wx * wx;
+weights_{j}[2] = wy * (wy * (0.25 - wx / 12.0) - 0.5) + 0.55;
+wy = 1.0 - wx;
+wy = wy * wy;
+weights_{j}[3] = wy * (wy * (0.25 - (1.0 - wx) / 12.0) - 0.5) + 0.55;
+wy = wx + 1.0;
+weights_{j}[1] = wy * (wy * (wy * (wy * (wy / 24.0 - 0.375) + 1.25) - 1.75)
+                   + 0.625) + 0.425;
+wy = 2.0 - wx;
+weights_{j}[4] = wy * (wy * (wy * (wy * (wy / 24.0 - 0.375) + 1.25) - 1.75)
+                  + 0.625) + 0.425;
+wy = 1.0 - wx;
+wy = wy * wy;
+weights_{j}[0] = (1.0 - wx) * wy * wy / 120.0;
+weights_{j}[5] = 1.0 - weights_{j}[0] - weights_{j}[1] - weights_{j}[2]
+                     - weights_{j}[3] - weights_{j}[4];
+'''
diff --git a/python/cucim/src/cucim/skimage/_vendored/_ndimage_spline_prefilter_core.py b/python/cucim/src/cucim/skimage/_vendored/_ndimage_spline_prefilter_core.py
new file mode 100644
index 000000000..c44df836e
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/_vendored/_ndimage_spline_prefilter_core.py
@@ -0,0 +1,256 @@
+"""
+Spline poles and boundary handling implemented as in SciPy
+
+https://github.com/scipy/scipy/blob/ee6ae72f83a0995aeb34929aed881d3f36fccfda/scipy/ndimage/src/ni_splines.c  # noqa
+"""
+import functools
+import math
+import operator
+import textwrap
+
+import cupy
+
+
+def get_poles(order):
+    if order == 2:
+        # sqrt(8.0) - 3.0
+        return (-0.171572875253809902396622551580603843,)
+    elif order == 3:
+        # sqrt(3.0) - 2.0
+        return (-0.267949192431122706472553658494127633,)
+    elif order == 4:
+        # sqrt(664.0 - sqrt(438976.0)) + sqrt(304.0) - 19.0
+        # sqrt(664.0 + sqrt(438976.0)) - sqrt(304.0) - 19.0
+        return (-0.361341225900220177092212841325675255,
+                -0.013725429297339121360331226939128204)
+    elif order == 5:
+        # sqrt(67.5 - sqrt(4436.25)) + sqrt(26.25) - 6.5
+        # sqrt(67.5 + sqrt(4436.25)) - sqrt(26.25) - 6.5
+        return (-0.430575347099973791851434783493520110,
+                -0.043096288203264653822712376822550182)
+    else:
+        raise ValueError('only order 2-5 supported')
+
+
+def get_gain(poles):
+    return functools.reduce(operator.mul,
+                            [(1.0 - z) * (1.0 - 1.0 / z) for z in poles])
+
+
+def _causal_init_code(mode):
+    """Code for causal initialization step of IIR filtering.
+
+    c is a 1d array of length n and z is a filter pole
+    """
+    code = f'''
+        // causal init for mode={mode}'''
+    if mode == 'mirror':
+        code += '''
+        z_i = z;
+        z_n_1 = pow(z, (P)(n - 1));
+
+        c[0] = c[0] + z_n_1 * c[(n - 1) * element_stride];
+        for (i = 1; i < min(n - 1, static_cast<idx_t>({n_boundary})); ++i) {{
+            c[0] += z_i * (c[i * element_stride] +
+                           z_n_1 * c[(n - 1 - i) * element_stride]);
+            z_i *= z;
+        }}
+        c[0] /= 1 - z_n_1 * z_n_1;'''
+    elif mode == 'grid-wrap':
+        code += '''
+        z_i = z;
+
+        for (i = 1; i < min(n, static_cast<idx_t>({n_boundary})); ++i) {{
+            c[0] += z_i * c[(n - i) * element_stride];
+            z_i *= z;
+        }}
+        c[0] /= 1 - z_i; /* z_i = pow(z, n) */'''
+    elif mode == 'reflect':
+        code += '''
+        z_i = z;
+        z_n = pow(z, (P)n);
+        c0 = c[0];
+
+        c[0] = c[0] + z_n * c[(n - 1) * element_stride];
+        for (i = 1; i < min(n, static_cast<idx_t>({n_boundary})); ++i) {{
+            c[0] += z_i * (c[i * element_stride] +
+                           z_n * c[(n - 1 - i) * element_stride]);
+            z_i *= z;
+        }}
+        c[0] *= z / (1 - z_n * z_n);
+        c[0] += c0;'''
+    else:
+        raise ValueError('invalid mode: {}'.format(mode))
+    return code
+
+
+def _anticausal_init_code(mode):
+    """Code for the anti-causal initialization step of IIR filtering.
+
+    c is a 1d array of length n and z is a filter pole
+    """
+    code = f'''
+        // anti-causal init for mode={mode}'''
+    if mode == 'mirror':
+        code += '''
+        c[(n - 1) * element_stride] = (
+            z * c[(n - 2) * element_stride] +
+            c[(n - 1) * element_stride]) * z / (z * z - 1);'''
+    elif mode == 'grid-wrap':
+        code += '''
+        z_i = z;
+
+        for (i = 0; i < min(n - 1, static_cast<idx_t>({n_boundary})); ++i) {{
+            c[(n - 1) * element_stride] += z_i * c[i * element_stride];
+            z_i *= z;
+        }}
+        c[(n - 1) * element_stride] *= z / (z_i - 1); /* z_i = pow(z, n) */'''
+    elif mode == 'reflect':
+        code += '''
+        c[(n - 1) * element_stride] *= z / (z - 1);'''
+    else:
+        raise ValueError('invalid mode: {}'.format(mode))
+    return code
+
+
+def _get_spline_mode(mode):
+    """spline boundary mode for interpolation with order >= 2."""
+    if mode in ['mirror', 'reflect', 'grid-wrap']:
+        # exact analytic boundary conditions exist for these modes.
+        return mode
+    elif mode == 'grid-mirror':
+        # grid-mirror is a synonym for 'reflect'
+        return 'reflect'
+    # No exact analytical spline boundary condition implemented. Reflect gives
+    # lower error than using mirror or wrap for mode 'nearest'. Otherwise, a
+    # mirror spline boundary condition is used.
+    return 'reflect' if mode == 'nearest' else 'mirror'
+
+
+def _get_spline1d_code(mode, poles, n_boundary):
+    """Generates the code required for IIR filtering of a single 1d signal.
+
+    Prefiltering is done by causal filtering followed by anti-causal filtering.
+    Multiple boundary conditions have been implemented.
+    """
+    code = ['''
+    __device__ void spline_prefilter1d(
+        T* __restrict__ c, idx_t signal_length, idx_t element_stride)
+    {{''']
+
+    # variables common to all boundary modes
+    code.append('''
+        idx_t i, n = signal_length;
+        P z, z_i;''')
+
+    # retrieve the spline boundary extension mode to use
+    mode = _get_spline_mode(mode)
+
+    if mode == 'mirror':
+        # variables specific to mirror boundary mode
+        code.append('''
+        P z_n_1;''')
+    elif mode == 'reflect':
+        # variables specific to reflect boundary mode
+        code.append('''
+        P z_n;
+        T c0;''')
+
+    for pole in poles:
+
+        code.append(f'''
+        // select the current pole
+        z = {pole};''')
+
+        # initialize and apply the causal filter
+        code.append(_causal_init_code(mode))
+        code.append('''
+        // apply the causal filter for the current pole
+        for (i = 1; i < n; ++i) {{
+            c[i * element_stride] += z * c[(i - 1) * element_stride];
+        }}''')
+        # initialize and apply the anti-causal filter
+        code.append(_anticausal_init_code(mode))
+        code.append('''
+        // apply the anti-causal filter for the current pole
+        for (i = n - 2; i >= 0; --i) {{
+            c[i * element_stride] = z * (c[(i + 1) * element_stride] -
+                                         c[i * element_stride]);
+        }}''')
+
+    code += ['''
+    }}''']
+    return textwrap.dedent('\n'.join(code)).format(n_boundary=n_boundary)
+
+
+_FILTER_GENERAL = '''
+#include "cupy/carray.cuh"
+#include "cupy/complex.cuh"
+typedef {data_type} T;
+typedef {pole_type} P;
+typedef {index_type} idx_t;
+template <typename T>
+__device__ T* row(
+        T* ptr, idx_t i, idx_t axis, idx_t ndim, const idx_t* shape) {{
+    idx_t index = 0, stride = 1;
+    for (idx_t a = ndim - 1; a > 0; --a) {{
+        if (a != axis) {{
+            index += (i % shape[a]) * stride;
+            i /= shape[a];
+        }}
+        stride *= shape[a];
+    }}
+    return ptr + index + stride * i;
+}}
+'''
+
+
+_batch_spline1d_strided_template = """
+extern "C" __global__
+__launch_bounds__({block_size})
+void {kernel_name}(T* __restrict__ y, const idx_t* __restrict__ info) {{
+    const idx_t n_signals = info[0], n_samples = info[1],
+        * __restrict__ shape = info+2;
+    idx_t y_elem_stride = 1;
+    for (int a = {ndim} - 1; a > {axis}; --a) {{ y_elem_stride *= shape[a]; }}
+    idx_t unraveled_idx = blockDim.x * blockIdx.x + threadIdx.x;
+    idx_t batch_idx = unraveled_idx;
+    if (batch_idx < n_signals)
+    {{
+        T* __restrict__ y_i = row(y, batch_idx, {axis}, {ndim}, shape);
+        spline_prefilter1d(y_i, n_samples, y_elem_stride);
+    }}
+}}
+"""
+
+
+@cupy.memoize(for_each_device=True)
+def get_raw_spline1d_kernel(axis, ndim, mode, order, index_type='int',
+                            data_type='double', pole_type='double',
+                            block_size=128):
+    """Generate a kernel for applying a spline prefilter along a given axis."""
+    poles = get_poles(order)
+
+    # determine number of samples for the boundary approximation
+    # (SciPy uses n_boundary = n_samples but this is excessive)
+    largest_pole = max([abs(p) for p in poles])
+    # tol > 1e-7 fails test cases comparing to SciPy at atol = rtol = 1e-5
+    tol = 1e-10 if pole_type == 'float' else 1e-18
+    n_boundary = math.ceil(math.log(tol, largest_pole))
+
+    # headers and general utility function for extracting rows of data
+    code = _FILTER_GENERAL.format(index_type=index_type,
+                                  data_type=data_type,
+                                  pole_type=pole_type)
+
+    # generate source for a 1d function for a given boundary mode and poles
+    code += _get_spline1d_code(mode, poles, n_boundary)
+
+    # generate code handling batch operation of the 1d filter
+    mode_str = mode.replace('-', '_')  # cannot have '-' in kernel name
+    kernel_name = (f'cupyx_scipy_ndimage_spline_filter_{ndim}d_ord{order}_'
+                   f'axis{axis}_{mode_str}')
+    code += _batch_spline1d_strided_template.format(ndim=ndim, axis=axis,
+                                                    block_size=block_size,
+                                                    kernel_name=kernel_name)
+    return cupy.RawKernel(code, kernel_name)
diff --git a/python/cucim/src/cucim/skimage/_vendored/_ndimage_util.py b/python/cucim/src/cucim/skimage/_vendored/_ndimage_util.py
index 1d87b3256..1a6a830a5 100644
--- a/python/cucim/src/cucim/skimage/_vendored/_ndimage_util.py
+++ b/python/cucim/src/cucim/skimage/_vendored/_ndimage_util.py
@@ -5,6 +5,33 @@
 import numpy
 
 
+def _is_integer_output(output, input):
+    if output is None:
+        return input.dtype.kind in 'iu'
+    elif isinstance(output, cupy.ndarray):
+        return output.dtype.kind in 'iu'
+    return cupy.dtype(output).kind in 'iu'
+
+
+def _check_cval(mode, cval, integer_output):
+    if mode == 'constant' and integer_output and not cupy.isfinite(cval):
+        raise NotImplementedError("Non-finite cval is not supported for "
+                                  "outputs with integer dtype.")
+
+
+def _get_weights_dtype(input, weights, use_cucim_casting=False):
+    if weights.dtype.kind == "c" or input.dtype.kind == "c":
+        return cupy.promote_types(input.real.dtype, cupy.complex64)
+    elif weights.dtype.kind in 'iub':
+        if use_cucim_casting:
+            from cucim.skimage._shared.utils import _supported_float_type
+            return _supported_float_type(weights.dtype)
+        else:
+            # convert integer dtype weights to double as in SciPy
+            return cupy.float64
+    return cupy.promote_types(input.real.dtype, cupy.float32)
+
+
 def _get_output(output, input, shape=None, complex_output=False):
     shape = input.shape if shape is None else shape
     if output is None:
@@ -12,17 +39,17 @@ def _get_output(output, input, shape=None, complex_output=False):
             _dtype = cupy.promote_types(input.dtype, cupy.complex64)
         else:
             _dtype = input.dtype
-        output = cupy.zeros(shape, dtype=_dtype)
+        output = cupy.empty(shape, dtype=_dtype)
     elif isinstance(output, (type, cupy.dtype)):
         if complex_output and cupy.dtype(output).kind != 'c':
             warnings.warn("promoting specified output dtype to complex")
             output = cupy.promote_types(output, cupy.complex64)
-        output = cupy.zeros(shape, dtype=output)
+        output = cupy.empty(shape, dtype=output)
     elif isinstance(output, str):
         output = numpy.sctypeDict[output]
         if complex_output and cupy.dtype(output).kind != 'c':
             raise RuntimeError("output must have complex dtype")
-        output = cupy.zeros(shape, dtype=output)
+        output = cupy.empty(shape, dtype=output)
     elif output.shape != shape:
         raise RuntimeError("output shape not correct")
     elif complex_output and output.dtype.kind != 'c':
@@ -44,6 +71,13 @@ def _fix_sequence_arg(arg, ndim, name, conv=lambda x: x):
     return lst
 
 
+def _check_origin(origin, width):
+    origin = int(origin)
+    if (width // 2 + origin < 0) or (width // 2 + origin >= width):
+        raise ValueError('invalid origin')
+    return origin
+
+
 def _check_mode(mode):
     if mode not in ('reflect', 'constant', 'nearest', 'mirror', 'wrap',
                     'grid-mirror', 'grid-wrap', 'grid-reflect'):
@@ -52,13 +86,6 @@ def _check_mode(mode):
     return mode
 
 
-def _check_origin(origin, width):
-    origin = int(origin)
-    if (width // 2 + origin < 0) or (width // 2 + origin >= width):
-        raise ValueError('invalid origin')
-    return origin
-
-
 def _get_inttype(input):
     # The integer type to use for indices in the input array
     # The indices actually use byte positions and we can't just use
@@ -70,53 +97,118 @@ def _get_inttype(input):
 
 
 def _generate_boundary_condition_ops(mode, ix, xsize, int_t="int",
-                                     float_ix=False):
+                                     float_ix=False, separate=False):
+    """Generate boundary conditions
+
+    If separate = True, a pair of conditions for the (lower, upper) boundary
+    are provided instead of a single expression.
+    """
     min_func = "fmin" if float_ix else "min"
     max_func = "fmax" if float_ix else "max"
     if mode in ['reflect', 'grid-mirror']:
-        ops = '''
-        if ({ix} < 0) {{
-            {ix} = - 1 -{ix};
-        }}
-        {ix} %= {xsize} * 2;
-        {ix} = {min}({ix}, 2 * {xsize} - 1 - {ix});'''.format(
-            ix=ix, xsize=xsize, min=min_func)
-    elif mode == 'mirror':
-        ops = '''
-        if ({xsize} == 1) {{
-            {ix} = 0;
-        }} else {{
+        if separate:
+            ops_upper = f'''
+            {ix} %= {xsize} * 2;
+            {ix} = {min_func}({ix}, 2 * {xsize} - 1 - {ix});
+            '''
+            ops_lower = f'''
             if ({ix} < 0) {{
-                {ix} = -{ix};
+                {ix} = - 1 -{ix};
             }}
-            {ix} = 1 + ({ix} - 1) % (({xsize} - 1) * 2);
-            {ix} = {min}({ix}, 2 * {xsize} - 2 - {ix});
-        }}'''.format(ix=ix, xsize=xsize, min=min_func)
+            ''' + ops_upper
+            ops = (ops_lower, ops_upper)
+        else:
+            ops = f'''
+            if ({ix} < 0) {{
+                {ix} = - 1 -{ix};
+            }}
+            {ix} %= {xsize} * 2;
+            {ix} = {min_func}({ix}, 2 * {xsize} - 1 - {ix});'''
+    elif mode == 'mirror':
+        if separate:
+            temp1 = f'''
+            if ({xsize} == 1) {{
+                {ix} = 0;
+            }} else {{
+            '''
+            temp2 = f'''
+                if ({ix} < 0) {{
+                    {ix} = -{ix};
+                }}
+            '''
+            temp3 = f'''
+                {ix} = 1 + ({ix} - 1) % (({xsize} - 1) * 2);
+                {ix} = {min_func}({ix}, 2 * {xsize} - 2 - {ix});
+            }}'''
+            ops_lower = temp1 + temp2 + temp3
+            ops_upper = temp1 + temp3
+            ops = (ops_lower, ops_upper)
+        else:
+            ops = f'''
+            if ({xsize} == 1) {{
+                {ix} = 0;
+            }} else {{
+                if ({ix} < 0) {{
+                    {ix} = -{ix};
+                }}
+                {ix} = 1 + ({ix} - 1) % (({xsize} - 1) * 2);
+                {ix} = {min_func}({ix}, 2 * {xsize} - 2 - {ix});
+            }}'''
     elif mode == 'nearest':
-        ops = '''
-        {ix} = {min}({max}(({T}){ix}, ({T})0), ({T})({xsize} - 1));'''.format(
-            ix=ix, xsize=xsize, min=min_func, max=max_func,
-            # force using 64-bit signed integer for ptrdiff_t,
-            # see cupy/cupy#6048
-            T=('int' if int_t == 'int' else 'long long'))
+        T = 'int' if int_t == 'int' else 'long long'
+        if separate:
+            ops_lower = f'''{ix} = {max_func}(({T}){ix}, ({T})0);'''
+            ops_upper = f'''{ix} = {min_func}(({T}){ix}, ({T})({xsize} - 1));'''  # noqa
+            ops = (ops_lower, ops_upper)
+        else:
+            ops = f'''{ix} = {min_func}({max_func}(({T}){ix}, ({T})0), ({T})({xsize} - 1));'''  # noqa
     elif mode == 'grid-wrap':
-        ops = '''
-        {ix} %= {xsize};
-        if ({ix} < 0) {{
-            {ix} += {xsize};
-        }}'''.format(ix=ix, xsize=xsize)
+        if separate:
+            ops_upper = f'''
+            {ix} %= {xsize};
+            '''
+            ops_lower = ops_upper + f'''
+            if ({ix} < 0) {{
+                {ix} += {xsize};
+            }}'''
+            ops = (ops_lower, ops_upper)
+        else:
+            ops = f'''
+            {ix} %= {xsize};
+            if ({ix} < 0) {{
+                {ix} += {xsize};
+            }}'''
+
     elif mode == 'wrap':
-        ops = '''
-        if ({ix} < 0) {{
-            {ix} += ({sz} - 1) * (({int_t})(-{ix} / ({sz} - 1)) + 1);
-        }} else if ({ix} > ({sz} - 1)) {{
-            {ix} -= ({sz} - 1) * ({int_t})({ix} / ({sz} - 1));
-        }};'''.format(ix=ix, sz=xsize, int_t=int_t)
+        if separate:
+            ops_lower = f'''{ix} += ({xsize} - 1) * (({int_t})(-{ix} / ({xsize} - 1)) + 1);'''  # noqa
+            ops_upper = f'''{ix} -= ({xsize} - 1) * ({int_t})({ix} / ({xsize} - 1));'''  # noqa
+            ops = (ops_lower, ops_upper)
+        else:
+            ops = f'''
+            if ({ix} < 0) {{
+                {ix} += ({xsize} - 1) * (({int_t})(-{ix} / ({xsize} - 1)) + 1);
+            }} else if ({ix} > ({xsize} - 1)) {{
+                {ix} -= ({xsize} - 1) * ({int_t})({ix} / ({xsize} - 1));
+            }};'''
     elif mode in ['constant', 'grid-constant']:
-        ops = '''
-        if (({ix} < 0) || {ix} >= {xsize}) {{
-            {ix} = -1;
-        }}'''.format(ix=ix, xsize=xsize)
+        if separate:
+            ops_lower = f'''
+            if ({ix} < 0) {{
+                {ix} = -1;
+            }}'''
+            ops_upper = f'''
+            if ({ix} >= {xsize}) {{
+                {ix} = -1;
+            }}'''
+            # already the (lower, upper) pair: must not be wrapped again
+            ops = (ops_lower, ops_upper)
+        else:
+            # single check covering both boundaries
+            ops = f'''
+            if (({ix} < 0) || {ix} >= {xsize}) {{
+                {ix} = -1;
+            }}'''
     return ops
 
 
diff --git a/python/cucim/src/cucim/skimage/_vendored/_texture.py b/python/cucim/src/cucim/skimage/_vendored/_texture.py
new file mode 100644
index 000000000..5c18bce71
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/_vendored/_texture.py
@@ -0,0 +1,194 @@
+import cupy
+from cupy import _core
+from cupy.cuda import runtime, texture
+
+_affine_transform_2d_array_kernel = _core.ElementwiseKernel(
+    'U texObj, raw float32 m, uint64 width', 'T transformed_image',
+    '''
+    float3 pixel = make_float3(
+        (float)(i / width),
+        (float)(i % width),
+        1.0f
+    );
+    float x = dot(pixel, make_float3(m[0],  m[1],  m[2])) + .5f;
+    float y = dot(pixel, make_float3(m[3],  m[4],  m[5])) + .5f;
+    transformed_image = tex2D<T>(texObj, y, x);
+    ''',
+    'cupyx_texture_affine_transformation_2d_array',
+    preamble='''
+    inline __host__ __device__ float dot(float3 a, float3 b)
+    {
+        return a.x * b.x + a.y * b.y + a.z * b.z;
+    }
+    ''')
+
+
+_affine_transform_3d_array_kernel = _core.ElementwiseKernel(
+    'U texObj, raw float32 m, uint64 height, uint64 width',
+    'T transformed_volume',
+    '''
+    float4 voxel = make_float4(
+        (float)(i / (width * height)),
+        (float)((i % (width * height)) / width),
+        (float)((i % (width * height)) % width),
+        1.0f
+    );
+    float x = dot(voxel, make_float4(m[0],  m[1],  m[2],  m[3])) + .5f;
+    float y = dot(voxel, make_float4(m[4],  m[5],  m[6],  m[7])) + .5f;
+    float z = dot(voxel, make_float4(m[8],  m[9],  m[10], m[11])) + .5f;
+    transformed_volume = tex3D<T>(texObj, z, y, x);
+    ''',
+    'cupyx_texture_affine_transformation_3d_array',
+    preamble='''
+    inline __host__ __device__ float dot(float4 a, float4 b)
+    {
+        return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+    }
+    ''')
+
+
+def _create_texture_object(data,
+                           address_mode: str,
+                           filter_mode: str,
+                           read_mode: str,
+                           border_color=0):
+
+    if cupy.issubdtype(data.dtype, cupy.unsignedinteger):
+        fmt_kind = runtime.cudaChannelFormatKindUnsigned
+    elif cupy.issubdtype(data.dtype, cupy.integer):
+        fmt_kind = runtime.cudaChannelFormatKindSigned
+    elif cupy.issubdtype(data.dtype, cupy.floating):
+        fmt_kind = runtime.cudaChannelFormatKindFloat
+    else:
+        raise ValueError(f'Unsupported data type {data.dtype}')
+
+    if address_mode == 'nearest':
+        address_mode = runtime.cudaAddressModeClamp
+    elif address_mode == 'constant':
+        address_mode = runtime.cudaAddressModeBorder
+    else:
+        raise ValueError(
+            f'Unsupported address mode {address_mode} '
+            '(supported: constant, nearest)')
+
+    if filter_mode == 'nearest':
+        filter_mode = runtime.cudaFilterModePoint
+    elif filter_mode == 'linear':
+        filter_mode = runtime.cudaFilterModeLinear
+    else:
+        raise ValueError(
+            f'Unsupported filter mode {filter_mode} '
+            f'(supported: nearest, linear)')
+
+    if read_mode == 'element_type':
+        read_mode = runtime.cudaReadModeElementType
+    elif read_mode == 'normalized_float':
+        read_mode = runtime.cudaReadModeNormalizedFloat
+    else:
+        raise ValueError(
+            f'Unsupported read mode {read_mode} '
+            '(supported: element_type, normalized_float)')
+
+    texture_fmt = texture.ChannelFormatDescriptor(
+        data.itemsize * 8, 0, 0, 0, fmt_kind)
+    # CUDAArray: last dimension is the fastest changing dimension
+    array = texture.CUDAarray(texture_fmt, *data.shape[::-1])
+    res_desc = texture.ResourceDescriptor(
+        runtime.cudaResourceTypeArray, cuArr=array)
+    # TODO(the-lay): each dimension can have a different addressing mode
+    # TODO(the-lay): border color/value can be defined for up to 4 channels
+    tex_desc = texture.TextureDescriptor(
+        (address_mode, ) * data.ndim, filter_mode, read_mode,
+        borderColors=(border_color, ))
+    tex_obj = texture.TextureObject(res_desc, tex_desc)
+    array.copy_from(data)
+
+    return tex_obj
+
+
+def affine_transformation(data,
+                          transformation_matrix,
+                          output_shape=None,
+                          output=None,
+                          interpolation: str = 'linear',
+                          mode: str = 'constant',
+                          border_value=0):
+    """
+    Apply an affine transformation.
+
+    The method uses texture memory and supports only 2D and 3D float32 arrays
+    without channel dimension.
+
+    Args:
+        data (cupy.ndarray): The input array or texture object.
+        transformation_matrix (cupy.ndarray): Affine transformation matrix.
+            Must be homogeneous and have shape ``(ndim + 1, ndim + 1)``.
+        output_shape (tuple of ints): Shape of output. If not specified,
+            the input array shape is used. Default is None.
+        output (cupy.ndarray or ~cupy.dtype): The array in which to place the
+            output, or the dtype of the returned array. If not specified,
+            creates the output array with shape of ``output_shape``. Default is
+            None.
+        interpolation (str): Specifies interpolation mode: ``'linear'`` or
+            ``'nearest'``. Default is ``'linear'``.
+        mode (str): Specifies addressing mode for points outside of the array:
+            (``'constant'``, ``'nearest'``). Default is ``'constant'``.
+        border_value: Specifies value to be used for coordinates outside
+            of the array for ``'constant'`` mode. Default is 0.
+
+    Returns:
+        cupy.ndarray:
+            The transformed input.
+
+    .. seealso:: :func:`cupyx.scipy.ndimage.affine_transform`
+    """
+
+    ndim = data.ndim
+    if (ndim < 2) or (ndim > 3):
+        raise ValueError(
+            'Texture memory affine transformation is defined only for '
+            '2D and 3D arrays without channel dimension.')
+
+    dtype = data.dtype
+    if dtype != cupy.float32:
+        raise ValueError(f'Texture memory affine transformation is available '
+                         f'only for float32 data type (not {dtype})')
+
+    if interpolation not in ['linear', 'nearest']:
+        raise ValueError(
+            f'Unsupported interpolation {interpolation} '
+            f'(supported: linear, nearest)')
+
+    if transformation_matrix.shape != (ndim + 1, ndim + 1):
+        raise ValueError('Matrix must be have shape (ndim + 1, ndim + 1)')
+
+    texture_object = _create_texture_object(data,
+                                            address_mode=mode,
+                                            filter_mode=interpolation,
+                                            read_mode='element_type',
+                                            border_color=border_value)
+
+    if ndim == 2:
+        kernel = _affine_transform_2d_array_kernel
+    else:
+        kernel = _affine_transform_3d_array_kernel
+
+    if output_shape is None:
+        output_shape = data.shape
+
+    if output is None:
+        output = cupy.zeros(output_shape, dtype=dtype)
+    elif isinstance(output, (type, cupy.dtype)):
+        if output != cupy.float32:
+            raise ValueError(f'Texture memory affine transformation is '
+                             f'available only for float32 data type (not '
+                             f'{output})')
+        output = cupy.zeros(output_shape, dtype=output)
+    elif isinstance(output, cupy.ndarray):
+        if output.shape != output_shape:
+            raise ValueError('Output shapes do not match')
+    else:
+        raise ValueError('Output must be None, cupy.ndarray or cupy.dtype')
+
+    kernel(texture_object, transformation_matrix, *output_shape[1:], output)
+    return output
diff --git a/python/cucim/src/cucim/skimage/_vendored/ndimage.py b/python/cucim/src/cucim/skimage/_vendored/ndimage.py
new file mode 100644
index 000000000..cd97578a8
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/_vendored/ndimage.py
@@ -0,0 +1,82 @@
+# locally defined filters that are more efficient than in CuPy
+from cucim.skimage._vendored._ndimage_filters import correlate  # NOQA
+from cucim.skimage._vendored._ndimage_filters import convolve  # NOQA
+from cucim.skimage._vendored._ndimage_filters import correlate1d  # NOQA
+from cucim.skimage._vendored._ndimage_filters import convolve1d  # NOQA
+from cucim.skimage._vendored._ndimage_filters import uniform_filter1d  # NOQA
+from cucim.skimage._vendored._ndimage_filters import uniform_filter  # NOQA
+from cucim.skimage._vendored._ndimage_filters import gaussian_filter1d  # NOQA
+from cucim.skimage._vendored._ndimage_filters import gaussian_filter  # NOQA
+from cucim.skimage._vendored._ndimage_filters import prewitt  # NOQA
+from cucim.skimage._vendored._ndimage_filters import sobel  # NOQA
+from cucim.skimage._vendored._ndimage_filters import generic_laplace  # NOQA
+from cucim.skimage._vendored._ndimage_filters import laplace  # NOQA
+from cucim.skimage._vendored._ndimage_filters import gaussian_laplace  # NOQA
+from cucim.skimage._vendored._ndimage_filters import generic_gradient_magnitude  # NOQA
+from cucim.skimage._vendored._ndimage_filters import gaussian_gradient_magnitude  # NOQA
+from cucim.skimage._vendored._ndimage_filters import minimum_filter  # NOQA
+from cucim.skimage._vendored._ndimage_filters import maximum_filter  # NOQA
+from cucim.skimage._vendored._ndimage_filters import minimum_filter1d  # NOQA
+from cucim.skimage._vendored._ndimage_filters import maximum_filter1d  # NOQA
+from cucim.skimage._vendored._ndimage_filters import median_filter  # NOQA
+from cucim.skimage._vendored._ndimage_filters import rank_filter  # NOQA
+from cucim.skimage._vendored._ndimage_filters import percentile_filter  # NOQA
+
+# interpolation
+from cucim.skimage._vendored._ndimage_interpolation import affine_transform  # NOQA
+from cucim.skimage._vendored._ndimage_interpolation import map_coordinates  # NOQA
+from cucim.skimage._vendored._ndimage_interpolation import rotate  # NOQA
+from cucim.skimage._vendored._ndimage_interpolation import shift  # NOQA
+from cucim.skimage._vendored._ndimage_interpolation import spline_filter  # NOQA
+from cucim.skimage._vendored._ndimage_interpolation import spline_filter1d  # NOQA
+from cucim.skimage._vendored._ndimage_interpolation import zoom  # NOQA
+
+# morphology
+from cucim.skimage._vendored._ndimage_morphology import generate_binary_structure  # NOQA
+from cucim.skimage._vendored._ndimage_morphology import iterate_structure  # NOQA
+from cucim.skimage._vendored._ndimage_morphology import binary_erosion  # NOQA
+from cucim.skimage._vendored._ndimage_morphology import binary_dilation  # NOQA
+from cucim.skimage._vendored._ndimage_morphology import binary_opening  # NOQA
+from cucim.skimage._vendored._ndimage_morphology import binary_closing  # NOQA
+from cucim.skimage._vendored._ndimage_morphology import binary_hit_or_miss  # NOQA
+from cucim.skimage._vendored._ndimage_morphology import binary_fill_holes  # NOQA
+from cucim.skimage._vendored._ndimage_morphology import binary_propagation  # NOQA
+from cucim.skimage._vendored._ndimage_morphology import grey_erosion  # NOQA
+from cucim.skimage._vendored._ndimage_morphology import grey_dilation  # NOQA
+from cucim.skimage._vendored._ndimage_morphology import grey_closing  # NOQA
+from cucim.skimage._vendored._ndimage_morphology import grey_opening  # NOQA
+from cucim.skimage._vendored._ndimage_morphology import morphological_gradient  # NOQA
+from cucim.skimage._vendored._ndimage_morphology import morphological_laplace  # NOQA
+from cucim.skimage._vendored._ndimage_morphology import white_tophat  # NOQA
+from cucim.skimage._vendored._ndimage_morphology import black_tophat  # NOQA
+
+# Import the rest of the cupyx.scipy.ndimage API here
+
+# additional filters
+from cupyx.scipy.ndimage import generic_filter  # NOQA
+from cupyx.scipy.ndimage import generic_filter1d  # NOQA
+
+# fourier filters
+from cupyx.scipy.ndimage import fourier_ellipsoid  # NOQA
+from cupyx.scipy.ndimage import fourier_gaussian  # NOQA
+from cupyx.scipy.ndimage import fourier_shift  # NOQA
+from cupyx.scipy.ndimage import fourier_uniform  # NOQA
+
+# measurements
+from cupyx.scipy.ndimage import label  # NOQA
+try:
+    from cupyx.scipy.ndimage import sum_labels  # NOQA
+except ImportError:
+    from cupyx.scipy.ndimage import sum as sum_labels  # NOQA
+from cupyx.scipy.ndimage import mean  # NOQA
+from cupyx.scipy.ndimage import variance  # NOQA
+from cupyx.scipy.ndimage import standard_deviation  # NOQA
+from cupyx.scipy.ndimage import minimum  # NOQA
+from cupyx.scipy.ndimage import maximum  # NOQA
+from cupyx.scipy.ndimage import minimum_position  # NOQA
+from cupyx.scipy.ndimage import maximum_position  # NOQA
+from cupyx.scipy.ndimage import median  # NOQA
+from cupyx.scipy.ndimage import extrema  # NOQA
+from cupyx.scipy.ndimage import center_of_mass  # NOQA
+from cupyx.scipy.ndimage import histogram  # NOQA
+from cupyx.scipy.ndimage import labeled_comprehension  # NOQA
diff --git a/python/cucim/src/cucim/skimage/_vendored/signaltools.py b/python/cucim/src/cucim/skimage/_vendored/signaltools.py
index c54415e99..3184055fa 100644
--- a/python/cucim/src/cucim/skimage/_vendored/signaltools.py
+++ b/python/cucim/src/cucim/skimage/_vendored/signaltools.py
@@ -424,7 +424,7 @@ def choose_conv_method(in1, in2, mode="full", measure=False):
     returns `direct` (e.g., to protect against floating point integer
     precision).
 
-    .. versionadded:: 0.19
+    .. versionadded:: 22.02.00
 
     Examples
     --------
diff --git a/python/cucim/src/cucim/skimage/exposure/exposure.py b/python/cucim/src/cucim/skimage/exposure/exposure.py
index f7c0eeb4f..2b16508f2 100644
--- a/python/cucim/src/cucim/skimage/exposure/exposure.py
+++ b/python/cucim/src/cucim/skimage/exposure/exposure.py
@@ -29,9 +29,7 @@ def _offset_array(arr, low_boundary, high_boundary):
             # prevent overflow errors when offsetting
             arr = arr.astype(offset_dtype)
         arr = arr - offset
-    else:
-        offset = 0
-    return arr, offset
+    return arr
 
 
 def _bincount_histogram_centers(image, source_range):
@@ -75,8 +73,10 @@ def _bincount_histogram(image, source_range, bin_centers=None):
     if bin_centers is None:
         bin_centers = _bincount_histogram_centers(image, source_range)
     image_min, image_max = bin_centers[0], bin_centers[-1]
-    image, offset = _offset_array(image, image_min.item(), image_max.item())  # synchronize  # noqa
-    hist = cp.bincount(image.ravel(), minlength=image_max - image_min + 1)
+    image = _offset_array(image, image_min.item(), image_max.item())  # synchronize  # noqa
+    hist = cp.bincount(
+        image.ravel(), minlength=image_max - min(image_min, 0) + 1
+    )
     if source_range == 'image':
         idx = max(image_min, 0)
         hist = hist[idx:]
diff --git a/python/cucim/src/cucim/skimage/exposure/tests/test_exposure.py b/python/cucim/src/cucim/skimage/exposure/tests/test_exposure.py
index f27c8fff7..c0f6792ff 100644
--- a/python/cucim/src/cucim/skimage/exposure/tests/test_exposure.py
+++ b/python/cucim/src/cucim/skimage/exposure/tests/test_exposure.py
@@ -21,7 +21,9 @@
 def test_wrong_source_range():
     im = cp.array([-1, 100], dtype=cp.int8)
     with pytest.raises(ValueError):
-        frequencies, bin_centers = exposure.histogram(im, source_range="foobar")
+        frequencies, bin_centers = exposure.histogram(
+            im, source_range="foobar"
+        )
 
 
 def test_negative_overflow():
@@ -50,6 +52,15 @@ def test_int_range_image():
     assert bin_centers[-1] == 100
 
 
+def test_multichannel_int_range_image():
+    im = cp.array([[10, 5], [100, 102]], dtype=np.int8)
+    frequencies, bin_centers = exposure.histogram(im, channel_axis=-1)
+    for ch in range(im.shape[-1]):
+        assert len(frequencies[ch]) == len(bin_centers)
+    assert bin_centers[0] == 5
+    assert bin_centers[-1] == 102
+
+
 def test_peak_uint_range_dtype():
     im = cp.array([10, 100], dtype=cp.uint8)
     frequencies, bin_centers = exposure.histogram(im, source_range="dtype")
@@ -286,11 +297,7 @@ def test_rescale_in_range_clip():
 def test_rescale_out_range(dtype):
     """Check that output range is correct.
 
-    .. versionchanged:: 0.17
-        This function used to return dtype matching the input dtype. It now
-        matches the output.
-
-    .. versionchanged:: 0.19
+    .. versionchanged:: 22.02.00
         float16 and float32 inputs now result in float32 output. Formerly they
         would give float64 outputs.
     """
diff --git a/python/cucim/src/cucim/skimage/feature/_basic_features.py b/python/cucim/src/cucim/skimage/feature/_basic_features.py
index 6953b846d..9807906d7 100644
--- a/python/cucim/src/cucim/skimage/feature/_basic_features.py
+++ b/python/cucim/src/cucim/skimage/feature/_basic_features.py
@@ -6,6 +6,7 @@
 import cupy as cp
 import numpy as np
 
+from .._shared._gradient import gradient
 from cucim.skimage import feature, filters
 from cucim.skimage._shared import utils
 from cucim.skimage.util import img_as_float32
@@ -14,7 +15,7 @@
 def _texture_filter(gaussian_filtered):
     combos = combinations_with_replacement
     H_elems = [
-        cp.gradient(cp.gradient(gaussian_filtered)[ax0], axis=ax1)
+        gradient(gradient(gaussian_filtered)[ax0], axis=ax1)
         for ax0, ax1 in combos(range(gaussian_filtered.ndim), 2)
     ]
     eigvals = feature.hessian_matrix_eigvals(H_elems)
diff --git a/python/cucim/src/cucim/skimage/feature/_canny.py b/python/cucim/src/cucim/skimage/feature/_canny.py
index b9d0e7c80..56d0f5cab 100644
--- a/python/cucim/src/cucim/skimage/feature/_canny.py
+++ b/python/cucim/src/cucim/skimage/feature/_canny.py
@@ -12,7 +12,7 @@
 Original author: Lee Kamentsky
 """
 import cupy as cp
-import cupyx.scipy.ndimage as ndi
+import cucim.skimage._vendored.ndimage as ndi
 
 from cucim.skimage.util import dtype_limits
 
@@ -201,7 +201,7 @@ def _generate_nonmaximum_suppression_op(large_int=False):
 @cp.memoize(for_each_device=True)
 def _get_nonmax_kernel(large_int=False):
     in_params = ('raw T isobel, raw T jsobel, raw T magnitude, '
-                 'raw uint8 eroded_mask, T low_threshold')
+                 'raw uint8 eroded_mask, float64 low_threshold')
     out_params = 'T out'
     name = 'cupyx_skimage_canny_nonmaximum_suppression'
     if large_int:
diff --git a/python/cucim/src/cucim/skimage/feature/corner.py b/python/cucim/src/cucim/skimage/feature/corner.py
index 65b90315f..f52f98ad7 100644
--- a/python/cucim/src/cucim/skimage/feature/corner.py
+++ b/python/cucim/src/cucim/skimage/feature/corner.py
@@ -3,12 +3,13 @@
 
 import cupy as cp
 import numpy as np
-from cupyx.scipy import ndimage as ndi
+import cucim.skimage._vendored.ndimage as ndi
 from scipy import spatial  # TODO: use RAPIDS cuSpatial?
 
 from cucim.skimage.util import img_as_float
 
 # from ..transform import integral_image
+from .._shared._gradient import gradient
 from .._shared.utils import _supported_float_type
 from .peak import peak_local_max
 from .util import _prepare_grayscale_input_nD
@@ -217,14 +218,14 @@ def hessian_matrix(image, sigma=1, mode='constant', cval=0, order='rc'):
     gaussian_filtered = gaussian(image, sigma=sigma, mode=mode, cval=cval,
                                  channel_axis=channel_axis)
 
-    gradients = cp.gradient(gaussian_filtered)
+    gradients = gradient(gaussian_filtered)
     axes = range(image.ndim)
 
     if order == "rc":
         axes = reversed(axes)
 
     H_elems = [
-        cp.gradient(gradients[ax0], axis=ax1)
+        gradient(gradients[ax0], axis=ax1)
         for ax0, ax1 in combinations_with_replacement(axes, 2)
     ]
 
diff --git a/python/cucim/src/cucim/skimage/feature/peak.py b/python/cucim/src/cucim/skimage/feature/peak.py
index eff5f6735..fbb2570f7 100644
--- a/python/cucim/src/cucim/skimage/feature/peak.py
+++ b/python/cucim/src/cucim/skimage/feature/peak.py
@@ -1,7 +1,7 @@
 from warnings import warn
 
 import cupy as cp
-import cupyx.scipy.ndimage as ndi
+import cucim.skimage._vendored.ndimage as ndi
 import numpy as np
 from scipy.ndimage import find_objects as cpu_find_objects
 
diff --git a/python/cucim/src/cucim/skimage/feature/tests/test_canny.py b/python/cucim/src/cucim/skimage/feature/tests/test_canny.py
index a10006b95..6e06114fc 100644
--- a/python/cucim/src/cucim/skimage/feature/tests/test_canny.py
+++ b/python/cucim/src/cucim/skimage/feature/tests/test_canny.py
@@ -1,6 +1,5 @@
-import unittest
-
 import cupy as cp
+import pytest
 from cupy.testing import assert_array_equal
 from cupyx.scipy.ndimage import binary_dilation, binary_erosion
 from skimage import data
@@ -9,18 +8,18 @@
 from cucim.skimage.util import img_as_float
 
 
-class TestCanny(unittest.TestCase):
+class TestCanny():
     def test_00_00_zeros(self):
         """Test that the Canny filter finds no points for a blank field"""
         result = feature.canny(cp.zeros((20, 20)), 4, 0, 0, cp.ones((20, 20),
                                bool))
-        self.assertFalse(cp.any(result))
+        assert not cp.any(result)
 
     def test_00_01_zeros_mask(self):
         """Test that the Canny filter finds no points in a masked image"""
         result = (feature.canny(cp.random.uniform(size=(20, 20)), 4, 0, 0,
                                 cp.zeros((20, 20), bool)))
-        self.assertFalse(cp.any(result))
+        assert not cp.any(result)
 
     def test_01_01_circle(self):
         """Test that the Canny filter finds the outlines of a circle"""
@@ -36,7 +35,7 @@ def test_01_01_circle(self):
         cd = binary_dilation(c, iterations=3, brute_force=True)
         ce = binary_erosion(c, iterations=3, brute_force=True)
         cde = cp.logical_and(cd, cp.logical_not(ce))
-        self.assertTrue(cp.all(cde[result]))
+        assert cp.all(cde[result])
         #
         # The circle has a radius of 100. There are two rings here, one
         # for the inside edge and one for the outside. So that's
@@ -44,8 +43,8 @@ def test_01_01_circle(self):
         # The edge contains both pixels if there's a tie, so we
         # bump the count a little.
         point_count = cp.sum(result)
-        self.assertTrue(point_count > 1200)
-        self.assertTrue(point_count < 1600)
+        assert point_count > 1200
+        assert point_count < 1600
 
     def test_01_02_circle_with_noise(self):
         """Test that the Canny filter finds the circle outlines
@@ -62,24 +61,30 @@ def test_01_02_circle_with_noise(self):
         cd = binary_dilation(c, iterations=4, brute_force=True)
         ce = binary_erosion(c, iterations=4, brute_force=True)
         cde = cp.logical_and(cd, cp.logical_not(ce))
-        self.assertTrue(cp.all(cde[result]))
+        assert cp.all(cde[result])
         point_count = cp.sum(result)
-        self.assertTrue(point_count > 1200)
-        self.assertTrue(point_count < 1600)
+        assert point_count > 1200
+        assert point_count < 1600
 
     def test_image_shape(self):
-        self.assertRaises(ValueError, feature.canny, cp.zeros((20, 20, 20)), 4,
-                          0, 0)
+        with pytest.raises(ValueError):
+            feature.canny(cp.zeros((20, 20, 20)), 4, 0, 0)
 
     def test_mask_none(self):
         result1 = feature.canny(cp.zeros((20, 20)), 4, 0, 0, cp.ones((20, 20),
                                 bool))
         result2 = feature.canny(cp.zeros((20, 20)), 4, 0, 0)
-        self.assertTrue(cp.all(result1 == result2))
+        assert cp.all(result1 == result2)
 
     @cp.testing.with_requires("scikit-image>=0.18")
-    def test_use_quantiles(self):
-        image = img_as_float(cp.asarray(data.camera()[::100, ::100]))
+    @pytest.mark.parametrize('image_dtype', [cp.uint8, cp.int64, cp.float32,
+                                             cp.float64])
+    def test_use_quantiles(self, image_dtype):
+        dtype = cp.dtype(image_dtype)
+        image = cp.asarray(data.camera()[::100, ::100])
+        if dtype.kind == 'f':
+            image = img_as_float(image)
+        image = image.astype(dtype)
 
         # Correct output produced manually with quantiles
         # of 0.8 and 0.6 for high and low respectively
@@ -96,24 +101,33 @@ def test_use_quantiles(self):
 
         assert_array_equal(result, correct_output)
 
+    def test_img_all_ones(self):
+        image = cp.ones((10, 10))
+        assert cp.all(feature.canny(image) == 0)
+
     def test_invalid_use_quantiles(self):
         image = img_as_float(cp.array(data.camera()[::50, ::50]))
 
-        self.assertRaises(ValueError, feature.canny, image, use_quantiles=True,
+        with pytest.raises(ValueError):
+            feature.canny(image, use_quantiles=True,
                           low_threshold=0.5, high_threshold=3.6)
 
-        self.assertRaises(ValueError, feature.canny, image, use_quantiles=True,
+        with pytest.raises(ValueError):
+            feature.canny(image, use_quantiles=True,
                           low_threshold=-5, high_threshold=0.5)
 
-        self.assertRaises(ValueError, feature.canny, image, use_quantiles=True,
+        with pytest.raises(ValueError):
+            feature.canny(image, use_quantiles=True,
                           low_threshold=99, high_threshold=0.9)
 
-        self.assertRaises(ValueError, feature.canny, image, use_quantiles=True,
+        with pytest.raises(ValueError):
+            feature.canny(image, use_quantiles=True,
                           low_threshold=0.5, high_threshold=-100)
 
         # Example from issue #4282
         image = data.camera()
-        self.assertRaises(ValueError, feature.canny, image, use_quantiles=True,
+        with pytest.raises(ValueError):
+            feature.canny(image, use_quantiles=True,
                           low_threshold=50, high_threshold=150)
 
     def test_dtype(self):
diff --git a/python/cucim/src/cucim/skimage/filters/_gabor.py b/python/cucim/src/cucim/skimage/filters/_gabor.py
index e55720357..df4c041c4 100644
--- a/python/cucim/src/cucim/skimage/filters/_gabor.py
+++ b/python/cucim/src/cucim/skimage/filters/_gabor.py
@@ -1,7 +1,8 @@
 import math
 
 import cupy as cp
-from cupyx.scipy import ndimage as ndi
+import cucim.skimage._vendored.ndimage as ndi
+
 
 from .._shared.utils import _supported_float_type, check_nD, warn
 
diff --git a/python/cucim/src/cucim/skimage/filters/_median.py b/python/cucim/src/cucim/skimage/filters/_median.py
index d0da048c9..2041f4161 100644
--- a/python/cucim/src/cucim/skimage/filters/_median.py
+++ b/python/cucim/src/cucim/skimage/filters/_median.py
@@ -1,15 +1,27 @@
 from warnings import warn
 
+import cupy as cp
 import numpy as np
-from cupyx.scipy import ndimage as ndi
 
+import cucim.skimage._vendored.ndimage as ndi
 from .._shared.utils import deprecate_kwarg
+from ._median_hist import _can_use_histogram, _median_hist, KernelResourceError
 
+try:
+    from math import prod
+except ImportError:
+    from functools import reduce
+    from operator import mul
 
-@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'}, removed_version="1.0",
-                 deprecated_version="0.19")
+    def prod(x):
+        return reduce(mul, x)
+
+
+@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'},
+                 removed_version="23.02.00",
+                 deprecated_version="22.02.00")
 def median(image, footprint=None, out=None, mode='nearest', cval=0.0,
-           behavior='ndimage'):
+           behavior='ndimage', *, algorithm='auto', algorithm_kwargs={}):
     """Return local median of an image.
 
     Parameters
@@ -47,6 +59,20 @@ def median(image, footprint=None, out=None, mode='nearest', cval=0.0,
         .. versionchanged:: 0.16
            Default ``behavior`` has been changed from 'rank' to 'ndimage'
 
+    Other Parameters
+    ----------------
+    algorithm : {'auto', 'histogram', 'sorting'}
+        Determines which algorithm is used to compute the median. The default
+        of 'auto' will attempt to use a histogram-based algorithm for 2D
+        images with 8 or 16-bit integer data types. Otherwise a sorting-based
+        algorithm will be used. Note: this parameter is cuCIM-specific and does
+        not exist in upstream scikit-image.
+    algorithm_kwargs : dict
+        Any additional algorithm-specific keywords. Currently can only be used
+        to set the number of parallel partitions for the 'histogram' algorithm.
+        (e.g. ``algorithm_kwargs={'partitions': 256}``). Note: this parameter
+        is cuCIM-specific and does not exist in upstream scikit-image.
+
     Returns
     -------
     out : 2-D array (same dtype as input image)
@@ -58,6 +84,22 @@ def median(image, footprint=None, out=None, mode='nearest', cval=0.0,
         filtering offering more flexibility with additional parameters but
         dedicated for unsigned integer images.
 
+    Notes
+    -----
+    An efficient, histogram-based median filter as described in [1]_ is faster
+    than the sorting based approach for larger kernel sizes (e.g. greater than
+    13x13 or so in 2D). It has near-constant run time regardless of the kernel
+    size. The algorithm presented in [1]_ has been adapted to additional bit
+    depths here. When algorithm='auto', the histogram-based algorithm will be
+    chosen for integer-valued images with sufficiently large footprint size.
+    Otherwise, the sorting-based approach is used.
+
+    References
+    ----------
+    .. [1] O. Green, "Efficient Scalable Median Filtering Using Histogram-Based
+       Operations," in IEEE Transactions on Image Processing, vol. 27, no. 5,
+       pp. 2217-2228, May 2018, https://doi.org/10.1109/TIP.2017.2781375.
+
     Examples
     --------
     >>> import cupy as cp
@@ -79,5 +121,65 @@ def median(image, footprint=None, out=None, mode='nearest', cval=0.0,
 
     if footprint is None:
         footprint = ndi.generate_binary_structure(image.ndim, image.ndim)
+
+    if algorithm == 'sorting':
+        can_use_histogram = False
+    elif algorithm in ['auto', 'histogram']:
+        can_use_histogram, reason = _can_use_histogram(image, footprint)
+    else:
+        raise ValueError(f"unknown algorithm: {algorithm}")
+
+    if algorithm == 'histogram' and not can_use_histogram:
+        raise ValueError(
+            "The histogram-based algorithm was requested, but it cannot "
+            f"be used for this image and footprint (reason: {reason})."
+        )
+
+    # The sorting-based implementation in CuPy is faster for small footprints.
+    # Empirically, shapes of (13, 13) and above on RTX A6000 have faster
+    # execution for the histogram-based approach.
+    use_histogram = can_use_histogram
+    if algorithm == 'auto':
+        # prefer sorting-based algorithm if footprint shape is small
+        use_histogram = use_histogram and prod(footprint.shape) > 150
+
+    if use_histogram:
+        try:
+            # as in SciPy, a user-provided `out` can be an array or a dtype
+            output_array_provided = False
+            out_dtype = None
+            if out is not None:
+                output_array_provided = isinstance(out, cp.ndarray)
+                if not output_array_provided:
+                    try:
+                        out_dtype = cp.dtype(out)
+                    except TypeError:
+                        raise TypeError(
+                            "out must be either a cupy.array or a valid input "
+                            "to cupy.dtype"
+                        )
+
+            # TODO: Can't currently pass an output array into _median_hist as a
+            #       new array currently needs to be created during padding.
+            temp = _median_hist(image, footprint, mode=mode, cval=cval,
+                                **algorithm_kwargs)
+            if output_array_provided:
+                out[:] = temp
+            else:
+                if out_dtype is not None:
+                    temp = temp.astype(out_dtype, copy=False)
+                out = temp
+            return out
+        except KernelResourceError as e:
+            # Fall back to sorting-based implementation if we encounter a
+            # resource limit (e.g. insufficient shared memory per block).
+            warn("Kernel resource error encountered in histogram-based "
+                 f"median kernel: {e}\n"
+                 "Falling back to sorting-based median instead.")
+
+    if algorithm_kwargs:
+        warn(f"algorithm_kwargs={algorithm_kwargs} ignored for sorting-based "
+             f"algorithm")
+
     return ndi.median_filter(image, footprint=footprint, output=out, mode=mode,
                              cval=cval)
diff --git a/python/cucim/src/cucim/skimage/filters/_median_hist.py b/python/cucim/src/cucim/skimage/filters/_median_hist.py
new file mode 100644
index 000000000..0700238c0
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/filters/_median_hist.py
@@ -0,0 +1,515 @@
+import math
+import os
+from collections import namedtuple
+from textwrap import dedent
+
+import cupy as cp
+import numpy as np
+
+from .._shared.utils import _to_np_mode
+
+if hasattr(math, 'prod'):
+    prod = math.prod
+else:
+    prod = np.prod
+
+
+def _dtype_to_CUDA_int_type(dtype):
+    cpp_int_types = {
+        cp.uint8: 'unsigned char',
+        cp.uint16: 'unsigned short',
+        cp.uint32: 'unsigned int',
+        cp.uint64: 'unsigned long long',
+        cp.int8: 'signed char',
+        cp.int16: 'short',
+        cp.int32: 'int',
+        cp.int64: 'long long',
+    }
+    dtype = cp.dtype(dtype)
+    if dtype.type not in cpp_int_types:
+        raise ValueError(f"unrecognized dtype: {dtype.type}")
+    return cpp_int_types[dtype.type]
+
+
+def _get_hist_dtype(footprint_shape):
+    """Determine C++ type and cupy.dtype to use for the histogram."""
+    max_possible_count = prod(footprint_shape)
+
+    if max_possible_count < 128:
+        dtype = cp.int8
+    elif max_possible_count < 32768:
+        dtype = cp.int16
+    else:
+        dtype = cp.int32
+    return _dtype_to_CUDA_int_type(dtype), dtype
+
+
+def _gen_global_definitions(
+    image_t='unsigned char',
+    hist_offset=0,
+    hist_int_t='int',
+    hist_size=256,
+    hist_size_coarse=8
+):
+    """Generate C++ #define statements needed for the CUDA kernels.
+
+    The definitions used depend on the number of histogram bins and the
+    histogram data type.
+    """
+
+    if hist_size % hist_size_coarse != 0:
+        raise ValueError(
+            "`hist_size` must be a multiple of `hist_size_coarse`"
+        )
+    hist_size_fine = hist_size // hist_size_coarse
+    log2_coarse = math.log2(hist_size_coarse)
+    log2_fine = math.log2(hist_size_fine)
+    if abs(math.remainder(log2_coarse, 1)) > 1e-7:
+        raise ValueError("hist_size_coarse must be a power of two")
+    elif abs(math.remainder(log2_fine, 1)) > 1e-7:
+        raise ValueError("hist_size_fine must be a power of two")
+    else:
+        log2_coarse = round(log2_coarse)
+        log2_fine = round(log2_fine)
+
+    global_defs = f"""
+#define HIST_SIZE {hist_size}
+#define HIST_SIZE_COARSE {hist_size_coarse}
+#define HIST_SIZE_FINE {hist_size_fine}
+#define HIST_INT_T {hist_int_t}
+#define HIST_OFFSET {hist_offset}
+#define IMAGE_T {image_t}
+#define LOG2_COARSE {log2_coarse}
+#define LOG2_FINE {log2_fine}
+    """
+    return global_defs
+
+
+# TODO: look into using CUB's Block-wise collectives (e.g. BlockScan)
+def _gen_preamble_median(hist_size_coarse, hist_size_fine):
+    """Generate bin size-dependent reduction functions.
+
+    This helper generates the C++ code for the following two functions.
+
+    histogramMedianParCoarseLookupOnly
+    histogramMedianParFineLookupOnly
+
+    For each of these the number of calls to scanNeighbor is equal to
+    log2 of the number of bins.
+    """
+    n_log2_coarse = math.log2(hist_size_coarse)
+    if hist_size_coarse < 2 or n_log2_coarse % 1.0 != 0:
+        raise ValueError("hist_size_coarse must be a positive power of 2")
+
+    n_log2_fine = math.log2(hist_size_fine)
+    if hist_size_fine < 2 or n_log2_fine % 1.0 != 0:
+        raise ValueError("hist_size_fine must be a positive power of 2")
+
+    ops = """
+
+        #define scanNeighbor(array, range, index, threadIndex)         \\
+            {                                                          \\
+                HIST_INT_T v = 0;                                      \\
+                if (index <= threadIndex && threadIndex < range)       \\
+                    v = array[threadIndex] + array[threadIndex-index]; \\
+                __syncthreads();                                       \\
+                if (index <= threadIndex && threadIndex < range)       \\
+                    array[threadIndex] = v;                            \\
+            }
+
+        #define findMedian(array, range, threadIndex, result, count, position)         \\
+            if (threadIndex < range)                                                   \\
+            {                                                                          \\
+                if (array[threadIndex+1] > position && array[threadIndex] <= position) \\
+                {                                                                      \\
+                    *result = threadIndex+1;                                           \\
+                    *count  = array[threadIndex];                                      \\
+                }                                                                      \\
+            }
+
+        __device__ void histogramMedianParCoarseLookupOnly(HIST_INT_T* H, HIST_INT_T* Hscan, const int medPos, int* retval, int* countAtMed){
+            int tx=threadIdx.x;
+            *retval=*countAtMed=0;
+            if(tx<HIST_SIZE_COARSE){
+                Hscan[tx]=H[tx];
+            }
+            __syncthreads();\n"""  # noqa
+
+    for d in range(round(n_log2_coarse)):
+        ops += f"""
+            scanNeighbor(Hscan, {hist_size_coarse}, {2**d}, tx);
+            __syncthreads();"""
+    ops += f"""
+            findMedian(Hscan, {hist_size_coarse - 1}, tx, retval, countAtMed, medPos);
+        }}"""  # noqa
+
+    ops += """
+
+        __device__ void histogramMedianParFineLookupOnly(HIST_INT_T* H, HIST_INT_T* Hscan, const int medPos, int* retval, int* countAtMed){
+            int tx=threadIdx.x;
+            *retval=*countAtMed=0;
+            if(tx<HIST_SIZE_FINE){
+                Hscan[tx]=H[tx];
+            }
+            __syncthreads();\n"""  # noqa
+
+    for d in range(round(n_log2_fine)):
+        ops += f"""
+            scanNeighbor(Hscan, {hist_size_fine}, {2**d}, tx);
+            __syncthreads();"""
+    ops += f"""
+            findMedian(Hscan, {hist_size_fine - 1}, tx, retval, countAtMed, medPos);
+        }}\n"""  # noqa
+
+    return dedent(ops)
+
+
+def _gen_median_kernel_preamble(
+    image_t, hist_offset, hist_int_t, hist_size=256, hist_size_coarse=8
+):
+    src = _gen_global_definitions(
+        image_t=image_t,
+        hist_offset=hist_offset,
+        hist_int_t=hist_int_t,
+        hist_size=hist_size,
+        hist_size_coarse=hist_size_coarse,
+    )
+
+    hist_size_fine = hist_size // hist_size_coarse
+    src += _gen_preamble_median(
+        hist_size_coarse=hist_size_coarse, hist_size_fine=hist_size_fine
+    )
+    return src
+
+
+@cp.memoize(for_each_device=True)
+def _get_median_rawkernel(
+    image_t, hist_offset, hist_int_t, hist_size=256, hist_size_coarse=8
+):
+    preamble = _gen_median_kernel_preamble(
+        image_t=image_t,
+        hist_offset=hist_offset,
+        hist_int_t=hist_int_t,
+        hist_size=hist_size,
+        hist_size_coarse=hist_size_coarse,
+    )
+
+    kernel_directory = os.path.join(os.path.dirname(__file__), 'cuda')
+    with open(os.path.join(kernel_directory, 'histogram_median.cu'), 'rt') as f:
+        rank_filter_kernel = '\n'.join(f.readlines())
+
+    return cp.RawKernel(
+        code=preamble + rank_filter_kernel,
+        name="cuRankFilterMultiBlock",
+    )
+
+
+def _check_shared_memory_requirement_bytes(
+    hist_dtype, hist_size_coarse, hist_size_fine
+):
+    """computes amount of shared memory required by cuRankFilterMultiBlock"""
+    s = np.dtype(hist_dtype).itemsize
+    shared_size = hist_size_coarse * s                     # for HCoarse
+    shared_size += hist_size_fine * s                      # for HCoarseScan
+    shared_size += hist_size_coarse * hist_size_fine * s   # for HFine
+    shared_size += hist_size_coarse * 4                    # for luc
+    shared_size += 12                                      # three more ints
+    return shared_size
+
+
+def _check_global_scratch_space_size(
+    image_shape, hist_size, hist_size_coarse, hist_dtype, partitions
+):
+    """Determine amount of histogram scratch space that will be allocated.
+
+    Returns the total size in bytes.
+    """
+    n_last = image_shape[-1]  # this is the contiguous memory dimension
+    n_fine = n_last * hist_size * partitions
+    n_coarse = n_last * hist_size_coarse * partitions
+    return (n_fine + n_coarse) * cp.dtype(hist_dtype).itemsize
+
+
+def _can_use_histogram(image, footprint):
+    """Validate compatibility with histogram-based median.
+
+    Parameters
+    ----------
+    image : cupy.ndarray
+        The image to filter.
+    footprint : cupy.ndarray
+        The filter footprint.
+
+    Returns
+    -------
+    compatible : bool
+        Indicates whether the provided image and footprint are compatible with
+        the histogram-based median.
+    reason : str
+        Description of the reason for the incompatibility
+    """
+    # only 2D images with 8 or 16-bit integer dtypes (signed or unsigned)
+    if image.ndim != 2:
+        return False, "only 2D images are supported"
+    if image.dtype not in [cp.uint8, cp.uint16, cp.int8, cp.int16]:
+        return False, ("Only 8 and 16-bit integer image types (signed or "
+                       "unsigned).")
+
+    # only odd-sized footprints are supported
+    if not all(s % 2 == 1 for s in footprint.shape):
+        return False, "footprint must have odd size on both axes"
+
+    if any(s == 1 for s in footprint.shape):
+        return False, "footprint must have size >= 3"
+
+    # footprint radius can't be larger than the image
+    # TODO: need to check if we need this exact restriction
+    #       (may be specific to OpenCV's boundary handling)
+    radii = tuple(s // 2 for s in footprint.shape)
+    if any(r > s for r, s in zip(radii, image.shape)):
+        return False, "footprint half-width cannot exceed the image extent"
+
+    # only fully populated footprint is supported
+    if not np.all(footprint):  # synchronizes!
+        return False, "footprint must be 1 everywhere"
+
+    return True, None
+
+
+class KernelResourceError(RuntimeError):
+    pass
+
+
+def _get_kernel_params(image, footprint_shape, value_range='auto',
+                       partitions=None, hist_size_coarse=None):
+    """Determine kernel launch parameters and #define values for its code.
+
+    Parameters
+    ----------
+    image : cupy.ndarray
+        The histogram bin range will depend on the image dtype unless specified
+        explicitly via `value_range`
+    footprint_shape : tuple of int
+        The shape of the footprint. The dtype used for storing the histogram
+        depends on the footprint size: the smallest of int8, int16 or int32
+        able to hold the maximum possible count is used.
+    value_range : {'auto', 'dtype', 'image'} or (min, max), optional
+        When value_range='dtype', the range will be determined based on the
+        maximal range of the data type. When ``value_range='image'``, the
+        minimum and maximum intensities present in the image will be used. When
+        set to 'auto', 'dtype' is used for 8-bit images and otherwise
+        'image' is used. A (min, max) sequence may also be given explicitly.
+    partitions : positive int, optional
+        The grid size used during kernel launch will be (partitions, 1, 1).
+        Increasing this will increase parallelism (and thus performance), but
+        at cost of additional GPU memory usage. Will be automatically truncated
+        to a value no larger than image.shape[0].
+    hist_size_coarse : int or None, optional
+        Can be used to override the default choice of the number of coarse
+        histogram bins. It is not generally recommended to set this as
+        infeasible values can easily be chosen. Using None, will give
+        automatically selected values that have been validated in previous
+        testing.
+
+    Returns
+    -------
+    CUDAParams : namedtuple
+        Various parameters used in kernel code generation and at launch time.
+        See comments next to the CUDAParams declaration below for details.
+    """
+
+    if value_range == 'auto':
+        if image.dtype.itemsize < 2:
+            value_range = 'dtype'
+        else:
+            # to save memory, try using actual value range for >8-bit images
+            # (e.g. DICOM images often have 12-bit range)
+            value_range = 'image'
+
+    if value_range == 'dtype':
+        if image.dtype.itemsize > 2:
+            raise ValueError(
+                "dtype range only supported for 8 and 16-bit integer dtypes."
+            )
+        iinfo = cp.iinfo(image.dtype)
+        minv, maxv = iinfo.min, iinfo.max
+    elif value_range == 'image':
+        minv = int(image.min())
+        maxv = int(image.max())
+    else:
+        if len(value_range) != 2:
+            raise ValueError(
+                "value_range must be either 'dtype', 'image' or a "
+                "(min, max) sequence."
+            )
+        minv, maxv = value_range
+
+    if image.dtype.kind == 'u':
+        # cannot subtract a positive offset in the unsigned case
+        minv = min(minv, 0)
+    hist_offset = 0 if minv == 0 else -minv
+    hist_size = maxv - minv + 1
+    hist_size = max(hist_size, 256)  # use at least 256 bins
+    # round hist_size up to the nearest power of 2
+    hist_size = round(2**math.ceil(math.log2(hist_size)))
+    hist_size = max(hist_size, 32)
+
+    if hist_size_coarse is None:
+        # Empirically, robust to choose 32-fold less bins for hist_size coarse
+        hist_size_coarse = hist_size // 32
+
+    # have to set block[0] large enough that histogramMedianParFineLookupOnly
+    # and histogramMedianParCoarseLookupOnly search sizes fit within the number
+    # of threads in the block.
+    # Use the maximum of the coarse and fine sizes, rounded up to the nearest
+    # multiple of 32.
+    hist_size_fine = hist_size // hist_size_coarse
+    hist_size_max = max(hist_size_fine, hist_size_coarse)
+    # block0 must be at least the warp size
+    block0 = 32 * math.ceil(hist_size_max / 32)
+    if block0 > 256:
+        d = cp.cuda.Device()
+        max_block_x = d.attributes["MaxBlockDimX"]
+        if block0 > max_block_x:
+            raise KernelResourceError(
+                f"The requested block size of {block0} for the first dimension"
+                f", exceeds MaxBlockDimX={max_block_x} for this device."
+            )
+
+    if partitions is None:
+        # Substantial overhead in computing the first line, so need at least
+        # two lines per partition for best performance. Limit to bins_max
+        # partitions to bound memory overhead, but always use at least one.
+        bins_max = max(16, 512 // (hist_size // 256))
+        partitions = max(1, min(image.shape[0] // 2, bins_max))
+    else:
+        # cannot exceed the number of image rows
+        partitions = min(partitions, image.shape[0])
+
+    grid = (partitions, 1, 1)
+    block = (block0, 1, 1)
+
+    hist_int_t, hist_dtype = _get_hist_dtype(footprint_shape)
+
+    # All recent GPUs (CC>=3.5) allow at least 48k of shared memory per block,
+    # so don't bother checking the requirements unless thousands of histogram
+    # bins are requested.
+    # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications  # noqa
+    if hist_size >= 8192:
+        smem_size = _check_shared_memory_requirement_bytes(
+            hist_dtype, hist_size_coarse, hist_size_fine
+        )
+        d = cp.cuda.Device()
+        smem_available = d.attributes['MaxSharedMemoryPerBlock']
+        if smem_size > smem_available:
+            raise KernelResourceError(
+                f"Shared memory requirement of {smem_size} bytes per block "
+                f"exceeds the device limit of {smem_available}."
+            )
+    CUDAParams = namedtuple(
+        'HistogramMedianKernelParams',
+        [
+            'grid',
+            'block',
+            'hist_size',           # total number of histogram bins
+            'hist_size_coarse',    # number of coarse-level histogram bins
+            'hist_dtype',          # cupy.dtype of the histogram
+            'hist_int_t',          # C++ type of the histogram
+            'hist_offset',         # offset from 0 for the first bin
+            'partitions'           # number of parallel bands to use
+        ]
+    )
+    return CUDAParams(
+        grid,
+        block,
+        hist_size,
+        hist_size_coarse,
+        hist_dtype,
+        hist_int_t,
+        hist_offset,
+        partitions,
+    )
+
+
+def _median_hist(image, footprint, output=None, mode='mirror', cval=0,
+                 value_range='auto', partitions=None):
+
+    if output is not None:
+        raise NotImplementedError(
+            "Use of a user-defined output array has not been implemented"
+        )
+
+    compatible_image, reason = _can_use_histogram(image, footprint)
+    if not compatible_image:
+        raise ValueError(reason)
+
+    # kernel pointer offset calculations assume C-contiguous image data
+    image = cp.ascontiguousarray(image)
+    n_rows, n_cols = image.shape[:2]
+    if image.dtype.kind == 'b':
+        image = image.view(cp.uint8)
+    if image.dtype.kind not in 'iu':
+        raise ValueError("only integer-type images are accepted")
+
+    radii = tuple(s // 2 for s in footprint.shape)
+    # med_pos is the index corresponding to the median
+    # (calculation here assumes all elements of the footprint are True)
+    med_pos = footprint.size // 2
+
+    params = _get_kernel_params(
+        image, footprint.shape, value_range, partitions
+    )
+
+    # pad as necessary to avoid boundary artifacts
+    # Don't have to pad along axis 0 if mode is already 'nearest' because the
+    # kernel already assumes 'nearest' mode internally.
+    autopad = True
+    pad_both_axes = mode != 'nearest'
+    if autopad:
+        if pad_both_axes:
+            npad = tuple((r, r) for r in radii)
+        else:
+            npad = ((0, 0),) * (image.ndim - 1) + ((radii[-1], radii[-1]),)
+        mode = _to_np_mode(mode)
+        if mode == 'constant':
+            pad_kwargs = dict(mode=mode, constant_values=cval)
+        else:
+            pad_kwargs = dict(mode=mode)
+        image = cp.pad(image, npad, **pad_kwargs)
+        # must update n_rows, n_cols after padding!
+        n_rows, n_cols = image.shape[:2]
+
+    # generate the kernel
+    kern = _get_median_rawkernel(
+        image_t=_dtype_to_CUDA_int_type(image.dtype),
+        hist_offset=params.hist_offset,
+        hist_int_t=params.hist_int_t,
+        hist_size=params.hist_size,
+        hist_size_coarse=params.hist_size_coarse,
+    )
+
+    # allocate output and scratch space, `hist` and `coarse_hist`.
+    out = cp.empty_like(image)
+    hist = cp.zeros(
+        (n_cols * params.hist_size * params.partitions,),
+        params.hist_dtype,
+    )
+    coarse_hist = cp.zeros(
+        (n_cols * params.hist_size_coarse * params.partitions,),
+        params.hist_dtype,
+    )
+
+    # call the kernel
+    r0, r1 = radii[:2]
+    s0, s1 = image.shape[:2]
+    kernel_args = (image, out, hist, coarse_hist, r0, r1, med_pos, s0, s1)
+    kern(params.grid, params.block, kernel_args)
+
+    # remove any padding that was added
+    if autopad:
+        if pad_both_axes:
+            out_sl = tuple(slice(r, -r) for r in radii)
+            out = out[out_sl]
+        else:
+            out = out[..., radii[-1]:-radii[-1]]
+    return out
diff --git a/python/cucim/src/cucim/skimage/filters/_separable_filtering.py b/python/cucim/src/cucim/skimage/filters/_separable_filtering.py
new file mode 100644
index 000000000..eb8dad28a
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/filters/_separable_filtering.py
@@ -0,0 +1,990 @@
+import math
+
+import cupy as cp
+
+from cucim.skimage._vendored import _ndimage_util as util
+from cucim.skimage._vendored._internal import _normalize_axis_index, prod
+from cucim.skimage._vendored._ndimage_filters_core import (
+    _ndimage_CAST_FUNCTION, _ndimage_includes)
+
+
+def _get_constants(ndim, axis, kernel_size, anchor, patch_per_block=None):
+    if anchor is None:
+        anchor = kernel_size // 2
+    halo_pixels_needed = max(kernel_size - anchor, anchor)
+    if patch_per_block is None:
+        patch_per_block = 4
+
+    if ndim == 2:
+        # note, in 2d axis 0 = "y"
+        #             axis 1 = "x"
+        # for simplicity, keeping same halo size at both start and end
+        if axis == 1:
+            # as in OpenCV's column_filter.hpp
+            block_x = 16
+            block_y = 16
+            halo_size = math.ceil(halo_pixels_needed / block_x)
+        elif axis == 0:
+            # as in OpenCV's row_filter.hpp
+            block_x = 32  # 16 in CUDA example
+            block_y = 8  # 4 in CUDA example
+            halo_size = math.ceil(halo_pixels_needed / block_y)
+        # can have out of bounds access unless patch_per_block >= halo_size
+        patch_per_block = max(patch_per_block, halo_size)
+        block_z = 1
+    elif ndim == 3:
+        # note, in 3d axis 0 = "z"
+        #             axis 1 = "y"
+        #             axis 2 = "x"
+        # for simplicity, keeping same halo size at both start and end
+        if axis == 2:
+            # as in OpenCV's column_filter.hpp
+            block_x = 16
+            block_y = 4
+            block_z = 4
+            halo_size = math.ceil(halo_pixels_needed / block_x)
+        elif axis == 1:
+            # as in OpenCV's column_filter.hpp
+            block_x = 32
+            block_y = 4
+            block_z = 4
+            halo_size = math.ceil(halo_pixels_needed / block_y)
+        elif axis == 0:
+            # as in OpenCV's row_filter.hpp
+            block_x = 32
+            block_y = 4
+            block_z = 4
+            halo_size = math.ceil(halo_pixels_needed / block_z)
+        # can have out of bounds access unless patch_per_block >= halo_size
+        patch_per_block = max(patch_per_block, halo_size)
+    else:
+        raise NotImplementedError("Only 2D and 3D are currently supported")
+    block = (block_x, block_y, block_z)
+    return block, patch_per_block, halo_size
+
+
+def _get_smem_shape(ndim, axis, block, patch_per_block, halo_size, anchor=None,
+                    image_dtype=cp.float32):
+    bx, by, bz = block
+    if ndim == 2:
+        if axis == 0:
+            shape = ((patch_per_block + 2 * halo_size) * by, bx)
+        elif axis == 1:
+            shape = (by, (patch_per_block + 2 * halo_size) * bx)
+    elif ndim == 3:
+        if axis == 0:
+            shape = ((patch_per_block + 2 * halo_size) * bz, by, bx)
+        elif axis == 1:
+            shape = (bz, (patch_per_block + 2 * halo_size) * by, bx)
+        elif axis == 2:
+            shape = (bz, by, (patch_per_block + 2 * halo_size) * bx)
+    else:
+        raise NotImplementedError("TODO")
+    nbytes = cp.dtype(image_dtype).itemsize * prod(shape)
+    return shape, nbytes
+
+
+def _get_warp_size(device_id=None):
+    if device_id is None:
+        device_id = cp.cuda.runtime.getDevice()
+    device_props = cp.cuda.runtime.getDeviceProperties(device_id)
+    return device_props['warpSize']
+
+
+def _get_shmem_limits(device_id=None):
+    if device_id is None:
+        device_id = cp.cuda.runtime.getDevice()
+    device_props = cp.cuda.runtime.getDeviceProperties(device_id)
+    shared_mp = device_props.get('sharedMemPerMultiprocessor', None)
+    shared_block = device_props.get('sharedMemPerBlock', None)
+    shared_block_optin = device_props.get('sharedMemPerBlockOptin', None)
+    global_l1_cache_supported = device_props.get('globalL1CacheSupported', None)
+    local_l1_cache_supported = device_props.get('localL1CacheSupported', None)
+    l2_size = device_props.get('l2CacheSize', None)
+    warp_size = device_props.get('warpSize', None)
+    regs_per_block = device_props.get('regsPerBlock', None)
+    return {
+        'device_id': device_id,
+        'shared_mp': shared_mp,
+        'shared_block': shared_block,
+        'shared_block_optin': shared_block_optin,
+        'global_l1_cache_supported': global_l1_cache_supported,
+        'local_l1_cache_supported': local_l1_cache_supported,
+        'l2_size': l2_size,
+        'warp_size': warp_size,
+        'regs_per_block': regs_per_block,
+    }
+
+
+class ResourceLimitError(RuntimeError):
+    pass
+
+
+@cp.memoize(for_each_device=True)
+def _check_smem_availability(ndim, axis, kernel_size, anchor=None,
+                             patch_per_block=None, image_dtype=cp.float32,
+                             device_id=None):
+    block, patch_per_block, halo_size = _get_constants(
+        ndim, axis, kernel_size, anchor=anchor, patch_per_block=patch_per_block
+    )
+    shape, nbytes = _get_smem_shape(
+        ndim, axis, block, patch_per_block, halo_size, image_dtype=image_dtype
+    )  # by keyword: 6th positional of _get_smem_shape is `anchor`
+    props = _get_shmem_limits(device_id=device_id)
+    if nbytes > props['shared_block']:
+        raise ResourceLimitError("inadequate shared memory available")
+
+
+_dtype_char_to_c_types = {
+    'e': 'float16',
+    'f': 'float',
+    'd': 'double',
+    'F': 'complex<float>',
+    'D': 'complex<double>',
+    '?': 'char',
+    'b': 'char',
+    'h': 'short',
+    'i': 'int',
+    'l': 'long long',
+    'B': 'unsigned char',
+    'H': 'unsigned short',
+    'I': 'unsigned int',
+    'L': 'unsigned long long',
+}
+
+
+def _get_code_stage1_shared_memory_load_2d(ndim, axis, mode, cval):
+    """Generates the first stage of the function body.
+
+    This involves just copying from the `src` array into the `smem` shared
+    memory array followed by a call to __syncthreads(). All boundary
+    handling also occurs within this function.
+    """
+
+    if ndim == 2 and axis == 0:
+        if mode not in ['constant', 'grid-constant']:
+            boundary_code_lower, boundary_code_upper = util._generate_boundary_condition_ops(mode, 'row', 'n_rows', separate=True)  # noqa
+
+        # as in OpenCV's column_filter.hpp
+        code = """
+        __shared__ T smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X];
+        const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
+        if (x >= n_cols){
+            return;
+        }
+        const T* src_col = &src[x];
+        const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;
+
+        // memory is contiguous along last (columns) axis
+        const int row_stride = n_cols;  // stride (in elements) along axis 0
+        int row;
+
+        if (blockIdx.y > 0)
+        {
+            //Upper halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j)
+                smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = src_col[(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y) * row_stride];
+        }
+        else
+        {
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j) {
+                row = yStart - (HALO_SIZE - j) * BLOCK_DIM_Y;
+        """  # noqa
+        if mode == 'constant':
+            code += f"""
+                if (row < 0)
+                    smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = static_cast<T>({cval});
+                else
+            """  # noqa
+        else:
+            code += boundary_code_lower
+        code += """
+                    smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = src_col[row * row_stride];
+            }
+        }
+
+        if (blockIdx.y + 2 < gridDim.y)  // Note: +2 here assumes HALO_SIZE <= PATCH_PER_BLOCK so we ensure that elsewhere
+        {
+            //Main data
+            #pragma unroll
+            for (int j = 0; j < PATCH_PER_BLOCK; ++j)
+                smem[threadIdx.y + (HALO_SIZE + j) * BLOCK_DIM_Y][threadIdx.x] = src_col[(yStart + j * BLOCK_DIM_Y) * row_stride];
+
+            //Lower halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j)
+                smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE + j) * BLOCK_DIM_Y][threadIdx.x] = src_col[(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y) * row_stride];
+        }
+        else
+        {
+            //Main data
+            #pragma unroll
+            for (int j = 0; j < PATCH_PER_BLOCK; ++j) {
+                row = yStart + j * BLOCK_DIM_Y;
+        """  # noqa
+        if mode == 'constant':
+            code += f"""
+                if (row >= n_rows)
+                    smem[threadIdx.y + (HALO_SIZE + j) * BLOCK_DIM_Y][threadIdx.x] = static_cast<T>({cval});
+                else
+            """  # noqa
+        else:
+            code += boundary_code_upper
+        code += """
+                    smem[threadIdx.y + (HALO_SIZE + j) * BLOCK_DIM_Y][threadIdx.x] = src_col[row * row_stride];
+            }
+
+            //Lower halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j)
+            {
+                row = yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y;
+        """  # noqa
+        if mode == 'constant':
+            code += f"""
+                if (row >= n_rows)
+                    smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE + j) * BLOCK_DIM_Y][threadIdx.x] = static_cast<T>({cval});
+                else
+            """  # noqa
+        else:
+            code += boundary_code_upper
+        code += """
+                    smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE + j) * BLOCK_DIM_Y][threadIdx.x] = src_col[row * row_stride];
+            }
+        }
+        """  # noqa
+    elif ndim == 2 and axis == 1:
+        if mode not in ['constant', 'grid-constant']:
+            boundary_code_lower, boundary_code_upper = util._generate_boundary_condition_ops(mode, 'col', 'n_cols', separate=True)  # noqa
+
+        # as in OpenCV's row_filter.hpp
+        code = """
+        __shared__ T smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X];
+        const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
+        if (y >= n_rows) {
+            return;
+        }
+        const int row_stride = n_cols;  // stride (in elements) along axis 0
+        int col;
+        const T* src_row = &src[y * row_stride];
+        const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x;
+
+        if (blockIdx.x > 0)
+        {
+            //Load left halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j)
+                smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X];
+        }
+        else
+        {
+            //Load left halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j){
+                col = xStart - (HALO_SIZE - j) * BLOCK_DIM_X;
+        """  # noqa
+        if mode == 'constant':
+            code += f"""
+                if (col < 0)
+                    smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = static_cast<T>({cval});
+                else
+            """  # noqa
+        else:
+            code += boundary_code_lower
+        code += """
+                    smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = src_row[col];
+            }
+        }
+        if (blockIdx.x + 2 < gridDim.x)  // Note: +2 here assumes HALO_SIZE <= PATCH_PER_BLOCK so we ensure that elsewhere
+        {
+            //Load main data
+            #pragma unroll
+            for (int j = 0; j < PATCH_PER_BLOCK; ++j)
+                smem[threadIdx.y][threadIdx.x + (HALO_SIZE + j)* BLOCK_DIM_X] = src_row[xStart + j * BLOCK_DIM_X];
+
+            //Load right halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j)
+                smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE + j) * BLOCK_DIM_X] = src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X];
+        }
+        else
+        {
+            //Load main data
+            #pragma unroll
+            for (int j = 0; j < PATCH_PER_BLOCK; ++j) {
+                col = xStart + j * BLOCK_DIM_X;
+        """  # noqa
+        if mode == 'constant':
+            code += f"""
+                if (col >= n_cols)
+                    smem[threadIdx.y][threadIdx.x + (HALO_SIZE + j) * BLOCK_DIM_X] = static_cast<T>({cval});
+                else
+            """  # noqa
+        else:
+            code += boundary_code_upper
+        code += """
+                    smem[threadIdx.y][threadIdx.x + (HALO_SIZE + j) * BLOCK_DIM_X] = src_row[col];
+            }
+
+            //Load right halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j){
+                col = xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X;
+        """  # noqa
+        if mode == 'constant':
+            code += f"""
+                if (col >= n_cols)
+                    smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE + j) * BLOCK_DIM_X] = static_cast<T>({cval});
+                else
+            """  # noqa
+        else:
+            code += boundary_code_upper
+        code += """
+                    smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE + j) * BLOCK_DIM_X] = src_row[col];
+            }
+        }
+        """  # noqa
+
+    code += """
+        __syncthreads();
+    """
+    return code
+
+
+def _get_code_stage1_shared_memory_load_3d(ndim, axis, mode, cval):
+    """Generates the first stage of the function body.
+
+    This involves just copying from the `src` array into the `smem` shared
+    memory array followed by a call to __syncthreads(). All boundary
+    handling also occurs within this function.
+    """
+
+    if ndim == 3 and axis == 0:
+        if mode not in ['constant', 'grid-constant']:
+            boundary_code_lower, boundary_code_upper = util._generate_boundary_condition_ops(mode, 'row', 's_0', separate=True)  # noqa
+
+        # as in OpenCV's column_filter.hpp
+        code = """
+        __shared__ T smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Z][BLOCK_DIM_Y][BLOCK_DIM_X];
+        const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
+        const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
+        if ((x >= s_2) || (y >= s_1)) {
+            return;
+        }
+        // memory is contiguous along last (columns) axis
+        const int stride_0 = s_1 * s_2;  // stride (in elements) along axis 0
+        const int stride_1 = s_2;  // stride (in elements) along axis 1
+
+        const T* src_col = &src[x + stride_1 * y];
+        const int zStart = blockIdx.z * (BLOCK_DIM_Z * PATCH_PER_BLOCK) + threadIdx.z;
+
+        int row;
+
+        if (blockIdx.z > 0)
+        {
+            //Upper halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j)
+                smem[threadIdx.z + j * BLOCK_DIM_Z][threadIdx.y][threadIdx.x] = src_col[(zStart - (HALO_SIZE - j) * BLOCK_DIM_Z) * stride_0];
+        }
+        else
+        {
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j) {
+                row = zStart - (HALO_SIZE - j) * BLOCK_DIM_Z;
+        """  # noqa
+        if mode == 'constant':
+            code += f"""
+                if (row < 0)
+                    smem[threadIdx.z + j * BLOCK_DIM_Z][threadIdx.y][threadIdx.x] = static_cast<T>({cval});
+                else
+            """  # noqa
+        else:
+            code += boundary_code_lower
+        code += """
+                    smem[threadIdx.z + j * BLOCK_DIM_Z][threadIdx.y][threadIdx.x] = src_col[row * stride_0];
+            }
+        }
+
+        if (blockIdx.z + 2 < gridDim.z)  // Note: +2 here assumes HALO_SIZE <= PATCH_PER_BLOCK so we ensure that elsewhere
+        {
+            //Main data
+            #pragma unroll
+            for (int j = 0; j < PATCH_PER_BLOCK; ++j)
+                smem[threadIdx.z + (HALO_SIZE + j) * BLOCK_DIM_Z][threadIdx.y][threadIdx.x] = src_col[(zStart + j * BLOCK_DIM_Z) * stride_0];
+
+            //Lower halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j)
+                smem[threadIdx.z + (PATCH_PER_BLOCK + HALO_SIZE + j) * BLOCK_DIM_Z][threadIdx.y][threadIdx.x] = src_col[(zStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Z) * stride_0];
+        }
+        else
+        {
+            //Main data
+            #pragma unroll
+            for (int j = 0; j < PATCH_PER_BLOCK; ++j) {
+                row = zStart + j * BLOCK_DIM_Z;
+        """  # noqa
+        if mode == 'constant':
+            code += f"""
+                if (row >= s_0)
+                    smem[threadIdx.z + (HALO_SIZE + j) * BLOCK_DIM_Z][threadIdx.y][threadIdx.x] = static_cast<T>({cval});
+                else
+            """  # noqa
+        else:
+            code += boundary_code_upper
+        code += """
+                    smem[threadIdx.z + (HALO_SIZE + j) * BLOCK_DIM_Z][threadIdx.y][threadIdx.x] = src_col[row * stride_0];
+            }
+
+            //Lower halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j)
+            {
+                row = zStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Z;
+        """  # noqa
+        if mode == 'constant':
+            code += f"""
+                if (row >= s_0)
+                    smem[threadIdx.z + (PATCH_PER_BLOCK + HALO_SIZE + j) * BLOCK_DIM_Z][threadIdx.y][threadIdx.x] = static_cast<T>({cval});
+                else
+            """  # noqa
+        else:
+            code += boundary_code_upper
+        code += """
+                    smem[threadIdx.z + (PATCH_PER_BLOCK + HALO_SIZE + j) * BLOCK_DIM_Z][threadIdx.y][threadIdx.x] = src_col[row * stride_0];
+            }
+        }
+        """  # noqa
+    elif ndim == 3 and axis == 1:
+        if mode not in ['constant', 'grid-constant']:
+            boundary_code_lower, boundary_code_upper = util._generate_boundary_condition_ops(mode, 'row', 's_1', separate=True)  # noqa
+
+        # as in OpenCV's column_filter.hpp
+        code = """
+        __shared__ T smem[BLOCK_DIM_Z][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X];
+        const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
+        const int z = blockIdx.z * BLOCK_DIM_Z + threadIdx.z;
+        if ((x >= s_2) || (z >= s_0)) {
+            return;
+        }
+        // memory is contiguous along last (columns) axis
+        const int stride_0 = s_1 * s_2;  // stride (in elements) along axis 0
+        const int stride_1 = s_2;  // stride (in elements) along axis 1
+
+        const T* src_col = &src[x + stride_0 * z];
+        const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;
+
+        int row;
+
+        if (blockIdx.y > 0)
+        {
+            //Upper halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j)
+                smem[threadIdx.z][threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = src_col[(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y) * stride_1];
+        }
+        else
+        {
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j) {
+                row = yStart - (HALO_SIZE - j) * BLOCK_DIM_Y;
+        """  # noqa
+        if mode == 'constant':
+            code += f"""
+                if (row < 0)
+                    smem[threadIdx.z][threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = static_cast<T>({cval});
+                else
+            """  # noqa
+        else:
+            code += boundary_code_lower
+        code += """
+                    smem[threadIdx.z][threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = src_col[row * stride_1];
+            }
+        }
+
+        if (blockIdx.y + 2 < gridDim.y)  // Note: +2 here assumes HALO_SIZE <= PATCH_PER_BLOCK so we ensure that elsewhere
+        {
+            //Main data
+            #pragma unroll
+            for (int j = 0; j < PATCH_PER_BLOCK; ++j)
+                smem[threadIdx.z][threadIdx.y + (HALO_SIZE + j) * BLOCK_DIM_Y][threadIdx.x] = src_col[(yStart + j * BLOCK_DIM_Y) * stride_1];
+
+            //Lower halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j)
+                smem[threadIdx.z][threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE + j) * BLOCK_DIM_Y][threadIdx.x] = src_col[(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y) * stride_1];
+        }
+        else
+        {
+            //Main data
+            #pragma unroll
+            for (int j = 0; j < PATCH_PER_BLOCK; ++j) {
+                row = yStart + j * BLOCK_DIM_Y;
+        """  # noqa
+        if mode == 'constant':
+            code += f"""
+                if (row >= s_1)
+                    smem[threadIdx.z][threadIdx.y + (HALO_SIZE + j) * BLOCK_DIM_Y][threadIdx.x] = static_cast<T>({cval});
+                else
+            """  # noqa
+        else:
+            code += boundary_code_upper
+        code += """
+                    smem[threadIdx.z][threadIdx.y + (HALO_SIZE + j) * BLOCK_DIM_Y][threadIdx.x] = src_col[row * stride_1];
+            }
+
+            //Lower halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j)
+            {
+                row = yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y;
+        """  # noqa
+        if mode == 'constant':
+            code += f"""
+                if (row >= s_1)
+                    smem[threadIdx.z][threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE + j) * BLOCK_DIM_Y][threadIdx.x] = static_cast<T>({cval});
+                else
+            """  # noqa
+        else:
+            code += boundary_code_upper
+        code += """
+                    smem[threadIdx.z][threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE + j) * BLOCK_DIM_Y][threadIdx.x] = src_col[row * stride_1];
+            }
+        }
+        """  # noqa
+    elif ndim == 3 and axis == 2:
+        if mode not in ['constant', 'grid-constant']:
+            boundary_code_lower, boundary_code_upper = util._generate_boundary_condition_ops(mode, 'col', 's_2', separate=True)  # noqa
+
+        # as in OpenCV's row_filter.hpp
+        code = """
+        __shared__ T smem[BLOCK_DIM_Z][BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X];
+        const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
+        const int z = blockIdx.z * BLOCK_DIM_Z + threadIdx.z;
+        if ((y >= s_1) || (z >= s_0)) {
+            return;
+        }
+        const int stride_0 = s_1 * s_2;  // stride (in elements) along axis 0
+        const int stride_1 = s_2;  // stride (in elements) along axis 1
+        int col;
+        const T* src_row = &src[z * stride_0 + y * stride_1];
+        const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x;
+
+        if (blockIdx.x > 0)
+        {
+            //Load left halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j)
+                smem[threadIdx.z][threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X];
+        }
+        else
+        {
+            //Load left halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j){
+                col = xStart - (HALO_SIZE - j) * BLOCK_DIM_X;
+        """  # noqa
+        if mode == 'constant':
+            code += f"""
+                if (col < 0)
+                    smem[threadIdx.z][threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = static_cast<T>({cval});
+                else
+            """  # noqa
+        else:
+            code += boundary_code_lower
+        code += """
+                    smem[threadIdx.z][threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = src_row[col];
+            }
+        }
+        if (blockIdx.x + 2 < gridDim.x)  // Note: +2 here assumes HALO_SIZE <= PATCH_PER_BLOCK so we ensure that elsewhere
+        {
+            //Load main data
+            #pragma unroll
+            for (int j = 0; j < PATCH_PER_BLOCK; ++j)
+                smem[threadIdx.z][threadIdx.y][threadIdx.x + (HALO_SIZE + j)* BLOCK_DIM_X] = src_row[xStart + j * BLOCK_DIM_X];
+
+            //Load right halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j)
+                smem[threadIdx.z][threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE + j) * BLOCK_DIM_X] = src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X];
+        }
+        else
+        {
+            //Load main data
+            #pragma unroll
+            for (int j = 0; j < PATCH_PER_BLOCK; ++j) {
+                col = xStart + j * BLOCK_DIM_X;
+        """  # noqa
+        if mode == 'constant':
+            code += f"""
+                if (col >= s_2)
+                    smem[threadIdx.z][threadIdx.y][threadIdx.x + (HALO_SIZE + j) * BLOCK_DIM_X] = static_cast<T>({cval});
+                else
+            """  # noqa
+        else:
+            code += boundary_code_upper
+        code += """
+                    smem[threadIdx.z][threadIdx.y][threadIdx.x + (HALO_SIZE + j) * BLOCK_DIM_X] = src_row[col];
+            }
+
+            //Load right halo
+            #pragma unroll
+            for (int j = 0; j < HALO_SIZE; ++j){
+                col = xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X;
+        """  # noqa
+        if mode == 'constant':
+            code += f"""
+                if (col >= s_2)
+                    smem[threadIdx.z][threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE + j) * BLOCK_DIM_X] = static_cast<T>({cval});
+                else
+            """  # noqa
+        else:
+            code += boundary_code_upper
+        code += """
+                    smem[threadIdx.z][threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE + j) * BLOCK_DIM_X] = src_row[col];
+            }
+        }
+        """  # noqa
+
+    code += """
+        __syncthreads();
+    """
+    return code
+
+
+@cp.memoize(for_each_device=False)
+def _get_code_stage1_shared_memory_load(ndim, axis, mode, cval):
+    if ndim == 2:
+        return _get_code_stage1_shared_memory_load_2d(ndim, axis, mode, cval)
+    elif ndim == 3:
+        return _get_code_stage1_shared_memory_load_3d(ndim, axis, mode, cval)
+
+
+def _get_code_stage2_convolve_2d(ndim, axis, flip_kernel):
+    code = """
+    #pragma unroll
+    for (int j = 0; j < PATCH_PER_BLOCK; ++j)
+    {
+    """
+    if flip_kernel:
+        kernel_idx = "KSIZE - 1 - k"
+    else:
+        kernel_idx = "k"
+
+    if ndim == 2 and axis == 0:
+        code += """
+        const int y = yStart + j * BLOCK_DIM_Y;
+
+        if (y < n_rows)
+        {
+        """
+        inner = f"""
+                sum = sum + static_cast<W>(smem[threadIdx.y + (HALO_SIZE + j) * BLOCK_DIM_Y - anchor + k][threadIdx.x]) * kernel[{kernel_idx}];
+        """  # noqa
+    elif ndim == 2 and axis == 1:
+        code += """
+        const int x = xStart + j * BLOCK_DIM_X;
+
+        if (x < n_cols)
+        {
+        """
+        inner = f"""
+                sum = sum + static_cast<W>(smem[threadIdx.y][threadIdx.x + (HALO_SIZE + j) * BLOCK_DIM_X - anchor + k]) * kernel[{kernel_idx}];
+        """  # noqa
+    code += f"""
+            W sum = static_cast<W>(0);
+
+            #pragma unroll
+            for (int k = 0; k < KSIZE; ++k) {{
+               {inner}
+            }}
+            dst[y * row_stride + x] = cast<D>(sum);
+        }}
+    }}
+    """
+    return code
+
+
+def _get_code_stage2_convolve_3d(ndim, axis, flip_kernel):
+    code = """
+    #pragma unroll
+    for (int j = 0; j < PATCH_PER_BLOCK; ++j)
+    {
+    """
+    if flip_kernel:
+        kernel_idx = "KSIZE - 1 - k"
+    else:
+        kernel_idx = "k"
+
+    if ndim == 3 and axis == 0:
+        code += """
+        const int z = zStart + j * BLOCK_DIM_Z;
+
+        if (z < s_0)
+        {
+        """
+        inner = f"""
+                sum = sum + static_cast<W>(smem[threadIdx.z + (HALO_SIZE + j) * BLOCK_DIM_Z - anchor + k][threadIdx.y][threadIdx.x]) * kernel[{kernel_idx}];
+        """  # noqa
+    elif ndim == 3 and axis == 1:
+        code += """
+        const int y = yStart + j * BLOCK_DIM_Y;
+
+        if (y < s_1)
+        {
+        """
+        inner = f"""
+                sum = sum + static_cast<W>(smem[threadIdx.z][threadIdx.y + (HALO_SIZE + j) * BLOCK_DIM_Y - anchor + k][threadIdx.x]) * kernel[{kernel_idx}];
+        """  # noqa
+    elif ndim == 3 and axis == 2:
+        code += """
+        const int x = xStart + j * BLOCK_DIM_X;
+
+        if (x < s_2)
+        {
+        """
+        inner = f"""
+                sum = sum + static_cast<W>(smem[threadIdx.z][threadIdx.y][threadIdx.x + (HALO_SIZE + j) * BLOCK_DIM_X - anchor + k]) * kernel[{kernel_idx}];
+        """  # noqa
+    code += f"""
+            W sum = static_cast<W>(0);
+
+            #pragma unroll
+            for (int k = 0; k < KSIZE; ++k) {{
+               {inner}
+            }}
+            dst[z * stride_0 + y * stride_1 + x] = cast<D>(sum);
+        }}
+    }}
+    """
+    return code
+
+
+@cp.memoize(for_each_device=False)
+def _get_code_stage2_convolve(ndim, axis, flip_kernel):
+    if ndim == 2:
+        return _get_code_stage2_convolve_2d(ndim, axis, flip_kernel)
+    elif ndim == 3:
+        return _get_code_stage2_convolve_3d(ndim, axis, flip_kernel)
+
+
+@cp.memoize(for_each_device=True)
+def _get_separable_conv_kernel_src(
+    kernel_size, axis, ndim, anchor, image_c_type, kernel_c_type,
+    output_c_type, mode, cval, patch_per_block=None, flip_kernel=False
+):
+    blocks, patch_per_block, halo_size = _get_constants(
+        ndim, axis, kernel_size, anchor, patch_per_block
+    )
+    block_x, block_y, block_z = blocks
+
+    mode_str = mode
+    if 'constant' in mode_str:
+        mode_str += f'_{cval:0.2f}'.replace('.', '_')
+    mode_str = mode_str.replace('-', '_')
+    if flip_kernel:
+        func_name = f'convolve_s{kernel_size}_{ndim}d_ax{axis}_{mode_str}'
+    else:
+        func_name = f'correlate_s{kernel_size}_{ndim}d_ax{axis}_{mode_str}'
+    func_name += f"_T{image_c_type}_W{kernel_c_type}_D{output_c_type}".replace('complex<', 'c').replace('>', '').replace('long ', 'l').replace('unsigned ', 'u')  # noqa
+    func_name += f"_patch{patch_per_block}_halo{halo_size}"
+    # func_name += f"_bx{block_x}_by{block_y}" // these are fixed per axis
+
+    code = """
+    #include "cupy/carray.cuh"  // for float16
+    #include "cupy/complex.cuh"  // for complex<float>
+    """
+
+    # SciPy-style float -> unsigned integer casting for the output
+    # (use cast<D>(sum) instead of static_cast<D>(sum) for the output)
+    code += _ndimage_includes + _ndimage_CAST_FUNCTION
+
+    code += f"""
+    const int KSIZE = {kernel_size};
+    const int BLOCK_DIM_X = {block_x};
+    const int BLOCK_DIM_Y = {block_y};
+    const int BLOCK_DIM_Z = {block_z};
+    const int PATCH_PER_BLOCK = {patch_per_block};
+    const int HALO_SIZE = {halo_size};
+    typedef {image_c_type}  T;
+    typedef {output_c_type} D;
+    typedef {kernel_c_type} W;
+    """
+
+    if ndim == 2:
+        code += f"""
+        extern "C"{{
+        __global__ void {func_name}(const T *src, D *dst, const W* kernel, const int anchor, int n_rows, int n_cols)
+        {{
+        """  # noqa
+    elif ndim == 3:
+        code += f"""
+        extern "C"{{
+        __global__ void {func_name}(const T *src, D *dst, const W* kernel, const int anchor, int s_0, int s_1, int s_2)
+        {{
+        """  # noqa
+    code += _get_code_stage1_shared_memory_load(ndim, axis, mode, cval)
+    code += _get_code_stage2_convolve(ndim, axis, flip_kernel)
+    code += """
+    }  // end of function
+    }  // extern "C"
+    """
+    return func_name, blocks, patch_per_block, code
+
+
+@cp.memoize(for_each_device=True)
+def _get_separable_conv_kernel(kernel_size, axis, ndim, image_c_type,
+                               kernel_c_type, output_c_type, anchor=None,
+                               mode='nearest', cval=0,
+                               patch_per_block=None, flip_kernel=False):
+    """Build and compile the separable filtering kernel (memoized per device).
+
+    All arguments are forwarded unchanged to `_get_separable_conv_kernel_src`.
+
+    Returns a ``(kernel_function, block, patch_per_block)`` tuple.
+    """
+    src = _get_separable_conv_kernel_src(
+        kernel_size=kernel_size, axis=axis, ndim=ndim, anchor=anchor,
+        image_c_type=image_c_type, kernel_c_type=kernel_c_type,
+        output_c_type=output_c_type, mode=mode, cval=cval,
+        patch_per_block=patch_per_block, flip_kernel=flip_kernel,
+    )
+    func_name, block, patch_per_block, code = src
+    compile_options = ('--std=c++11', '-DCUPY_USE_JITIFY')
+    module = cp.RawModule(code=code, options=compile_options)
+    return module.get_function(func_name), block, patch_per_block
+
+
+def _get_grid(shape, block, axis, patch_per_block):
+    """Determine grid size from image shape and block parameters
+
+    Parameters
+    ----------
+    shape : tuple of int
+        Shape of the 2D or 3D image.
+    block : tuple of int
+        Thread block size, indexed in (x, y, z) order.
+    axis : int
+        Axis whose block extent is multiplied by `patch_per_block`; must be
+        a non-negative index into `shape`.
+    patch_per_block : int
+
+    Returns
+    -------
+    grid : tuple of int
+        Grid size in (x, y, z) order. The z entry is 1 for 2D images.
+
+    Raises
+    ------
+    NotImplementedError
+        If `shape` is neither 2D nor 3D.
+    ValueError
+        If `axis` is not valid for the dimensionality of `shape`.
+    """
+    ndim = len(shape)
+    if ndim not in (2, 3):
+        raise NotImplementedError(f"unsupported ndim: {ndim}")
+    if axis not in range(ndim):
+        raise ValueError(f"invalid axis: {axis}")
+
+    # The x block dimension spans the last image axis, y the one before it
+    # (and z the first axis in 3D), hence the reversed pairing below.
+    counts = []
+    for block_dim in range(ndim):
+        image_axis = ndim - 1 - block_dim
+        extent = block[block_dim]
+        if image_axis == axis:
+            # blocks cover patch_per_block times more elements on this axis
+            extent = extent * patch_per_block
+        counts.append(math.ceil(shape[image_axis] / extent))
+
+    if ndim == 2:
+        # no z dimension is needed for 2D images
+        counts.append(1)
+    return tuple(counts)
+
+
+def _shmem_convolve1d(image, weights, axis=-1, output=None, mode="reflect",
+                      cval=0.0, origin=0, convolution=False):
+    """Correlate (or convolve) `image` with 1D `weights` along `axis`.
+
+    The filtering is done by the kernel from `_get_separable_conv_kernel`,
+    with `flip_kernel=convolution`. `output` is allowed to overlap `image`:
+    the result is then computed in a temporary and copied back into it.
+    """
+    ndim = image.ndim
+    if weights.ndim != 1:
+        raise ValueError("expected 1d weight array")
+    axis = _normalize_axis_index(axis, ndim)
+    origin = util._check_origin(origin, weights.size)
+    if weights.size == 0:
+        return cp.zeros_like(image)
+    util._check_mode(mode)
+
+    if convolution:
+        # use flip_kernel to avoid cp.ascontiguousarray(weights[::-1]))
+        origin = -origin
+        if weights.size % 2 == 0:
+            origin -= 1
+    elif weights.dtype.kind == "c":
+        # numpy.correlate conjugates weights rather than input.
+        weights = weights.conj()
+
+    anchor = weights.size // 2 + origin
+
+    if weights.size > 32:
+        # For large kernels, make sure we have adequate shared memory
+        _check_smem_availability(ndim, axis, weights.size, anchor=anchor,
+                                 patch_per_block=None, image_dtype=image.dtype,
+                                 device_id=None)
+
+    # CUDA kernels assume C-contiguous memory layout
+    if not image.flags.c_contiguous:
+        image = cp.ascontiguousarray(image)
+
+    complex_output = image.dtype.kind == 'c'
+    # Note: important to set use_cucim_casting=True for performance with
+    #       8 and 16-bit integer types. This causes the weights to get cast to
+    #       float32 rather than float64.
+    weights_dtype = util._get_weights_dtype(
+        image, weights, use_cucim_casting=True
+    )
+    if not weights.flags.c_contiguous or weights.dtype != weights_dtype:
+        weights = cp.ascontiguousarray(weights, weights_dtype)
+
+    # promote output to nearest complex dtype if necessary
+    complex_output = complex_output or weights.dtype.kind == 'c'
+    output = util._get_output(output, image, None, complex_output)
+
+    # handle potential overlap between input and output arrays
+    needs_temp = cp.shares_memory(output, image, 'MAY_SHARE_BOUNDS')
+    if needs_temp:
+        output, temp = util._get_output(output.dtype, image), output
+
+    image_c_type = _dtype_char_to_c_types[image.dtype.char]
+    weights_c_type = _dtype_char_to_c_types[weights.dtype.char]
+    output_c_type = _dtype_char_to_c_types[output.dtype.char]
+
+    conv_axis_kernel, block, patch_per_block = _get_separable_conv_kernel(
+        weights.size,
+        axis=axis,
+        ndim=ndim,
+        anchor=anchor,
+        image_c_type=image_c_type,
+        kernel_c_type=weights_c_type,
+        output_c_type=output_c_type,
+        mode=mode,
+        cval=cval,
+        patch_per_block=None,
+        flip_kernel=convolution,
+    )
+    grid = _get_grid(image.shape, block, axis, patch_per_block)
+    args = (image, output, weights, anchor) + image.shape[:ndim]
+    conv_axis_kernel(grid, block, args)
+    if needs_temp:
+        temp[...] = output  # copy the result into the caller's array
+        output = temp
+    return output
diff --git a/python/cucim/src/cucim/skimage/filters/cuda/histogram_median.cu b/python/cucim/src/cucim/skimage/filters/cuda/histogram_median.cu
new file mode 100644
index 000000000..b084383b4
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/filters/cuda/histogram_median.cu
@@ -0,0 +1,251 @@
+/* Several functions and the primary kernel used for the histogram-based
+ * median are in this file.
+ *
+ * Note that this file cannot be compiled standalone as various definitions
+ * and a couple of the supporting functions get dynamically generated based
+ * on the actual histogram sizes. See the Python function
+ * `_get_median_rawkernel` defined in `_median_hist.py`. This function will
+ * generate the full kernel code given a set of parameters.
+ */
+
+// Coarse update, one thread per bin: H[b] += hist_colAdd[b] - hist_colSub[b].
+__device__ void histogramAddAndSubCoarse(HIST_INT_T* H,
+                                         const HIST_INT_T* hist_colAdd,
+                                         const HIST_INT_T* hist_colSub) {
+  const int bin = threadIdx.x;
+  if (bin >= HIST_SIZE_COARSE) return;
+  H[bin] += hist_colAdd[bin] - hist_colSub[bin];
+}
+
+// Add histCount histograms, spaced (1 << LOG2_COARSE) entries apart, to H.
+__device__ void histogramMultipleAddCoarse(HIST_INT_T* H,
+                                           const HIST_INT_T* hist_col,
+                                           int histCount) {
+  const int bin = threadIdx.x;
+  if (bin >= HIST_SIZE_COARSE) return;
+  HIST_INT_T total = H[bin];
+  for (int k = 0; k < histCount; k++)
+    total += hist_col[(k << LOG2_COARSE) + bin];
+  H[bin] = total;
+}
+
+// Zero the coarse histogram H, one thread per bin.
+__device__ void histogramClearCoarse(HIST_INT_T* H) {
+  const int bin = threadIdx.x;
+  if (bin >= HIST_SIZE_COARSE) return;
+  H[bin] = 0;
+}
+
+// Add hist_col into the coarse histogram H, one thread per bin.
+__device__ void histogramAddCoarse(HIST_INT_T* H, const HIST_INT_T* hist_col) {
+  const int bin = threadIdx.x;
+  if (bin >= HIST_SIZE_COARSE) return;
+  H[bin] += hist_col[bin];
+}
+
+// Subtract hist_col from the coarse histogram H, one thread per bin.
+__device__ void histogramSubCoarse(HIST_INT_T* H, const HIST_INT_T* hist_col) {
+  const int bin = threadIdx.x;
+  if (bin >= HIST_SIZE_COARSE) return;
+  H[bin] -= hist_col[bin];
+}
+
+// Add hist_col into the fine histogram H, one thread per bin.
+__device__ void histogramAddFine(HIST_INT_T* H, const HIST_INT_T* hist_col) {
+  const int bin = threadIdx.x;
+  if (bin >= HIST_SIZE_FINE) return;
+  H[bin] += hist_col[bin];
+}
+
+// Fine update, one thread per bin: H[b] += hist_colAdd[b] - hist_colSub[b].
+__device__ void histogramAddAndSubFine(HIST_INT_T* H,
+                                       const HIST_INT_T* hist_colAdd,
+                                       const HIST_INT_T* hist_colSub) {
+  const int bin = threadIdx.x;
+  if (bin >= HIST_SIZE_FINE) return;
+  H[bin] += hist_colAdd[bin] - hist_colSub[bin];
+}
+
+// Zero the fine histogram H, one thread per bin.
+__device__ void histogramClearFine(HIST_INT_T* H) {
+  const int bin = threadIdx.x;
+  if (bin >= HIST_SIZE_FINE) return;
+  H[bin] = 0;
+}
+
+__device__ void lucClearCoarse(int* luc) {  // zero luc, one thread per entry
+  const int entry = threadIdx.x;
+  if (entry < HIST_SIZE_COARSE) luc[entry] = 0;
+}
+
+extern "C" __global__ void cuRankFilterMultiBlock(IMAGE_T* src, IMAGE_T* dest,
+                                                  HIST_INT_T* histPar,
+                                                  HIST_INT_T* coarseHistGrid,
+                                                  int r0, int r1, int medPos_,
+                                                  int rows, int cols) {
+  __shared__ HIST_INT_T HCoarse[HIST_SIZE_COARSE];
+  __shared__ HIST_INT_T HCoarseScan[HIST_SIZE_FINE];
+  __shared__ HIST_INT_T HFine[HIST_SIZE_COARSE][HIST_SIZE_FINE];
+
+  __shared__ int luc[HIST_SIZE_COARSE];
+
+  __shared__ int firstBin, countAtMed, retval;
+
+  // the image is accessed below as src[row * row_stride + col]
+  const int row_stride = cols;  // stride (in elements) along axis 0
+
+  int extraRowThread = rows % gridDim.x;
+  int doExtraRow = blockIdx.x < extraRowThread;
+  int startRow = 0, stopRow = 0;
+  int rowsPerBlock = rows / gridDim.x + doExtraRow;
+
+  // The following code partitions the work to the blocks. Some blocks will do
+  // one row more than other blocks. This code is responsible for doing that
+  // balancing
+  if (doExtraRow) {
+    startRow = rowsPerBlock * blockIdx.x;
+    stopRow = min(rows, startRow + rowsPerBlock);
+  } else {
+    startRow = (rowsPerBlock + 1) * extraRowThread +
+               (rowsPerBlock) * (blockIdx.x - extraRowThread);
+    stopRow = min(rows, startRow + rowsPerBlock);
+  }
+
+  HIST_INT_T* hist = histPar + cols * HIST_SIZE * blockIdx.x;
+  HIST_INT_T* histCoarse =
+      coarseHistGrid + cols * HIST_SIZE_COARSE * blockIdx.x;
+
+  if (blockIdx.x == (gridDim.x - 1)) stopRow = rows;
+  __syncthreads();
+  int initNeeded = 0, initStartRow, initStopRow;
+  HIST_INT_T initVal;
+
+  if (blockIdx.x == 0) {
+    // Note: skips one iteration in the initialization loop by starting at
+    // row 1 instead of 0 and using initVal r0+2 instead of r0+1.
+    initNeeded = 1;
+    initVal = r0 + 2;
+    initStartRow = 1;
+    initStopRow = r0;
+  } else if (startRow < (r0 + 2)) {
+    initNeeded = 1;
+    initVal = r0 + 2 - startRow;
+    initStartRow = 1;
+    initStopRow = r0 + startRow;
+  } else {
+    initNeeded = 0;
+    initVal = 0;
+    initStartRow = startRow - (r0 + 1);
+    initStopRow = r0 + startRow;
+  }
+  __syncthreads();
+
+  // In the original algorithm an initialization phase was required as part of
+  // the window was outside the image. In this parallel version, the
+  // initialization is required for every thread block whose first rows have
+  // part of the filter window above the image (see initNeeded above). For all
+  // threads in the block the same code will be executed.
+  if (initNeeded) {
+    for (int j = threadIdx.x; j < (cols); j += blockDim.x) {
+      hist[j * HIST_SIZE + src[j] + HIST_OFFSET] = initVal;
+      histCoarse[j * HIST_SIZE_COARSE + ((src[j] + HIST_OFFSET) >> LOG2_FINE)] =
+          initVal;
+    }
+  }
+  __syncthreads();
+
+  // For all remaining rows in the median filter, add the values to the
+  // per-column fine and coarse histograms
+  for (int j = threadIdx.x; j < cols; j += blockDim.x) {
+    for (int i = initStartRow; i < initStopRow; i++) {
+      int pos = min(i, rows - 1);
+      hist[j * HIST_SIZE + src[pos * row_stride + j] + HIST_OFFSET]++;
+      histCoarse[j * HIST_SIZE_COARSE +
+                 ((src[pos * row_stride + j] + HIST_OFFSET) >> LOG2_FINE)]++;
+    }
+  }
+  __syncthreads();
+  // Going through all the rows that the block is responsible for.
+  int inc = blockDim.x * HIST_SIZE;
+  int incCoarse = blockDim.x * HIST_SIZE_COARSE;
+  for (int i = startRow; i < stopRow; i++) {
+    // For every new row that is started the global histogram for the entire
+    // window is restarted.
+
+    histogramClearCoarse(HCoarse);
+    lucClearCoarse(luc);
+    // Computing some necessary indices
+    int possub = max(0, i - r0 - 1), posadd = min(rows - 1, i + r0);
+    int histPos = threadIdx.x * HIST_SIZE;
+    int histCoarsePos = threadIdx.x * HIST_SIZE_COARSE;
+    // Going through all the elements of a specific row. For each histogram, a
+    // value is taken out and one value is added.
+    for (int j = threadIdx.x; j < cols; j += blockDim.x) {
+      hist[histPos + src[possub * row_stride + j] + HIST_OFFSET]--;
+      hist[histPos + src[posadd * row_stride + j] + HIST_OFFSET]++;
+      histCoarse[histCoarsePos +
+                 ((src[possub * row_stride + j] + HIST_OFFSET) >> LOG2_FINE)]--;
+      histCoarse[histCoarsePos +
+                 ((src[posadd * row_stride + j] + HIST_OFFSET) >> LOG2_FINE)]++;
+
+      histPos += inc;
+      histCoarsePos += incCoarse;
+    }
+    __syncthreads();
+
+    histogramMultipleAddCoarse(HCoarse, histCoarse, 2 * r1 + 1);
+    int cols_m_1 = cols - 1;
+
+    for (int j = r1; j < cols - r1; j++) {  // columns within r1 of an edge are skipped
+      int possub = max(j - r1, 0);
+      int posadd = min(j + 1 + r1, cols_m_1);
+      int medPos = medPos_;
+      __syncthreads();
+
+      histogramMedianParCoarseLookupOnly(HCoarse, HCoarseScan, medPos,
+                                         &firstBin, &countAtMed);
+      __syncthreads();
+
+      int loopIndex = luc[firstBin];
+      if (loopIndex <= (j - r1)) {
+        histogramClearFine(HFine[firstBin]);
+        for (loopIndex = j - r1; loopIndex < min(j + r1 + 1, cols);
+             loopIndex++) {
+          histogramAddFine(HFine[firstBin], hist + (loopIndex * HIST_SIZE +
+                                                    (firstBin << LOG2_FINE)));
+        }
+      } else {
+        for (; loopIndex < (j + r1 + 1); loopIndex++) {
+          histogramAddAndSubFine(
+              HFine[firstBin],
+              hist + (min(loopIndex, cols_m_1) * HIST_SIZE +
+                      (firstBin << LOG2_FINE)),
+              hist + (max(loopIndex - 2 * r1 - 1, 0) * HIST_SIZE +
+                      (firstBin << LOG2_FINE)));
+          __syncthreads();
+        }
+      }
+      __syncthreads();
+      luc[firstBin] = loopIndex;
+
+      int leftOver = medPos - countAtMed;
+      if (leftOver >= 0) {
+        histogramMedianParFineLookupOnly(HFine[firstBin], HCoarseScan, leftOver,
+                                         &retval, &countAtMed);
+      } else
+        retval = 0;
+      __syncthreads();
+
+      if (threadIdx.x == 0) {  // a single thread stores the output value
+        dest[i * row_stride + j] =
+            (firstBin << LOG2_FINE) + retval - HIST_OFFSET;
+      }
+      histogramAddAndSubCoarse(HCoarse,
+                               histCoarse + (int)(posadd << LOG2_COARSE),
+                               histCoarse + (int)(possub << LOG2_COARSE));
+
+      __syncthreads();
+    }
+    __syncthreads();
+  }
+}
diff --git a/python/cucim/src/cucim/skimage/filters/edges.py b/python/cucim/src/cucim/skimage/filters/edges.py
index 6b4dcff37..193da4f0d 100644
--- a/python/cucim/src/cucim/skimage/filters/edges.py
+++ b/python/cucim/src/cucim/skimage/filters/edges.py
@@ -13,7 +13,7 @@
 
 import cupy as cp
 import numpy as np
-from cupyx.scipy import ndimage as ndi
+import cucim.skimage._vendored.ndimage as ndi
 
 from .._shared.utils import _supported_float_type, check_nD
 from ..restoration.uft import laplacian
diff --git a/python/cucim/src/cucim/skimage/filters/tests/test_median.py b/python/cucim/src/cucim/skimage/filters/tests/test_median.py
index 3507032bf..6b0d8e002 100644
--- a/python/cucim/src/cucim/skimage/filters/tests/test_median.py
+++ b/python/cucim/src/cucim/skimage/filters/tests/test_median.py
@@ -2,10 +2,20 @@
 import pytest
 from cupy.testing import assert_allclose
 from cupyx.scipy import ndimage
+from skimage import data
 
 from cucim.skimage._shared.testing import expected_warnings
 from cucim.skimage.filters import median
 
+try:
+    from math import prod
+except ImportError:  # math.prod was added in Python 3.8
+    from functools import reduce
+    from operator import mul
+
+    def prod(x):
+        return reduce(mul, x, 1)  # start at 1 so prod(()) == 1, as math.prod
+
 
 @pytest.fixture
 def image():
@@ -17,6 +27,11 @@ def image():
                     dtype=cp.uint8)
 
 
+@pytest.fixture
+def camera():  # skimage's `data.camera()` image as a CuPy array
+    return cp.array(data.camera())
+
+
 # TODO: mode='rank' disabled until it has been implmented
 @pytest.mark.parametrize(
     "mode, cval, behavior, warning_type",
@@ -41,12 +56,167 @@ def test_selem_kwarg_deprecation(image):
 
 # TODO: update if rank.median implemented
 @pytest.mark.parametrize(
-    "behavior, func, params",
-    [('ndimage', ndimage.median_filter, {'size': (3, 3)})]
+    'behavior, func', [('ndimage', ndimage.median_filter)],
     # ('rank', rank.median, {'footprint': cp.ones((3, 3), dtype=cp.uint8)})]
 )
-def test_median_behavior(image, behavior, func, params):
-    assert_allclose(median(image, behavior=behavior), func(image, **params))
+@pytest.mark.parametrize(
+    'mode', ['reflect', 'mirror', 'nearest', 'constant', 'wrap']
+)
+# include even shapes and singleton shape that force non-histogram code path.
+# include some large shapes that always take the histogram-based code path.
+@pytest.mark.parametrize(
+    'footprint_shape', [
+        (3, 3), (5, 5), (9, 15), (2, 2), (1, 1), (2, 7), (23, 23), (15, 35),
+    ]
+)
+@pytest.mark.parametrize('out', [None, cp.uint8, cp.float32, 'array'])
+def test_median_behavior(camera, behavior, func, mode, footprint_shape, out):
+    footprint = cp.ones(footprint_shape, dtype=bool)
+    cam2 = camera[:, :177]  # use anisotropic size
+    assert cam2.dtype == cp.uint8
+    if out == 'array':
+        out = cp.zeros_like(cam2)
+    assert_allclose(
+        median(cam2, footprint, mode=mode, behavior=behavior, out=out),
+        func(cam2, size=footprint.shape, mode=mode, output=out),
+    )
+
+
+@pytest.mark.parametrize(
+    'mode', ['reflect', 'mirror', 'nearest', 'constant', 'wrap']
+)
+# small footprints plus an anisotropic one, (15, 23), with >= 150 elements
+@pytest.mark.parametrize('footprint_shape', [(3, 3), (3, 5), (15, 23)])
+@pytest.mark.parametrize(
+    'int_dtype', [cp.uint8, cp.int8, cp.uint16, cp.int16]
+)
+@pytest.mark.parametrize(
+    'algorithm', ['auto', 'histogram', 'sorting']
+)
+@pytest.mark.parametrize(
+    'algorithm_kwargs', [{}, {'partitions': 32}]
+)
+def test_median_hist_dtypes(
+    mode, footprint_shape, int_dtype, algorithm, algorithm_kwargs,
+):
+    footprint = cp.ones(footprint_shape, dtype=bool)
+    rng = cp.random.default_rng(123)
+    shape = (350, 407)
+    if int_dtype == cp.uint8:
+        img = rng.integers(0, 256, shape, dtype=cp.uint8)
+    elif int_dtype == cp.int8:
+        img = rng.integers(-128, 128, shape, dtype=int).astype(cp.int8)
+    elif int_dtype == cp.uint16:
+        if False:
+            # test with 12-bit range stored in 16-bit integers (e.g. DICOM)
+            img = rng.integers(0, 4096, shape, dtype=cp.uint16)
+        else:
+            # smaller dynamic range
+            #    (range 4096 fails only on CI, but couldn't reproduce locally)
+            img = rng.integers(0, 1024, shape, dtype=cp.uint16)
+    elif int_dtype == cp.int16:
+        # chose a limited range of values to test 512 hist_size case
+        img = rng.integers(-128, 384, shape, dtype=int).astype(cp.int16)
+
+    # 150 is the value used to auto-select between sorting vs. histogram
+    small_kernel = prod(footprint_shape) < 150
+    if algorithm_kwargs and (
+        algorithm == 'sorting'
+        or (algorithm == 'auto' and small_kernel)
+    ):
+        msg = ["algorithm_kwargs={'partitions': 32} ignored"]
+    else:
+        msg = []
+    with expected_warnings(msg):
+        out = median(img, footprint, mode=mode, behavior='ndimage',
+                     algorithm=algorithm, algorithm_kwargs=algorithm_kwargs)
+    expected = ndimage.median_filter(img, size=footprint.shape, mode=mode)
+    assert_allclose(expected, out)
+
+
+# TODO: Determine source of isolated remote test failures when 16-bit range
+#       is > 1024. Could not reproduce locally.
+@pytest.mark.parametrize('mode', ['reflect', ])
+# anisotropic footprint; the test requests algorithm='histogram' explicitly
+@pytest.mark.parametrize('footprint_shape', [(7, 11)])
+@pytest.mark.parametrize(
+    'int_dtype, irange',
+    [
+        (cp.uint16, (0, 256)),
+        (cp.uint16, (0, 15)),
+        (cp.uint16, (128, 384)),
+        (cp.uint16, (0, 200)),
+        (cp.uint16, (0, 510)),
+        (cp.uint16, (500, 550)),
+        (cp.uint16, (0, 1024)),
+        pytest.param(cp.uint16, (0, 2048), marks=pytest.mark.skip(reason="isolated failure on CI only")),  # noqa
+        pytest.param(cp.uint16, (1024, 3185), marks=pytest.mark.skip(reason="isolated failure on CI only")),  # noqa
+        (cp.int16, (0, 256)),
+        (cp.int16, (-15, 15)),
+        (cp.int16, (128, 384)),
+        (cp.int16, (-128, 384)),
+        (cp.int16, (-400, 400)),
+        pytest.param(cp.int16, (-1024, 2048), marks=pytest.mark.skip(reason="isolated failure on CI only")),  # noqa
+        pytest.param(cp.int16, (150, 2048), marks=pytest.mark.skip(reason="isolated failure on CI only")),  # noqa
+    ]
+)
+def test_median_hist_16bit_offsets(mode, footprint_shape, int_dtype, irange):
+    """Make sure 16-bit cases are robust to various value ranges"""
+    footprint = cp.ones(footprint_shape, dtype=bool)
+    rng = cp.random.default_rng(123)
+    shape = (350, 407)
+    if int_dtype == cp.uint16:
+        # draw the values directly as uint16 from the parametrized irange
+        img = rng.integers(irange[0], irange[1], shape, dtype=cp.uint16)
+    elif int_dtype == cp.int16:
+        # draw from irange as `int`, then cast the result to int16
+        img = rng.integers(irange[0], irange[1], shape, dtype=int)
+        img = img.astype(cp.int16)
+    out = median(img, footprint, mode=mode, behavior='ndimage',
+                 algorithm='histogram')
+    expected = ndimage.median_filter(img, size=footprint.shape, mode=mode)
+    assert_allclose(expected, out)
+
+
+@pytest.mark.parametrize('int_dtype', [cp.uint16, cp.int16])
+def test_median_hist_kernel_resource_limit_try_except(int_dtype):
+    # use an anisotropic footprint large enough to trigger
+    # the histogram-based path
+    footprint = cp.ones((15, 23), dtype=bool)
+    mode = 'nearest'
+    rng = cp.random.default_rng(123)
+    shape = (350, 407)  # use anisotropic size
+    if int_dtype == cp.uint16:
+        # test with range likely to exceed the shared memory limit
+        img = rng.integers(0, 65536, shape, dtype=cp.uint16)
+    elif int_dtype == cp.int16:
+        # test with range likely to exceed the shared memory limit
+        img = rng.integers(-32768, 32767, shape, dtype=int).astype(cp.int16)
+    out = median(img, footprint, mode=mode)  # `algorithm` left at its default
+    expected = ndimage.median_filter(img, size=footprint.shape, mode=mode)
+    assert_allclose(expected, out)
+
+
+@pytest.mark.parametrize(
+    'algorithm', ['auto', 'histogram', 'sorting', 'invalid']
+)
+def test_median_algorithm_parameter(algorithm):
+    """Call all algorithms for float32 input.
+    """
+    footprint = cp.ones((15, 23), dtype=bool)
+    mode = 'nearest'
+    rng = cp.random.default_rng(123)
+    shape = (350, 407)  # use anisotropic size
+    img = rng.standard_normal(shape, dtype=cp.float32)
+    if algorithm in ['invalid', 'histogram']:
+        # histogram supports only integer-valued dtypes
+        # 'invalid' is an unrecognized algorithm
+        with pytest.raises(ValueError):
+            median(img, footprint, mode=mode, algorithm=algorithm)
+    else:
+        out = median(img, footprint, mode=mode, algorithm=algorithm)
+        expected = ndimage.median_filter(img, size=footprint.shape, mode=mode)
+        assert_allclose(expected, out)
 
 
 @pytest.mark.parametrize(
diff --git a/python/cucim/src/cucim/skimage/filters/tests/test_separable_filtering.py b/python/cucim/src/cucim/skimage/filters/tests/test_separable_filtering.py
new file mode 100644
index 000000000..0f2c5897f
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/filters/tests/test_separable_filtering.py
@@ -0,0 +1,333 @@
+import cupy as cp
+import pytest
+
+from cucim.skimage._vendored.ndimage import (convolve1d, correlate1d,
+                                             gaussian_filter, gaussian_filter1d,
+                                             gaussian_gradient_magnitude,
+                                             gaussian_laplace, laplace, prewitt,
+                                             sobel, uniform_filter,
+                                             uniform_filter1d)
+
+
+def _get_image(shape, dtype, seed=123):
+    """Return a reproducible pseudo-random test array of the given dtype."""
+    rng = cp.random.default_rng(seed)
+    dtype = cp.dtype(dtype)
+    if dtype.kind == 'b':
+        # high is exclusive: use 2 so that both False and True are generated
+        return rng.integers(0, 2, shape, dtype=cp.uint8).astype(bool)
+    if dtype.kind in 'iu':
+        return rng.integers(0, 128, shape, dtype=dtype)
+    if dtype.kind == 'c':
+        real_dtype = cp.asarray([], dtype=dtype).real.dtype
+        image = rng.standard_normal(shape, dtype=real_dtype)
+        return image + 1j * rng.standard_normal(shape, dtype=real_dtype)
+    if dtype == cp.float16:
+        # generate at the default precision, then cast down to float16
+        return rng.standard_normal(shape).astype(dtype)
+    return rng.standard_normal(shape, dtype=dtype)
+
+
+def _get_rtol_atol(dtype):
+    """Return (rtol, atol) chosen from the real-valued precision of dtype."""
+    real_dtype = cp.array([], dtype=dtype).real.dtype
+    if real_dtype == cp.float64:
+        return 1e-12, 1e-12
+    if real_dtype == cp.float16:
+        return 1e-3, 1e-3
+    return 1e-5, 1e-5
+
+
+def _compare_implementations(
+    shape, kernel_size, axis, dtype, mode, cval=0.0, origin=0,
+    output_dtype=None, kernel_dtype=None, output_preallocated=False,
+    function=convolve1d,
+):
+    """Assert that both `algorithm` choices of `function` agree.
+
+    A random image and 1D kernel are filtered once with
+    ``algorithm='elementwise'`` and once with ``algorithm='shared_memory'``.
+    The two outputs are compared using tolerances based on the kernel dtype.
+    With `output_preallocated`, arrays are passed via `output`; otherwise
+    `output_dtype` (possibly None) is passed as `output`.
+    """
+    dtype = cp.dtype(dtype)
+    image = _get_image(shape, dtype)
+    if kernel_dtype is None:
+        kernel_dtype = dtype
+    kernel = _get_image((kernel_size,), kernel_dtype)
+    rtol, atol = _get_rtol_atol(kernel.dtype)
+    if output_dtype is not None:
+        output_dtype = cp.dtype(output_dtype)
+    if output_preallocated and output_dtype is None:
+        output_dtype = image.dtype
+    results = []
+    for algorithm in ('elementwise', 'shared_memory'):
+        kwargs = dict(
+            axis=axis, mode=mode, cval=cval, origin=origin, algorithm=algorithm
+        )
+        if output_preallocated:
+            out = cp.empty(image.shape, dtype=output_dtype)
+            function(image, kernel, output=out, **kwargs)
+        else:
+            out = function(image, kernel, output=output_dtype, **kwargs)
+        results.append(out)
+    cp.testing.assert_allclose(results[0], results[1], rtol=rtol, atol=atol)
+
+
+def _compare_implementations_other(
+    shape, dtype, mode, cval=0.0,
+    output_dtype=None, kernel_dtype=None, output_preallocated=False,
+    function=convolve1d, func_kwargs=None,
+):
+    """Assert that both `algorithm` choices agree for a kernel-free filter.
+
+    `func_kwargs` supplies extra keyword arguments for `function`;
+    `kernel_dtype` is unused. Tolerances are based on the image dtype.
+    """
+    dtype = cp.dtype(dtype)
+    image = _get_image(shape, dtype)
+    rtol, atol = _get_rtol_atol(image.dtype)
+    kwargs = dict(mode=mode, cval=cval)
+    if func_kwargs:  # None or {} means there is nothing to add
+        kwargs.update(func_kwargs)
+    if output_dtype is not None:
+        output_dtype = cp.dtype(output_dtype)
+    if output_preallocated and output_dtype is None:
+        output_dtype = image.dtype
+    results = []
+    for algorithm in ('elementwise', 'shared_memory'):
+        kw = dict(kwargs, algorithm=algorithm)
+        if output_preallocated:
+            out = cp.empty(image.shape, dtype=output_dtype)
+            function(image, output=out, **kw)
+        else:
+            out = function(image, output=output_dtype, **kw)
+        results.append(out)
+    cp.testing.assert_allclose(results[0], results[1], rtol=rtol, atol=atol)
+
+
+@pytest.mark.parametrize('shape', ((64, 57), (1000, 500)))
+@pytest.mark.parametrize('axis', (0, 1))
+@pytest.mark.parametrize('origin', ('min', 0, 'max'))
+@pytest.mark.parametrize('kernel_size', tuple(range(1, 17)))
+@pytest.mark.parametrize('function', [convolve1d, correlate1d])
+def test_separable_kernel_sizes_and_origins(
+    shape, axis, origin, kernel_size, function
+):
+    """Compare both algorithms for kernel sizes 1-16 and several origins."""
+    if kernel_size == 1:
+        origin = 0
+    elif origin == 'min':
+        origin = -(kernel_size // 2)
+    elif origin == 'max':
+        # 'max' is one smaller for even-sized kernels
+        origin = kernel_size // 2 - (1 if kernel_size % 2 == 0 else 0)
+    _compare_implementations(
+        shape,
+        kernel_size=kernel_size,
+        axis=axis,
+        dtype=cp.float32,
+        mode='nearest',
+        origin=origin,
+        function=function,
+    )
+
+
+@pytest.mark.parametrize('shape', ((64, 57), (1000, 500)))
+@pytest.mark.parametrize('axis', (0, 1))
+@pytest.mark.parametrize(
+    'kernel_size',
+    tuple(range(17, 129, 11)) + tuple(range(145, 275, 41))
+)
+def test_separable_kernel_larger_sizes(shape, axis, kernel_size):
+    """Compare both algorithms for kernels with 17 to 268 elements."""
+    settings = dict(dtype=cp.float32, mode='reflect', origin=0)
+    _compare_implementations(
+        shape,
+        kernel_size=kernel_size,
+        axis=axis,
+        **settings,
+    )
+
+
+@pytest.mark.parametrize('shape', ((1000, 500),))
+@pytest.mark.parametrize('axis', (0, 1))
+def test_separable_elementwise_very_large_size_fallback(shape, axis):
+    """Use a 901-element float64 kernel so shared memory is likely exceeded."""
+    _compare_implementations(
+        shape,
+        kernel_size=901,
+        axis=axis,
+        dtype=cp.float64,
+        mode='nearest',
+        origin=0,
+    )
+
+
+@pytest.mark.parametrize('shape', ((4000, 2000), (1, 1), (5, 500), (1500, 5)))
+@pytest.mark.parametrize('axis', (-1, -2))
+@pytest.mark.parametrize('kernel_size', (1, 38, 129))
+@pytest.mark.parametrize(
+    'mode',
+    ('nearest', 'reflect', 'wrap', 'mirror', 'constant', ('constant', 1)),
+)
+def test_separable_image_shapes_and_modes(shape, axis, kernel_size, mode):
+    """Compare both algorithms across 2D image shapes and boundary modes."""
+    if isinstance(mode, tuple):
+        mode, cval = mode
+    else:
+        cval = 0
+    # a tuple parameter is a (mode, cval) pair; plain modes use cval=0
+    _compare_implementations(
+        shape,
+        kernel_size=kernel_size,
+        axis=axis,
+        dtype=cp.float32,
+        mode=mode,
+        cval=cval,
+        origin=0,
+    )
+
+
+image_dtypes_tested = (
+    cp.float16, cp.float32, cp.float64, cp.complex64, cp.complex128, bool,
+    cp.int8, cp.uint8, cp.int16, cp.uint16, cp.int32, cp.uint32, cp.int64,
+    cp.uint64,
+)
+
+
+@pytest.mark.parametrize('axis', (0, 1))
+@pytest.mark.parametrize('image_dtype', image_dtypes_tested)
+@pytest.mark.parametrize(
+    'kernel_dtype', (None, cp.float32, cp.uint8, cp.complex64)
+)
+def test_separable_image_and_kernel_dtypes(axis, image_dtype, kernel_dtype):
+    """Test many kernel and image dtype combinations"""
+    # kernel_dtype=None makes _compare_implementations reuse the image dtype
+    _compare_implementations(
+        (64, 32),
+        kernel_size=3,
+        axis=axis,
+        dtype=image_dtype,
+        mode='nearest',
+        origin=0,
+        kernel_dtype=kernel_dtype,
+    )
+
+
+@pytest.mark.parametrize('axis', (0, 1))
+@pytest.mark.parametrize('image_dtype', image_dtypes_tested)
+@pytest.mark.parametrize(
+    'output_dtype', (None, cp.float32, cp.int32, cp.complex64)
+)
+@pytest.mark.parametrize('output_preallocated', (False, True))
+def test_separable_input_and_output_dtypes(
+    axis, image_dtype, output_dtype, output_preallocated
+):
+    """Test many image and output dtype combinations"""
+    if cp.dtype(image_dtype).kind == 'c' and output_dtype is not None:
+        if not cp.dtype(output_dtype).kind == 'c':
+            pytest.skip('cannot cast complex values to real')
+    _compare_implementations(
+        (64, 32),
+        kernel_size=3,
+        axis=axis,
+        dtype=image_dtype,
+        mode='nearest',
+        origin=0,
+        kernel_dtype=None,
+        output_dtype=output_dtype,
+        output_preallocated=output_preallocated,
+    )
+
+
+@pytest.mark.parametrize('shape', ((64, 57),))
+@pytest.mark.parametrize(
+    'function, func_kwargs',
+    [
+        (gaussian_filter, dict(sigma=1.5)),
+        (gaussian_filter1d, dict(sigma=1.5, axis=0)),
+        (gaussian_filter1d, dict(sigma=1.5, axis=1)),
+        (gaussian_gradient_magnitude, dict(sigma=3.5)),
+        (gaussian_laplace, dict(sigma=2.5)),
+        (laplace, {}),
+        (prewitt, {}),
+        (sobel, {}),
+        (uniform_filter, dict(size=7)),
+        (uniform_filter1d, dict(size=7, axis=0)),
+        (uniform_filter1d, dict(size=7, axis=1)),
+    ]
+)
+def test_separable_internal_kernel(shape, function, func_kwargs):
+    """
+    Test case to make sure the 'algorithm' kwarg works for all other separable
+    ndimage filters as well.
+
+    Only `shape`, `function` and `func_kwargs` are parametrized: any axis
+    argument a filter needs is given in `func_kwargs`, so no separate
+    'axis' or 'origin' parameters are used.
+    """
+    _compare_implementations_other(
+        shape,
+        dtype=cp.float32,
+        mode='nearest',
+        function=function,
+        func_kwargs=func_kwargs,
+    )
+
+
+@pytest.mark.parametrize('shape', ((16, 24, 32), (192, 128, 160)))
+@pytest.mark.parametrize('axis', (0, 1, 2))
+@pytest.mark.parametrize('kernel_size', tuple(range(1, 17, 3)))
+@pytest.mark.parametrize('function', [convolve1d, correlate1d])
+def test_separable_kernel_sizes_3d(shape, axis, kernel_size, function):
+    """Compare both algorithms on 3D images along each of the three axes."""
+    # kernel sizes tested: 1, 4, 7, 10, 13, 16
+    _compare_implementations(
+        shape,
+        kernel_size=kernel_size,
+        axis=axis,
+        dtype=cp.float32,
+        mode='nearest',
+        origin=0,
+        function=function,
+    )
+
+
+@pytest.mark.parametrize('axis', (0, 1, 2))
+@pytest.mark.parametrize('kernel_size', (65, 129, 198))
+def test_separable_large_kernel_3d(axis, kernel_size):
+    """Compare both algorithms for large kernels on a 3D float32 image."""
+    _compare_implementations(
+        shape=(256, 128, 96),
+        kernel_size=kernel_size,
+        axis=axis,
+        dtype=cp.float32,
+        mode='reflect', origin=0,
+    )
+
+
+@pytest.mark.parametrize(
+    'shape', ((64, 5, 64), (5, 64, 64), (64, 64, 5), (32, 32, 32))
+)
+@pytest.mark.parametrize('axis', (-1, -2, -3))
+@pytest.mark.parametrize('kernel_size', (9,))
+@pytest.mark.parametrize(
+    'mode',
+    ('nearest', 'reflect', 'wrap', 'mirror', 'constant', ('constant', 1)),
+)
+def test_separable_image_shapes_and_modes_3d(shape, axis, kernel_size, mode):
+    """Compare both algorithms across 3D image shapes and boundary modes."""
+    # a tuple parameter is a (mode, cval) pair; plain mode strings are
+    # paired with cval=0
+    mode, cval = mode if isinstance(mode, tuple) else (mode, 0)
+    _compare_implementations(
+        shape,
+        kernel_size=kernel_size,
+        axis=axis,
+        dtype=cp.float32,
+        mode=mode,
+        cval=cval,
+        origin=0,
+    )
diff --git a/python/cucim/src/cucim/skimage/filters/thresholding.py b/python/cucim/src/cucim/skimage/filters/thresholding.py
index 8ca274209..14acc342e 100644
--- a/python/cucim/src/cucim/skimage/filters/thresholding.py
+++ b/python/cucim/src/cucim/skimage/filters/thresholding.py
@@ -6,7 +6,7 @@
 
 import cupy as cp
 import numpy as np
-from cupyx.scipy import ndimage as ndi
+import cucim.skimage._vendored.ndimage as ndi
 from skimage.filters import threshold_isodata as _threshold_isodata_cpu
 from skimage.filters import threshold_minimum as _threshold_minimum_cpu
 from skimage.filters import threshold_multiotsu as _threshold_multiotsu_cpu
@@ -735,8 +735,8 @@ def threshold_li(image, *, tolerance=None, initial_guess=None,
     return threshold
 
 
-@deprecate_kwarg({'max_iter': 'max_num_iter'}, removed_version="1.0",
-                 deprecated_version="0.19")
+@deprecate_kwarg({'max_iter': 'max_num_iter'}, removed_version="23.02.00",
+                 deprecated_version="22.02.00")
 def threshold_minimum(image=None, nbins=256, max_num_iter=10000, *, hist=None):
     """Return threshold value based on minimum method.
 
diff --git a/python/cucim/src/cucim/skimage/measure/_blur_effect.py b/python/cucim/src/cucim/skimage/measure/_blur_effect.py
index 4e0fe6fe9..69421512e 100644
--- a/python/cucim/src/cucim/skimage/measure/_blur_effect.py
+++ b/python/cucim/src/cucim/skimage/measure/_blur_effect.py
@@ -1,5 +1,5 @@
 import cupy as cp
-import cupyx.scipy.ndimage as ndi
+import cucim.skimage._vendored.ndimage as ndi
 
 from ..color import rgb2gray
 from ..util import img_as_float
diff --git a/python/cucim/src/cucim/skimage/measure/_label.py b/python/cucim/src/cucim/skimage/measure/_label.py
index 9dc2fc2bd..c9969cdff 100644
--- a/python/cucim/src/cucim/skimage/measure/_label.py
+++ b/python/cucim/src/cucim/skimage/measure/_label.py
@@ -16,8 +16,8 @@ def _get_structure(ndim, connectivity):
 
 # TODO: currently uses int32 for the labels. should add int64 option as well
 @deprecate_kwarg({'input': 'label_image'},
-                 deprecated_version='0.19',
-                 removed_version='1.0')
+                 deprecated_version='22.02.00',
+                 removed_version='23.02.00')
 def label(label_image, background=None, return_num=False, connectivity=None):
     r"""Label connected regions of an integer array.
 
diff --git a/python/cucim/src/cucim/skimage/measure/profile.py b/python/cucim/src/cucim/skimage/measure/profile.py
index 6643dc034..fc0b0cf1f 100644
--- a/python/cucim/src/cucim/skimage/measure/profile.py
+++ b/python/cucim/src/cucim/skimage/measure/profile.py
@@ -2,7 +2,7 @@
 
 import cupy as cp
 import numpy as np
-from cupyx.scipy import ndimage as ndi
+import cucim.skimage._vendored.ndimage as ndi
 
 from .._shared.utils import _fix_ndimage_mode, _validate_interpolation_order
 
diff --git a/python/cucim/src/cucim/skimage/measure/tests/test_blur_effect.py b/python/cucim/src/cucim/skimage/measure/tests/test_blur_effect.py
index ff5b2ac5e..3b0013005 100644
--- a/python/cucim/src/cucim/skimage/measure/tests/test_blur_effect.py
+++ b/python/cucim/src/cucim/skimage/measure/tests/test_blur_effect.py
@@ -1,6 +1,7 @@
 import cupy as cp
+import pytest
 from cupy.testing import assert_array_equal
-from skimage.data import astronaut, cells3d
+from skimage.data import astronaut
 
 from cucim.skimage.color import rgb2gray
 from cucim.skimage.filters import gaussian
@@ -47,6 +48,7 @@ def test_blur_effect_channel_axis():
 
 def test_blur_effect_3d():
     """Test that the blur metric works on a 3D image."""
+    cells3d = pytest.importorskip('skimage.data.cells3d')
     image_3d = cp.array(cells3d()[:, 1, :, :])  # grab just the nuclei
     B0 = blur_effect(image_3d)
     B1 = blur_effect(gaussian(image_3d, sigma=1))
diff --git a/python/cucim/src/cucim/skimage/metrics/_structural_similarity.py b/python/cucim/src/cucim/skimage/metrics/_structural_similarity.py
index 6bfa07d31..44d0e28e5 100644
--- a/python/cucim/src/cucim/skimage/metrics/_structural_similarity.py
+++ b/python/cucim/src/cucim/skimage/metrics/_structural_similarity.py
@@ -1,7 +1,7 @@
 import functools
 
 import cupy as cp
-from cupyx.scipy.ndimage import uniform_filter
+import cucim.skimage._vendored.ndimage as ndi
 
 from .._shared import utils
 from .._shared.filters import gaussian
@@ -187,7 +187,7 @@ def structural_similarity(im1, im2,
         filter_func = gaussian
         filter_args = {'sigma': sigma, 'truncate': truncate, 'mode': 'reflect'}
     else:
-        filter_func = uniform_filter
+        filter_func = ndi.uniform_filter
         filter_args = {'size': win_size}
 
     # ndimage filters need floating point data
diff --git a/python/cucim/src/cucim/skimage/morphology/__init__.py b/python/cucim/src/cucim/skimage/morphology/__init__.py
index e3a5423b4..cf1fbb774 100644
--- a/python/cucim/src/cucim/skimage/morphology/__init__.py
+++ b/python/cucim/src/cucim/skimage/morphology/__init__.py
@@ -1,4 +1,4 @@
-from ._skeletonize import thin
+from ._skeletonize import medial_axis, thin
 from .binary import (binary_closing, binary_dilation, binary_erosion,
                      binary_opening)
 from .footprints import (ball, cube, diamond, disk, octagon, octahedron,
@@ -32,4 +32,5 @@
     "remove_small_objects",
     "remove_small_holes",
     "thin",
+    "medial_axis",
 ]
diff --git a/python/cucim/src/cucim/skimage/morphology/_medial_axis_lookup.py b/python/cucim/src/cucim/skimage/morphology/_medial_axis_lookup.py
new file mode 100644
index 000000000..37f40b13e
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/morphology/_medial_axis_lookup.py
@@ -0,0 +1,67 @@
+import numpy as np
+
+# medial axis lookup tables (independent of image content)
+#
+# Note: lookup table generated using scikit-image code from
+# https://github.com/scikit-image/scikit-image/blob/38b595d60befe3a0b4c0742995b9737200a079c6/skimage/morphology/_skeletonize.py#L449-L458  # noqa
+
+lookup_table = np.array(
+    [
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
+        0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+        0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+        0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
+        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
+        0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0
+    ],
+    dtype=bool,
+)
+
+
+cornerness_table = np.array(
+    [
+        9, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 7, 7, 6, 7, 6,
+        6, 5, 7, 6, 6, 5, 6, 5, 5, 4, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5,
+        6, 5, 5, 4, 7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 7,
+        7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4, 7, 6, 6, 5, 6, 5, 5, 4,
+        6, 5, 5, 4, 5, 4, 4, 3, 7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4,
+        4, 3, 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2, 8, 7, 7, 6,
+        7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4, 7, 6, 6, 5, 6, 5, 5, 4, 6, 5,
+        5, 4, 5, 4, 4, 3, 7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3,
+        6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2, 7, 6, 6, 5, 6, 5,
+        5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3,
+        4, 3, 3, 2, 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2, 5, 4,
+        4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1, 8, 7, 7, 6, 7, 6, 6, 5,
+        7, 6, 6, 5, 6, 5, 5, 4, 7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4,
+        4, 3, 7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 6, 5, 5, 4,
+        5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2, 7, 6, 6, 5, 6, 5, 5, 4, 6, 5,
+        5, 4, 5, 4, 4, 3, 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2,
+        6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2, 5, 4, 4, 3, 4, 3,
+        3, 2, 4, 3, 3, 2, 3, 2, 2, 1, 7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4,
+        5, 4, 4, 3, 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2, 6, 5,
+        5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2, 5, 4, 4, 3, 4, 3, 3, 2,
+        4, 3, 3, 2, 3, 2, 2, 1, 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3,
+        3, 2, 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1, 5, 4, 4, 3,
+        4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1, 4, 3, 3, 2, 3, 2, 2, 1, 3, 2,
+        2, 1, 2, 1, 1, 0
+    ],
+    dtype=np.uint8,
+)
diff --git a/python/cucim/src/cucim/skimage/morphology/_skeletonize.py b/python/cucim/src/cucim/skimage/morphology/_skeletonize.py
index 4a93622de..2bc06dec1 100644
--- a/python/cucim/src/cucim/skimage/morphology/_skeletonize.py
+++ b/python/cucim/src/cucim/skimage/morphology/_skeletonize.py
@@ -1,8 +1,15 @@
+import warnings
+
 import cupy as cp
-import cupyx.scipy.ndimage as ndi
+import cucim.skimage._vendored.ndimage as ndi
 import numpy as np
 
+from cucim.core.operations.morphology import distance_transform_edt
+
 from .._shared.utils import check_nD, deprecate_kwarg
+from ._medial_axis_lookup import \
+    cornerness_table as _medial_axis_cornerness_table
+from ._medial_axis_lookup import lookup_table as _medial_axis_lookup_table
 
 # --------- Skeletonization and thinning based on Guo and Hall 1989 ---------
 
@@ -39,8 +46,8 @@
                        0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=bool)
 
 
-@deprecate_kwarg({'max_iter': 'max_num_iter'}, removed_version="1.0",
-                 deprecated_version="0.19")
+@deprecate_kwarg({"max_iter": "max_num_iter"}, removed_version="23.02.00",
+                 deprecated_version="22.02.00")
 def thin(image, max_num_iter=None):
     """
     Perform morphological thinning of a binary image.
@@ -62,7 +69,7 @@ def thin(image, max_num_iter=None):
 
     See Also
     --------
-    skeletonize, medial_axis
+    medial_axis
 
     Notes
     -----
@@ -131,7 +138,7 @@ def thin(image, max_num_iter=None):
         # perform the two "subiterations" described in the paper
         for lut in [G123_LUT, G123P_LUT]:
             # correlate image with neighborhood mask
-            N = ndi.correlate(skel, mask, mode='constant')
+            N = ndi.correlate(skel, mask, mode="constant")
             # take deletion decision from this subiteration's LUT
             D = cp.take(lut, N)
             # perform deletion
@@ -141,3 +148,218 @@ def thin(image, max_num_iter=None):
         num_iter += 1
 
     return skel.astype(bool)
+
+
+# --------- Skeletonization by medial axis transform --------
+
+
+def _get_tiebreaker(n, random_seed):
+    """Return a permutation of ``cp.arange(n)`` seeded by `random_seed`."""
+    # CuPy generator doesn't currently have the permutation method, so
+    # fall back to cp.random.permutation instead. Note that cp.random.seed
+    # reseeds CuPy's global random number generator.
+    cp.random.seed(random_seed)
+    # 2 << 31 is 2**32, not the int32 limit, so compare against the real
+    # int32 maximum; larger index ranges need the platform index type.
+    dtype = np.int32 if n <= np.iinfo(np.int32).max else np.intp
+    return cp.random.permutation(cp.arange(n, dtype=dtype))
+
+
+def medial_axis(image, mask=None, return_distance=False, *, random_state=None):
+    """Compute the medial axis transform of a binary image.
+
+    Parameters
+    ----------
+    image : binary ndarray, shape (M, N)
+        The image of the shape to be skeletonized.
+    mask : binary ndarray, shape (M, N), optional
+        If a mask is given, only those elements in `image` with a true
+        value in `mask` are used for computing the medial axis.
+    return_distance : bool, optional
+        If true, the distance transform is returned as well as the skeleton.
+    random_state : {None, int}, optional
+        Seed used to break ties between pixels that have equal distance
+        and cornerness. It is passed to ``cupy.random.seed``, so calling
+        this function reseeds CuPy's global random number generator.
+        NOTE(review): scikit-image also accepts a
+        ``numpy.random.Generator`` here; ``cupy.random.seed`` presumably
+        does not, so only None or an int should be passed - confirm.
+
+        .. versionadded:: 0.19
+
+    Returns
+    -------
+    out : ndarray of bools
+        Medial axis transform of the image
+    dist : ndarray of ints, optional
+        Distance transform of the image (only returned if `return_distance`
+        is True)
+
+    See Also
+    --------
+    thin
+
+    Notes
+    -----
+    This algorithm computes the medial axis transform of an image
+    as the ridges of its distance transform.
+
+    The different steps of the algorithm are as follows
+     * A lookup table is used, that assigns 0 or 1 to each configuration of
+       the 3x3 binary square, whether the central pixel should be removed
+       or kept. We want a point to be removed if it has more than one neighbor
+       and if removing it does not change the number of connected components.
+
+     * The distance transform to the background is computed, as well as
+       the cornerness of the pixel.
+
+     * The foreground (value of 1) points are ordered by
+       the distance transform, then the cornerness.
+
+     * A cython function is called to reduce the image to its skeleton. It
+       processes pixels in the order determined at the previous step, and
+       removes or maintains a pixel according to the lookup table. Because
+       of the ordering, it is possible to process all pixels in only one
+       pass.
+
+    Examples
+    --------
+    >>> square = cp.zeros((7, 7), dtype=cp.uint8)
+    >>> square[1:-1, 2:-2] = 1
+    >>> square
+    array([[0, 0, 0, 0, 0, 0, 0],
+           [0, 0, 1, 1, 1, 0, 0],
+           [0, 0, 1, 1, 1, 0, 0],
+           [0, 0, 1, 1, 1, 0, 0],
+           [0, 0, 1, 1, 1, 0, 0],
+           [0, 0, 1, 1, 1, 0, 0],
+           [0, 0, 0, 0, 0, 0, 0]], dtype=uint8)
+    >>> medial_axis(square).astype(cp.uint8)
+    array([[0, 0, 0, 0, 0, 0, 0],
+           [0, 0, 1, 0, 1, 0, 0],
+           [0, 0, 0, 1, 0, 0, 0],
+           [0, 0, 0, 1, 0, 0, 0],
+           [0, 0, 0, 1, 0, 0, 0],
+           [0, 0, 1, 0, 1, 0, 0],
+           [0, 0, 0, 0, 0, 0, 0]], dtype=uint8)
+
+    """
+    try:
+        from skimage.morphology._skeletonize_cy import _skeletonize_loop
+    except ImportError as e:
+        warnings.warn(
+            "Could not find required private skimage Cython function:\n"
+            "\tskimage.morphology._skeletonize_cy._skeletonize_loop\n"
+        )
+        raise e
+
+    if mask is None:
+        # masked_image is modified in-place later so make a copy of the input
+        masked_image = image.astype(bool, copy=True)
+    else:
+        masked_image = image.astype(bool, copy=True)
+        masked_image[~mask] = False
+
+    # Load precomputed lookup table based on three conditions:
+    # 1. Keep only positive pixels
+    # AND
+    # 2. Keep if removing the pixel results in a different connectivity
+    # (if the number of connected components is different with and
+    # without the central pixel)
+    # OR
+    # 3. Keep if # pixels in neighborhood is 2 or less
+    # Note that this table is independent of the image
+    table = _medial_axis_lookup_table
+
+    # Build distance transform
+    distance = distance_transform_edt(masked_image)
+    if return_distance:
+        store_distance = distance.copy()
+
+    # Corners
+    # The processing order along the edge is critical to the shape of the
+    # resulting skeleton: if you process a corner first, that corner will
+    # be eroded and the skeleton will miss the arm from that corner. Pixels
+    # with fewer neighbors are more "cornery" and should be processed last.
+    # We use a cornerness_table lookup table where the score of a
+    # configuration is the number of background (0-value) pixels in the
+    # 3x3 neighborhood
+    cornerness_table = cp.asarray(_medial_axis_cornerness_table)
+    corner_score = _table_lookup(masked_image, cornerness_table)
+
+    # Define arrays for inner loop
+    distance = distance[masked_image]
+    i, j = cp.where(masked_image)
+
+    # Determine the order in which pixels are processed.
+    # We use a random # for tiebreaking. Assign each pixel in the image a
+    # predictable, random # so that masking doesn't affect arbitrary choices
+    # of skeletons
+    tiebreaker = _get_tiebreaker(n=distance.size, random_seed=random_state)
+    order = cp.lexsort(  # last key (distance) is the primary key
+        cp.stack(
+            (tiebreaker, corner_score[masked_image], distance),
+            axis=0
+        )
+    )
+
+    # Call _skeletonize_loop on the CPU. It requires a single pass over the
+    # full array using a specific pixel order, so cannot be run multithreaded!
+    order = cp.asnumpy(order.astype(cp.int32, copy=False))
+    table = cp.asnumpy(table.astype(cp.uint8, copy=False))
+    i = cp.asnumpy(i).astype(dtype=np.intp, copy=False)
+    j = cp.asnumpy(j).astype(dtype=np.intp, copy=False)
+    result = cp.asnumpy(masked_image)
+    # Remove pixels not belonging to the medial axis
+    _skeletonize_loop(result.view(np.uint8), i, j, order, table)  # in-place
+    result = cp.asarray(result.view(bool), dtype=bool)
+
+    if mask is not None:
+        result[~mask] = image[~mask]  # outside the mask, keep the input
+    if return_distance:
+        return result, store_distance
+    else:
+        return result
+
+
+def _table_lookup(image, table):
+    """
+    Perform a morphological transform on an image, directed by its
+    neighbors
+
+    Parameters
+    ----------
+    image : ndarray
+        A binary image
+    table : ndarray
+        A 512-element table giving the transform of each pixel given
+        the values of that pixel and its 8-connected neighbors.
+
+    Returns
+    -------
+    result : ndarray of same shape as `image`
+        Transformed image
+
+    Notes
+    -----
+    The pixels are numbered like this::
+
+      0 1 2
+      3 4 5
+      6 7 8
+
+    The index at a pixel is the sum of 2**<pixel-number> for pixels
+    that evaluate to true.
+    """
+    # The 3x3 kernel holds one power of two per neighborhood position, so
+    # the convolution packs each pixel's binary neighborhood into a single
+    # integer that indexes `table`. mode="constant" pads beyond the image
+    # edges with cval, which is left at its default here.
+    # The largest index is 511 (all nine pixels set), so int16 is ample.
+    kernel = cp.array(
+        [[256, 128, 64], [32, 16, 8], [4, 2, 1]],
+        dtype=cp.int16
+    )
+    indexer = ndi.convolve(image, kernel, output=np.int16, mode="constant")
+    image = table[indexer]
+    return image
diff --git a/python/cucim/src/cucim/skimage/morphology/binary.py b/python/cucim/src/cucim/skimage/morphology/binary.py
index 56c5b6654..1df98fe00 100644
--- a/python/cucim/src/cucim/skimage/morphology/binary.py
+++ b/python/cucim/src/cucim/skimage/morphology/binary.py
@@ -4,7 +4,7 @@
 import functools
 
 import cupy as cp
-from cupyx.scipy import ndimage as ndi
+import cucim.skimage._vendored.ndimage as ndi
 
 from .._shared.utils import deprecate_kwarg
 from .footprints import _footprint_is_sequence
@@ -37,8 +37,9 @@ def _iterate_binary_func(binary_func, image, footprint, out):
 # default with the same dimension as the input image and size 3 along each
 # axis.
 @default_footprint
-@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'}, removed_version="1.0",
-                 deprecated_version="0.19")
+@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'},
+                 removed_version="23.02.00",
+                 deprecated_version="22.02.00")
 def binary_erosion(image, footprint=None, out=None):
     """Return fast binary morphological erosion of an image.
 
@@ -94,8 +95,9 @@ def binary_erosion(image, footprint=None, out=None):
 
 
 @default_footprint
-@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'}, removed_version="1.0",
-                 deprecated_version="0.19")
+@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'},
+                 removed_version="23.02.00",
+                 deprecated_version="22.02.00")
 def binary_dilation(image, footprint=None, out=None):
     """Return fast binary morphological dilation of an image.
 
@@ -149,8 +151,9 @@ def binary_dilation(image, footprint=None, out=None):
 
 
 @default_footprint
-@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'}, removed_version="1.0",
-                 deprecated_version="0.19")
+@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'},
+                 removed_version="23.02.00",
+                 deprecated_version="22.02.00")
 def binary_opening(image, footprint=None, out=None):
     """Return fast binary morphological opening of an image.
 
@@ -199,8 +202,9 @@ def binary_opening(image, footprint=None, out=None):
 
 
 @default_footprint
-@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'}, removed_version="1.0",
-                 deprecated_version="0.19")
+@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'},
+                 removed_version="23.02.00",
+                 deprecated_version="22.02.00")
 def binary_closing(image, footprint=None, out=None):
     """Return fast binary morphological closing of an image.
 
diff --git a/python/cucim/src/cucim/skimage/morphology/footprints.py b/python/cucim/src/cucim/skimage/morphology/footprints.py
index 744f11535..70e8fda7c 100644
--- a/python/cucim/src/cucim/skimage/morphology/footprints.py
+++ b/python/cucim/src/cucim/skimage/morphology/footprints.py
@@ -175,8 +175,8 @@ def _decompose_size(size, kernel_size=3):
 
 
 @deprecate_kwarg({'height': 'ncols', 'width': 'nrows'},
-                 deprecated_version='0.18.0',
-                 removed_version='0.20.0')
+                 deprecated_version='21.06.00',
+                 removed_version='22.02.00')
 def rectangle(nrows, ncols, dtype=cp.uint8, *, decomposition=None):
     """Generates a flat, rectangular-shaped footprint.
 
diff --git a/python/cucim/src/cucim/skimage/morphology/gray.py b/python/cucim/src/cucim/skimage/morphology/gray.py
index ca6fa2845..ce5daf759 100644
--- a/python/cucim/src/cucim/skimage/morphology/gray.py
+++ b/python/cucim/src/cucim/skimage/morphology/gray.py
@@ -4,7 +4,7 @@
 import functools
 
 import cupy as cp
-from cupyx.scipy import ndimage as ndi
+import cucim.skimage._vendored.ndimage as ndi
 
 from .._shared.utils import deprecate_kwarg
 from ..util import crop
@@ -157,8 +157,8 @@ def func_out(image, footprint, out=None, *args, **kwargs):
 
 
 @default_footprint
-@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'}, removed_version="1.0",
-                 deprecated_version="0.19")
+@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'},
+                 removed_version="23.02.00", deprecated_version="22.02.00")
 def erosion(image, footprint=None, out=None, shift_x=False, shift_y=False):
     """Return grayscale morphological erosion of an image.
 
@@ -236,8 +236,8 @@ def erosion(image, footprint=None, out=None, shift_x=False, shift_y=False):
 
 
 @default_footprint
-@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'}, removed_version="1.0",
-                 deprecated_version="0.19")
+@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'},
+                 removed_version="23.02.00", deprecated_version="22.02.00")
 def dilation(image, footprint=None, out=None, shift_x=False, shift_y=False):
     """Return grayscale morphological dilation of an image.
 
@@ -324,8 +324,8 @@ def dilation(image, footprint=None, out=None, shift_x=False, shift_y=False):
     return out
 
 
-@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'}, removed_version="1.0",
-                 deprecated_version="0.19")
+@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'},
+                 removed_version="23.02.00", deprecated_version="22.02.00")
 @default_footprint
 @pad_for_eccentric_footprints
 def opening(image, footprint=None, out=None):
@@ -390,8 +390,8 @@ def opening(image, footprint=None, out=None):
     return out
 
 
-@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'}, removed_version="1.0",
-                 deprecated_version="0.19")
+@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'},
+                 removed_version="23.02.00", deprecated_version="22.02.00")
 @default_footprint
 @pad_for_eccentric_footprints
 def closing(image, footprint=None, out=None):
@@ -474,8 +474,8 @@ def _white_tophat_seqence(image, footprints, out):
 
 
 @default_footprint
-@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'}, removed_version="1.0",
-                 deprecated_version="0.19")
+@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'},
+                 removed_version="23.02.00", deprecated_version="22.02.00")
 def white_tophat(image, footprint=None, out=None):
     """Return white top hat of an image.
 
@@ -565,8 +565,8 @@ def white_tophat(image, footprint=None, out=None):
 
 
 @default_footprint
-@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'}, removed_version="1.0",
-                 deprecated_version="0.19")
+@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'},
+                 removed_version="23.02.00", deprecated_version="22.02.00")
 def black_tophat(image, footprint=None, out=None):
     """Return black top hat of an image.
 
diff --git a/python/cucim/src/cucim/skimage/morphology/grayreconstruct.py b/python/cucim/src/cucim/skimage/morphology/grayreconstruct.py
index 8847aaaf0..646503f13 100644
--- a/python/cucim/src/cucim/skimage/morphology/grayreconstruct.py
+++ b/python/cucim/src/cucim/skimage/morphology/grayreconstruct.py
@@ -15,8 +15,8 @@
 from .._shared.utils import deprecate_kwarg
 
 
-@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'}, removed_version="1.0",
-                 deprecated_version="0.19")
+@deprecate_kwarg(kwarg_mapping={'selem': 'footprint'},
+                 removed_version="23.02.00", deprecated_version="22.02.00")
 def reconstruction(seed, mask, method='dilation', footprint=None, offset=None):
     """Perform a morphological reconstruction of an image.
 
diff --git a/python/cucim/src/cucim/skimage/morphology/misc.py b/python/cucim/src/cucim/skimage/morphology/misc.py
index 62ff9bfd0..bf3215a3f 100644
--- a/python/cucim/src/cucim/skimage/morphology/misc.py
+++ b/python/cucim/src/cucim/skimage/morphology/misc.py
@@ -2,7 +2,7 @@
 import functools
 
 import cupy as cp
-from cupyx.scipy import ndimage as ndi
+import cucim.skimage._vendored.ndimage as ndi
 
 from .._shared.utils import remove_arg, warn
 
@@ -73,7 +73,7 @@ def remove_small_objects(ar, min_size=64, connectivity=1, in_place=False,
         labelling if `ar` is bool.
     in_place : bool, optional (default: False)
         If ``True``, remove the objects in the input array itself.
-        Otherwise, make a copy. Deprecated since version 0.19. Please
+        Otherwise, make a copy. Deprecated since version 22.02.00. Please
         use `out` instead.
     out : ndarray
         Array of the same shape as `ar`, into which the output is
@@ -172,7 +172,7 @@ def remove_small_holes(ar, area_threshold=64, connectivity=1, in_place=False,
         The connectivity defining the neighborhood of a pixel.
     in_place : bool, optional (default: False)
         If `True`, remove the connected components in the input array
-        itself. Otherwise, make a copy. Deprecated since version 0.19.
+        itself. Otherwise, make a copy. Deprecated since version 22.02.00.
         Please use `out` instead.
     out : ndarray
         Array of the same shape as `ar` and bool dtype, into which the
diff --git a/python/cucim/src/cucim/skimage/morphology/tests/test_skeletonize.py b/python/cucim/src/cucim/skimage/morphology/tests/test_skeletonize.py
index 532271c84..2fbafafc1 100644
--- a/python/cucim/src/cucim/skimage/morphology/tests/test_skeletonize.py
+++ b/python/cucim/src/cucim/skimage/morphology/tests/test_skeletonize.py
@@ -5,20 +5,25 @@
 from skimage.morphology import thin as thin_cpu
 
 from cucim.skimage._shared._warnings import expected_warnings
-from cucim.skimage.morphology import thin
+from cucim.skimage.morphology import medial_axis, thin
 
 
-class TestThin():
+class TestThin:
     @property
     def input_image(self):
         """image to test thinning with"""
-        ii = cp.array([[0, 0, 0, 0, 0, 0, 0],
-                       [0, 1, 1, 1, 1, 1, 0],
-                       [0, 1, 0, 1, 1, 1, 0],
-                       [0, 1, 1, 1, 1, 1, 0],
-                       [0, 1, 1, 1, 1, 1, 0],
-                       [0, 1, 1, 1, 1, 1, 0],
-                       [0, 0, 0, 0, 0, 0, 0]], dtype=cp.uint8)
+        ii = cp.array(
+            [
+                [0, 0, 0, 0, 0, 0, 0],
+                [0, 1, 1, 1, 1, 1, 0],
+                [0, 1, 0, 1, 1, 1, 0],
+                [0, 1, 1, 1, 1, 1, 0],
+                [0, 1, 1, 1, 1, 1, 0],
+                [0, 1, 1, 1, 1, 1, 0],
+                [0, 0, 0, 0, 0, 0, 0],
+            ],
+            dtype=cp.uint8,
+        )
         return ii
 
     def test_zeros(self):
@@ -26,13 +31,18 @@ def test_zeros(self):
 
     def test_iter_1(self):
         result = thin(self.input_image, 1).astype(cp.uint8)
-        expected = cp.array([[0, 0, 0, 0, 0, 0, 0],
-                             [0, 0, 1, 0, 0, 0, 0],
-                             [0, 1, 0, 1, 1, 0, 0],
-                             [0, 0, 1, 1, 1, 0, 0],
-                             [0, 0, 1, 1, 1, 0, 0],
-                             [0, 0, 0, 0, 0, 0, 0],
-                             [0, 0, 0, 0, 0, 0, 0]], dtype=cp.uint8)
+        expected = cp.array(
+            [
+                [0, 0, 0, 0, 0, 0, 0],
+                [0, 0, 1, 0, 0, 0, 0],
+                [0, 1, 0, 1, 1, 0, 0],
+                [0, 0, 1, 1, 1, 0, 0],
+                [0, 0, 1, 1, 1, 0, 0],
+                [0, 0, 0, 0, 0, 0, 0],
+                [0, 0, 0, 0, 0, 0, 0],
+            ],
+            dtype=cp.uint8,
+        )
         assert_array_equal(result, expected)
 
     def test_max_iter_kwarg_deprecation(self):
@@ -43,13 +53,18 @@ def test_max_iter_kwarg_deprecation(self):
 
     def test_noiter(self):
         result = thin(self.input_image).astype(cp.uint8)
-        expected = cp.array([[0, 0, 0, 0, 0, 0, 0],
-                             [0, 0, 1, 0, 0, 0, 0],
-                             [0, 1, 0, 1, 0, 0, 0],
-                             [0, 0, 1, 0, 0, 0, 0],
-                             [0, 0, 0, 0, 0, 0, 0],
-                             [0, 0, 0, 0, 0, 0, 0],
-                             [0, 0, 0, 0, 0, 0, 0]], dtype=cp.uint8)
+        expected = cp.array(
+            [
+                [0, 0, 0, 0, 0, 0, 0],
+                [0, 0, 1, 0, 0, 0, 0],
+                [0, 1, 0, 1, 0, 0, 0],
+                [0, 0, 1, 0, 0, 0, 0],
+                [0, 0, 0, 0, 0, 0, 0],
+                [0, 0, 0, 0, 0, 0, 0],
+                [0, 0, 0, 0, 0, 0, 0],
+            ],
+            dtype=cp.uint8,
+        )
         assert_array_equal(result, expected)
 
     def test_baddim(self):
@@ -57,7 +72,7 @@ def test_baddim(self):
             with pytest.raises(ValueError):
                 thin(ii)
 
-    @pytest.mark.parametrize('invert', [False, True])
+    @pytest.mark.parametrize("invert", [False, True])
     def test_compare_skimage(self, invert):
         h = data.horse()
         if invert:
@@ -65,3 +80,84 @@ def test_compare_skimage(self, invert):
         result = thin(cp.asarray(h))
         expected = thin_cpu(h)
         assert_array_equal(result, expected)
+
+
+class TestMedialAxis:
+    def test_00_00_zeros(self):
+        """Test medial_axis on an array of all zeros"""
+        result = medial_axis(cp.zeros((10, 10), bool))
+        assert not cp.any(result)
+
+    def test_00_01_zeros_masked(self):
+        """Test medial_axis on an array that is completely masked"""
+        result = medial_axis(cp.zeros((10, 10), bool), cp.zeros((10, 10), bool))
+        assert not cp.any(result)
+
+    def test_vertical_line(self):
+        """Test a thick vertical line, issue #3861"""
+        img = cp.zeros((9, 9))
+        img[:, 2] = 1
+        img[:, 3] = 1
+        img[:, 4] = 1
+
+        expected = cp.full(img.shape, False)
+        expected[:, 3] = True
+
+        result = medial_axis(img)
+        assert_array_equal(result, expected)
+
+    def test_01_01_rectangle(self):
+        """Test medial_axis on a rectangle"""
+        image = cp.zeros((9, 15), bool)
+        image[1:-1, 1:-1] = True
+        #
+        # The result should be four diagonals from the
+        # corners, meeting in a horizontal line
+        #
+        expected = cp.array(
+            [
+                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
+                [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
+                [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
+                [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
+                [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
+                [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
+                [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
+                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+            ],
+            dtype=bool,
+        )
+        result = medial_axis(image)
+        assert cp.all(result == expected)
+        result, distance = medial_axis(image, return_distance=True)
+        assert distance.max() == 4
+
+    def test_01_02_hole(self):
+        """Test medial_axis on a rectangle with a hole in the middle"""
+        image = cp.zeros((9, 15), bool)
+        image[1:-1, 1:-1] = True
+        image[4, 4:-4] = False
+        expected = cp.array(
+            [
+                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
+                [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
+                [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
+                [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
+                [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
+                [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
+                [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
+                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+            ],
+            dtype=bool,
+        )
+        result = medial_axis(image)
+        assert cp.all(result == expected)
+
+    def test_narrow_image(self):
+        """Test medial_axis on a 1-pixel thin strip"""
+        image = cp.zeros((1, 5), bool)
+        image[:, 1:-1] = True
+        result = medial_axis(image)
+        assert cp.all(result == image)
diff --git a/python/cucim/src/cucim/skimage/registration/_optical_flow.py b/python/cucim/src/cucim/skimage/registration/_optical_flow.py
index 93ce39a5e..efbf3d7c3 100644
--- a/python/cucim/src/cucim/skimage/registration/_optical_flow.py
+++ b/python/cucim/src/cucim/skimage/registration/_optical_flow.py
@@ -9,6 +9,7 @@
 import cupy as cp
 from cupyx.scipy import ndimage as ndi
 
+from .._shared._gradient import gradient
 from .._shared.utils import _supported_float_type
 from ..transform import warp
 from ._optical_flow_utils import coarse_to_fine, get_warp_points
@@ -79,7 +80,8 @@ def _tvl1(reference_image, moving_image, flow0, attachment, tightness,
 
         image1_warp = warp(moving_image, get_warp_points(grid, flow_current),
                            mode='edge')
-        grad = cp.stack(cp.gradient(image1_warp))
+        # output_as_array=True stacks the gradients along the first axis
+        grad = gradient(image1_warp, output_as_array=True)
         NI = (grad * grad).sum(0)
         NI[NI == 0] = 1
 
@@ -288,7 +290,8 @@ def _ilk(reference_image, moving_image, flow0, radius, num_warp, gaussian,
 
         moving_image_warp = warp(moving_image, get_warp_points(grid, flow),
                                  mode='edge')
-        grad = cp.stack(cp.gradient(moving_image_warp), axis=0)
+        # output_as_array=True stacks the gradients along the first axis
+        grad = gradient(moving_image_warp, output_as_array=True)
         error_image = ((grad * flow).sum(axis=0)
                        + reference_image - moving_image_warp)
 
diff --git a/python/cucim/src/cucim/skimage/registration/tests/test_masked_phase_cross_correlation.py b/python/cucim/src/cucim/skimage/registration/tests/test_masked_phase_cross_correlation.py
index 0bafa2c2f..404f64132 100644
--- a/python/cucim/src/cucim/skimage/registration/tests/test_masked_phase_cross_correlation.py
+++ b/python/cucim/src/cucim/skimage/registration/tests/test_masked_phase_cross_correlation.py
@@ -4,7 +4,7 @@
 from cupyx.scipy.ndimage import fourier_shift
 from cupyx.scipy.ndimage import shift as real_shift
 from numpy.testing import assert_almost_equal
-from skimage.data import brain, camera
+from skimage.data import camera
 from skimage.io import imread
 
 from cucim.skimage._shared.fft import fftmodule as fft
@@ -67,6 +67,7 @@ def test_masked_registration_random_masks():
 def test_masked_registration_3d_contiguous_mask():
     """masked_register_translation should be able to register translations
     between volumes with contiguous masks."""
+    brain = pytest.importorskip('skimage.data.brain')
     ref_vol = cp.array(brain()[:, ::2, ::2])
 
     offset = (1, -5, 10)
diff --git a/python/cucim/src/cucim/skimage/restoration/_denoise.py b/python/cucim/src/cucim/skimage/restoration/_denoise.py
index 614c5e7b3..392fc4ad2 100644
--- a/python/cucim/src/cucim/skimage/restoration/_denoise.py
+++ b/python/cucim/src/cucim/skimage/restoration/_denoise.py
@@ -91,8 +91,9 @@ def _denoise_tv_chambolle_nd(image, weight=0.1, eps=2.0e-4, max_num_iter=200):
     return out
 
 
-@utils.deprecate_kwarg({'n_iter_max': 'max_num_iter'}, removed_version="1.0",
-                       deprecated_version="0.19.2")
+@utils.deprecate_kwarg({'n_iter_max': 'max_num_iter'},
+                       removed_version="23.02.00",
+                       deprecated_version="22.06.00")
 @utils.deprecate_multichannel_kwarg(multichannel_position=4)
 def denoise_tv_chambolle(image, weight=0.1, eps=2.0e-4, max_num_iter=200,
                          multichannel=False, *, channel_axis=None):
diff --git a/python/cucim/src/cucim/skimage/restoration/deconvolution.py b/python/cucim/src/cucim/skimage/restoration/deconvolution.py
index 9809637eb..2a1cfa67c 100644
--- a/python/cucim/src/cucim/skimage/restoration/deconvolution.py
+++ b/python/cucim/src/cucim/skimage/restoration/deconvolution.py
@@ -383,8 +383,8 @@ def unsupervised_wiener(image, psf, reg=None, user_params=None, is_real=True,
     return (x_postmean, {'noise': gn_chain, 'prior': gx_chain})
 
 
-@deprecate_kwarg({'iterations': 'num_iter'}, removed_version="1.0",
-                 deprecated_version="0.19")
+@deprecate_kwarg({'iterations': 'num_iter'}, removed_version="23.02.00",
+                 deprecated_version="22.02.00")
 def richardson_lucy(image, psf, num_iter=50, clip=True, filter_epsilon=None):
     """Richardson-Lucy deconvolution.
 
diff --git a/python/cucim/src/cucim/skimage/restoration/j_invariant.py b/python/cucim/src/cucim/skimage/restoration/j_invariant.py
index 19f86a444..2add11e72 100644
--- a/python/cucim/src/cucim/skimage/restoration/j_invariant.py
+++ b/python/cucim/src/cucim/skimage/restoration/j_invariant.py
@@ -3,7 +3,7 @@
 
 import cupy as cp
 import numpy as np
-from cupyx.scipy import ndimage as ndi
+import cucim.skimage._vendored.ndimage as ndi
 
 from .._shared.utils import _supported_float_type
 from ..metrics import mean_squared_error
diff --git a/python/cucim/src/cucim/skimage/restoration/tests/test_j_invariant.py b/python/cucim/src/cucim/skimage/restoration/tests/test_j_invariant.py
index b6a697789..58f3bc0a1 100644
--- a/python/cucim/src/cucim/skimage/restoration/tests/test_j_invariant.py
+++ b/python/cucim/src/cucim/skimage/restoration/tests/test_j_invariant.py
@@ -25,7 +25,7 @@
 # TODO: replace with CuPy version once completed
 def _denoise_wavelet(image, rescale_sigma=True, **kwargs):
 
-    if 'channel_axis' in kwargs and skimage.__version__ < '0.19':
+    if 'channel_axis' in kwargs and skimage.__version__ < '0.19':
         # convert channel_axis to older multichannel kwarg for skimage 0.18.x
         channel_axis = kwargs.pop('channel_axis')
         kwargs['multichannel'] = False if channel_axis is None else True
@@ -62,7 +62,7 @@ def test_invariant_denoise_color(dtype):
 
 def test_invariant_denoise_color_deprecated():
 
-    # can remove \A\Z here if only testing with scikit-image 0.19
+    # can remove \A\Z here if only testing with scikit-image >= 0.19
     with expected_warnings([r"`multichannel` is a deprecated argument|\A\Z"]):
         denoised_img_color = _invariant_denoise(
             noisy_img_color, _denoise_wavelet,
diff --git a/python/cucim/src/cucim/skimage/segmentation/__init__.py b/python/cucim/src/cucim/skimage/segmentation/__init__.py
index b5527b8a1..55b65058f 100644
--- a/python/cucim/src/cucim/skimage/segmentation/__init__.py
+++ b/python/cucim/src/cucim/skimage/segmentation/__init__.py
@@ -1,4 +1,6 @@
+from ._chan_vese import chan_vese
 from ._clear_border import clear_border
+from ._expand_labels import expand_labels
 from ._join import join_segmentations, relabel_sequential
 from .boundaries import find_boundaries, mark_boundaries
 from .morphsnakes import (checkerboard_level_set, disk_level_set,
@@ -7,12 +9,14 @@
 from .random_walker_segmentation import random_walker
 
 __all__ = [
+    "expand_labels",
     "random_walker",
     "find_boundaries",
     "mark_boundaries",
     "clear_border",
     "join_segmentations",
     "relabel_sequential",
+    "chan_vese",
     "morphological_geodesic_active_contour",
     "morphological_chan_vese",
     "inverse_gaussian_gradient",
diff --git a/python/cucim/src/cucim/skimage/segmentation/_chan_vese.py b/python/cucim/src/cucim/skimage/segmentation/_chan_vese.py
new file mode 100644
index 000000000..0401c1f07
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/segmentation/_chan_vese.py
@@ -0,0 +1,442 @@
+import cupy as cp
+import numpy as np
+from cupyx import rsqrt  # reciprocal sqrt
+
+from cucim.core.operations.morphology import distance_transform_edt
+
+from .._shared.utils import _supported_float_type, deprecate_kwarg
+
+
+@cp.fuse()
+def _fused_curvature(phi, x_start, x_end, y_start, y_end, ul, ur, ll, lr):
+    fy = (y_end - y_start) / 2.0
+    fx = (x_end - x_start) / 2.0
+    fyy = y_end + y_start - 2 * phi
+    fxx = x_end + x_start - 2 * phi
+    fxy = .25 * (lr + ul - ur - ll)
+    grad2 = fx**2 + fy**2
+    K = (fxx * fy**2 - 2 * fxy * fx * fy + fyy * fx**2)
+    K /= (grad2 * cp.sqrt(grad2) + 1e-8)
+    return K
+
+
+def _cv_curvature(phi):
+    """Returns the 'curvature' of a level set 'phi'.
+    """
+    P = cp.pad(phi, 1, mode='edge')
+    y_start = P[:-2, 1:-1]
+    y_end = P[2:, 1:-1]
+    x_start = P[1:-1, :-2]
+    x_end = P[1:-1, 2:]
+
+    lower_right = P[2:, 2:]
+    lower_left = P[2:, :-2]
+    upper_right = P[:-2, 2:]
+    upper_left = P[:-2, :-2]
+    K = _fused_curvature(phi, x_start, x_end, y_start, y_end, upper_left,
+                         upper_right, lower_left, lower_right)
+    return K
+
+
+@cp.fuse()
+def _fused_variance_kernel1(eta, x_start, x_mid, x_end, y_start, y_mid, y_end):
+    phixp = x_end - x_mid
+    phixn = x_mid - x_start
+    phix0 = x_end - x_start
+    phix0 /= 2.0
+    phixp *= phixp
+    phixn *= phixn
+    phix0 *= phix0
+
+    phiyp = y_end - y_mid
+    phiyn = y_mid - y_start
+    phiy0 = y_end - y_start
+    phiy0 /= 2.0
+    phiyp *= phiyp
+    phiyn *= phiyn
+    phiy0 *= phiy0
+
+    C1 = rsqrt(eta + phixp + phiy0)
+    C2 = rsqrt(eta + phixn + phiy0)
+    C3 = rsqrt(eta + phix0 + phiyp)
+    C4 = rsqrt(eta + phix0 + phiyn)
+
+    K = x_end * C1
+    K += x_start * C2
+    K += y_end * C3
+    K += y_start * C4
+
+    Csum = C1
+    Csum += C2
+    Csum += C3
+    Csum += C4
+
+    return K, Csum
+
+
+@cp.fuse()
+def _fused_hphi_hinv(phi):
+    Hphi = (phi > 0).astype(phi.dtype)
+    Hinv = 1.0 - Hphi
+    return Hphi, Hinv
+
+
+@cp.fuse()
+def _fused_variance_kernel2(
+    image, c1, c2, lam1, lam2, phi, K, dt, mu, delta_phi, Csum
+):
+    difference_term = image - c1
+    difference_term *= difference_term
+    difference_term *= -lam1
+
+    term2 = image - c2
+    term2 *= term2
+    term2 *= lam2
+    difference_term += term2
+
+    new_phi = phi + (dt * delta_phi) * (mu * K + difference_term)
+    out = new_phi / (1 + mu * dt * delta_phi * Csum)
+    return out
+
+
+def _cv_calculate_variation(image, phi, mu, lambda1, lambda2, dt):
+    """Returns the variation of level set 'phi' based on algorithm parameters.
+    """
+    eta = 1e-16
+    P = cp.pad(phi, 1, mode='edge')
+
+    x_end = P[1:-1, 2:]
+    x_mid = P[1:-1, 1:-1]
+    x_start = P[1:-1, :-2]
+
+    y_end = P[2:, 1:-1]
+    y_mid = P[1:-1, 1:-1]
+    y_start = P[:-2, 1:-1]
+
+    K, Csum = _fused_variance_kernel1(
+        eta, x_start, x_mid, x_end, y_start, y_mid, y_end
+    )
+    Hphi, Hinv = _fused_hphi_hinv(phi)
+    c1, c2 = _cv_calculate_averages(image, Hphi, Hinv)
+    delta_phi = _cv_delta(phi)
+    out = _fused_variance_kernel2(
+        image, c1, c2, lambda1, lambda2, phi, K, dt, mu, delta_phi, Csum
+    )
+    return out
+
+
+@cp.fuse()
+def _cv_heavyside(x, eps=1.):
+    """Return the regularised Heaviside step of ``x``, elementwise:
+    ``0.5 * (1 + (2 / pi) * arctan(x / eps))``.
+    """
+    return 0.5 * (1. + (2. / cp.pi) * cp.arctan(x / eps))
+
+
+@cp.fuse()
+def _cv_delta(x, eps=1.):
+    """Return the regularised Dirac delta of ``x``, elementwise:
+    ``eps / (eps**2 + x**2)``.
+    """
+    return eps / (eps * eps + x * x)
+
+
+@cp.fuse()
+def _fused_inplace_eps_div(num, denom, eps):
+    denom += eps
+    num /= denom
+    return
+
+
+def _cv_calculate_averages(image, H, Hinv):
+    """Return the ``H``- and ``Hinv``-weighted mean intensities of ``image``.
+    """
+    eps = 10 * cp.finfo(image.dtype).eps  # added to each denominator
+    inside_total = cp.sum(image * H)
+    inside_weight = cp.sum(H)
+    outside_total = cp.sum(image * Hinv)
+    outside_weight = cp.sum(Hinv)
+    # Each total is divided in place by (its weight + eps).
+    _fused_inplace_eps_div(inside_total, inside_weight, eps)
+    _fused_inplace_eps_div(outside_total, outside_weight, eps)
+    return (inside_total, outside_total)
+
+
+@cp.fuse()
+def _fused_difference_op1(image, c, h, lam):
+    out = image - c
+    out *= out
+    out *= h
+    out *= lam
+    return out
+
+
+def _cv_difference_from_average_term(image, Hphi, lambda_pos, lambda_neg):
+    """Returns the 'energy' contribution due to the difference from
+    the average value within a region at each point.
+    """
+    Hinv = 1. - Hphi
+    (c1, c2) = _cv_calculate_averages(image, Hphi, Hinv)
+    out = _fused_difference_op1(image, c1, Hphi, lambda_pos)
+    out += _fused_difference_op1(image, c2, Hinv, lambda_neg)
+    return out
+
+
+def _cv_edge_length_term(phi, mu):
+    """Returns the 'energy' contribution due to the length of the
+    edge between regions at each point, multiplied by a factor 'mu'.
+    """
+    e = _cv_curvature(phi)
+    e *= mu
+    return e
+
+
+def _cv_energy(image, phi, mu, lambda1, lambda2):
+    """Returns the total 'energy' of the current level set function.
+    """
+    H = _cv_heavyside(phi)
+    avgenergy = _cv_difference_from_average_term(image, H, lambda1, lambda2)
+    lenenergy = _cv_edge_length_term(phi, mu)
+    return cp.sum(avgenergy) + cp.sum(lenenergy)
+
+
+def _cv_checkerboard(image_size, square_size, dtype=cp.float64):
+    """Generates a checkerboard level set function.
+
+    According to Pascal Getreuer, such a level set function has fast
+    convergence.
+    """
+    yv = cp.arange(image_size[0], dtype=dtype)[:, np.newaxis]
+    xv = cp.arange(image_size[1], dtype=dtype)[np.newaxis, :]
+    sf = cp.pi / square_size
+    xv *= sf
+    yv *= sf
+    cp.sin(xv, out=xv)
+    cp.sin(yv, out=yv)
+    return xv * yv
+
+
+def _cv_large_disk(image_size):
+    """Generates a disk level set function.
+
+    The disk covers the whole image along its smallest dimension.
+    """
+    res = cp.ones(image_size, dtype=bool)
+    centerY = int((image_size[0] - 1) / 2)
+    centerX = int((image_size[1] - 1) / 2)
+    res[centerY, centerX] = 0.
+    radius = float(min(centerX, centerY))
+    out = radius - distance_transform_edt(res)
+    out /= radius
+    return out
+
+
+def _cv_small_disk(image_size):
+    """Generates a disk level set function.
+
+    The disk covers half of the image along its smallest dimension.
+    """
+    res = cp.ones(image_size, dtype=bool)
+    centerY = int((image_size[0] - 1) / 2)
+    centerX = int((image_size[1] - 1) / 2)
+    res[centerY, centerX] = 0.
+    radius = float(min(centerX, centerY)) / 2.0
+    out = radius - distance_transform_edt(res)
+    out /= radius * 3
+    return out
+
+
+def _cv_init_level_set(init_level_set, image_shape, dtype=cp.float64):
+    """Return the initial level set cast to ``dtype``: a str names a preset
+    built for ``image_shape``; anything else is used as the array itself.
+    """
+    if not isinstance(init_level_set, str):
+        res = init_level_set
+    elif init_level_set == 'checkerboard':
+        res = _cv_checkerboard(image_shape, 5, dtype)
+    elif init_level_set == 'disk':
+        res = _cv_large_disk(image_shape)
+    elif init_level_set == 'small disk':
+        res = _cv_small_disk(image_shape)
+    else:
+        raise ValueError("Incorrect name for starting level set preset.")
+    return res.astype(dtype, copy=False)
+
+
+@deprecate_kwarg({'max_iter': 'max_num_iter'}, removed_version="1.0",
+                 deprecated_version="0.19")
+def chan_vese(image, mu=0.25, lambda1=1.0, lambda2=1.0, tol=1e-3,
+              max_num_iter=500, dt=0.5, init_level_set='checkerboard',
+              extended_output=False):
+    """Chan-Vese segmentation algorithm.
+
+    Active contour model by evolving a level set. Can be used to
+    segment objects without clearly defined boundaries.
+
+    Parameters
+    ----------
+    image : (M, N) ndarray
+        Grayscale image to be segmented.
+    mu : float, optional
+        'edge length' weight parameter. Higher `mu` values will
+        produce a 'round' edge, while values closer to zero will
+        detect smaller objects.
+    lambda1 : float, optional
+        'difference from average' weight parameter for the output
+        region with value 'True'. If it is lower than `lambda2`, this
+        region will have a larger range of values than the other.
+    lambda2 : float, optional
+        'difference from average' weight parameter for the output
+        region with value 'False'. If it is lower than `lambda1`, this
+        region will have a larger range of values than the other.
+    tol : float, positive, optional
+        Level set variation tolerance between iterations. If the
+        L2 norm difference between the level sets of successive
+        iterations normalized by the area of the image is below this
+        value, the algorithm will assume that the solution was
+        reached.
+    max_num_iter : uint, optional
+        Maximum number of iterations allowed before the algorithm
+        interrupts itself.
+    dt : float, optional
+        A multiplication factor applied at calculations for each step,
+        serves to accelerate the algorithm. While higher values may
+        speed up the algorithm, they may also lead to convergence
+        problems.
+    init_level_set : str or (M, N) ndarray, optional
+        Defines the starting level set used by the algorithm.
+        If a string is inputted, a level set that matches the image
+        size will automatically be generated. Alternatively, it is
+        possible to define a custom level set, which should be an
+        array of float values, with the same shape as 'image'.
+        Accepted string values are as follows.
+
+        'checkerboard'
+            the starting level set is defined as
+            sin(x/5*pi)*sin(y/5*pi), where x and y are pixel
+            coordinates. This level set has fast convergence, but may
+            fail to detect implicit edges.
+        'disk'
+            the starting level set is defined as the opposite
+            of the distance from the center of the image minus half of
+            the minimum value between image width and image height.
+            This is somewhat slower, but is more likely to properly
+            detect implicit edges.
+        'small disk'
+            the starting level set is defined as the
+            opposite of the distance from the center of the image
+            minus a quarter of the minimum value between image width
+            and image height.
+    extended_output : bool, optional
+        If set to True, the return value will be a tuple containing
+        the three return values (see below). If set to False which
+        is the default value, only the 'segmentation' array will be
+        returned.
+
+    Returns
+    -------
+    segmentation : (M, N) ndarray, bool
+        Segmentation produced by the algorithm.
+    phi : (M, N) ndarray of floats
+        Final level set computed by the algorithm.
+    energies : list of floats
+        Shows the evolution of the 'energy' for each step of the
+        algorithm. This should allow to check whether the algorithm
+        converged.
+
+    Notes
+    -----
+    The Chan-Vese Algorithm is designed to segment objects without
+    clearly defined boundaries. This algorithm is based on level sets
+    that are evolved iteratively to minimize an energy, which is
+    defined by weighted values corresponding to the sum of differences
+    intensity from the average value outside the segmented region, the
+    sum of differences from the average value inside the segmented
+    region, and a term which is dependent on the length of the
+    boundary of the segmented region.
+
+    This algorithm was first proposed by Tony Chan and Luminita Vese,
+    in a publication entitled "An Active Contour Model Without Edges"
+    [1]_.
+
+    This implementation of the algorithm is somewhat simplified in the
+    sense that the area factor 'nu' described in the original paper is
+    not implemented, and is only suitable for grayscale images.
+
+    Typical values for `lambda1` and `lambda2` are 1. If the
+    'background' is very different from the segmented object in terms
+    of distribution (for example, a uniform black image with figures
+    of varying intensity), then these values should be different from
+    each other.
+
+    Typical values for mu are between 0 and 1, though higher values
+    can be used when dealing with shapes with very ill-defined
+    contours.
+
+    The 'energy' which this algorithm tries to minimize is defined
+    as the sum of the differences from the average within the region
+    squared and weighed by the 'lambda' factors to which is added the
+    length of the contour multiplied by the 'mu' factor.
+
+    Supports 2D grayscale images only, and does not implement the area
+    term described in the original article.
+
+    References
+    ----------
+    .. [1] An Active Contour Model without Edges, Tony Chan and
+           Luminita Vese, Scale-Space Theories in Computer Vision,
+           1999, :DOI:`10.1007/3-540-48236-9_13`
+    .. [2] Chan-Vese Segmentation, Pascal Getreuer Image Processing On
+           Line, 2 (2012), pp. 214-224,
+           :DOI:`10.5201/ipol.2012.g-cv`
+    .. [3] The Chan-Vese Algorithm - Project Report, Rami Cohen, 2011
+           :arXiv:`1107.2782`
+    """
+    if len(image.shape) != 2:
+        raise ValueError("Input image should be a 2D array.")
+
+    float_dtype = _supported_float_type(image.dtype)
+    phi = _cv_init_level_set(init_level_set, image.shape, dtype=float_dtype)
+    if type(phi) != cp.ndarray or phi.shape != image.shape:
+        raise ValueError("The dimensions of initial level set do not "
+                         "match the dimensions of image.")
+
+    image = image.astype(float_dtype, copy=False)
+    image = image - cp.min(image)
+    if cp.max(image) != 0:
+        image = image / cp.max(image)
+
+    i = 0
+    if extended_output:
+        old_energy = _cv_energy(image, phi, mu, lambda1, lambda2)
+        energies = []
+    phivar = tol + 1
+
+    while phivar > tol and i < max_num_iter:
+        # Save old level set values
+        oldphi = phi
+
+        # Calculate new level set
+        phi = _cv_calculate_variation(image, phi, mu, lambda1, lambda2, dt)
+        phivar = phi - oldphi
+        phivar *= phivar
+        phivar = cp.sqrt(phivar.mean())
+
+        if extended_output:
+            # Extract energy
+            new_energy = _cv_energy(image, phi, mu, lambda1, lambda2)
+
+            # Could compare energy to the previous level set to see if
+            # continuing is necessary
+
+            # Save old energy values
+            energies.append(old_energy)
+            old_energy = new_energy
+        i += 1
+
+    segmentation = phi > 0
+
+    if extended_output:
+        return (segmentation, phi, energies)
+    else:
+        return segmentation
diff --git a/python/cucim/src/cucim/skimage/segmentation/_expand_labels.py b/python/cucim/src/cucim/skimage/segmentation/_expand_labels.py
new file mode 100644
index 000000000..8ec47a7be
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/segmentation/_expand_labels.py
@@ -0,0 +1,96 @@
+import cupy as cp
+
+from cucim.core.operations.morphology import distance_transform_edt
+
+
+def expand_labels(label_image, distance=1):
+    """Expand labels in label image by ``distance`` pixels without overlapping.
+
+    Given a label image, ``expand_labels`` grows label regions (connected components)
+    outwards by up to ``distance`` pixels without overflowing into neighboring regions.
+    More specifically, each background pixel that is within Euclidean distance
+    of <= ``distance`` pixels of a connected component is assigned the label of that
+    connected component.
+    Where multiple connected components are within ``distance`` pixels of a background
+    pixel, the label value of the closest connected component will be assigned (see
+    Notes for the case of multiple labels at equal distance).
+
+    Parameters
+    ----------
+    label_image : ndarray of dtype int
+        label image
+    distance : float
+        Euclidean distance in pixels by which to grow the labels. Default is one.
+
+    Returns
+    -------
+    enlarged_labels : ndarray of dtype int
+        Labeled array, where all connected regions have been enlarged
+
+    Notes
+    -----
+    Where labels are spaced more than ``distance`` pixels apart, this is
+    equivalent to a morphological dilation with a disc or hyperball of radius ``distance``.
+    However, in contrast to a morphological dilation, ``expand_labels`` will
+    not expand a label region into a neighboring region.
+
+    This implementation of ``expand_labels`` is derived from CellProfiler [1]_, where
+    it is known as module "IdentifySecondaryObjects (Distance-N)" [2]_.
+
+    There is an important edge case when a pixel has the same distance to
+    multiple regions, as it is not defined which region expands into that
+    space. Here, the exact behavior depends on the implementation
+    of ``cucim.core.operations.morphology.distance_transform_edt``.
+
+    See Also
+    --------
+    :func:`cucim.skimage.measure.label`, :func:`cucim.skimage.morphology.dilation`  # noqa
+
+    References
+    ----------
+    .. [1] https://cellprofiler.org
+    .. [2] https://github.com/CellProfiler/CellProfiler/blob/082930ea95add7b72243a4fa3d39ae5145995e9c/cellprofiler/modules/identifysecondaryobjects.py#L559  # noqa
+
+    Examples
+    --------
+    >>> labels = cp.array([0, 1, 0, 0, 0, 0, 2])
+    >>> expand_labels(labels, distance=1)
+    array([1, 1, 1, 0, 0, 2, 2])
+
+    Labels will not overwrite each other:
+
+    >>> expand_labels(labels, distance=3)
+    array([1, 1, 1, 1, 2, 2, 2])
+
+    In case of ties, behavior is undefined, but currently resolves to the
+    label closest to ``(0,) * ndim`` in lexicographical order.
+
+    >>> labels_tied = cp.array([0, 1, 0, 2, 0])
+    >>> expand_labels(labels_tied, 1)
+    array([1, 1, 1, 2, 2])
+    >>> labels2d = cp.array(
+    ...     [[0, 1, 0, 0],
+    ...      [2, 0, 0, 0],
+    ...      [0, 3, 0, 0]]
+    ... )
+    >>> expand_labels(labels2d, 1)
+    array([[2, 1, 1, 0],
+           [2, 2, 0, 0],
+           [2, 3, 3, 0]])
+    """
+
+    distances, nearest_label_coords = distance_transform_edt(
+        label_image == 0, return_indices=True
+    )
+    labels_out = cp.zeros_like(label_image)
+    dilate_mask = distances <= distance
+    # build the coordinates to find nearest labels,
+    # in contrast to [1] this implementation supports label arrays
+    # of any dimension
+    masked_nearest_label_coords = [
+        dimension_indices[dilate_mask]
+        for dimension_indices in nearest_label_coords
+    ]
+    nearest_labels = label_image[tuple(masked_nearest_label_coords)]
+    labels_out[dilate_mask] = nearest_labels
+    return labels_out
diff --git a/python/cucim/src/cucim/skimage/segmentation/boundaries.py b/python/cucim/src/cucim/skimage/segmentation/boundaries.py
index 62724d9fc..d631c28e9 100644
--- a/python/cucim/src/cucim/skimage/segmentation/boundaries.py
+++ b/python/cucim/src/cucim/skimage/segmentation/boundaries.py
@@ -1,5 +1,5 @@
 import cupy as cp
-from cupyx.scipy import ndimage as ndi
+import cucim.skimage._vendored.ndimage as ndi
 
 from .._shared.utils import _supported_float_type
 from ..color import gray2rgb
diff --git a/python/cucim/src/cucim/skimage/segmentation/morphsnakes.py b/python/cucim/src/cucim/skimage/segmentation/morphsnakes.py
index 9938cb9d1..dd3f60a82 100644
--- a/python/cucim/src/cucim/skimage/segmentation/morphsnakes.py
+++ b/python/cucim/src/cucim/skimage/segmentation/morphsnakes.py
@@ -3,10 +3,12 @@
 
 import cupy as cp
 import numpy as np
-from cupyx.scipy import ndimage as ndi
+from cupyx import rsqrt
+import cucim.skimage._vendored.ndimage as ndi
 
 from cucim import _misc
 
+from .._shared._gradient import gradient
 from .._shared.utils import check_nD, deprecate_kwarg
 
 __all__ = ['morphological_chan_vese',
@@ -29,64 +31,53 @@ def __call__(self, *args, **kwargs):
 
 
 # SI and IS operators for 2D and 3D.
-_P2 = [np.eye(3),
-       np.array([[0, 1, 0]] * 3),
-       np.flipud(np.eye(3)),
-       np.rot90([[0, 1, 0]] * 3)]
-_P3 = [np.zeros((3, 3, 3)) for i in range(9)]
-
-_P3[0][:, :, 1] = 1
-_P3[1][:, 1, :] = 1
-_P3[2][1, :, :] = 1
-_P3[3][:, [0, 1, 2], [0, 1, 2]] = 1
-_P3[4][:, [0, 1, 2], [2, 1, 0]] = 1
-_P3[5][[0, 1, 2], :, [0, 1, 2]] = 1
-_P3[6][[0, 1, 2], :, [2, 1, 0]] = 1
-_P3[7][[0, 1, 2], [0, 1, 2], :] = 1
-_P3[8][[0, 1, 2], [2, 1, 0], :] = 1
-
-
-def sup_inf(u):
+def _get_P2():
+    _P2 = [cp.eye(3),
+           cp.array([[0, 1, 0]] * 3),
+           cp.array(np.flipud(np.eye(3))),
+           cp.array(np.rot90([[0, 1, 0]] * 3))]
+    return _P2
+
+
+def _get_P3():
+    _P3 = [np.zeros((3, 3, 3)) for i in range(9)]
+
+    _P3[0][:, :, 1] = 1
+    _P3[1][:, 1, :] = 1
+    _P3[2][1, :, :] = 1
+    _P3[3][:, [0, 1, 2], [0, 1, 2]] = 1
+    _P3[4][:, [0, 1, 2], [2, 1, 0]] = 1
+    _P3[5][[0, 1, 2], :, [0, 1, 2]] = 1
+    _P3[6][[0, 1, 2], :, [2, 1, 0]] = 1
+    _P3[7][[0, 1, 2], [0, 1, 2], :] = 1
+    _P3[8][[0, 1, 2], [2, 1, 0], :] = 1
+    return [cp.array(p) for p in _P3]
+
+
+def sup_inf(u, footprints, workspace=None):
     """SI operator."""
-
-    if _misc.ndim(u) == 2:
-        P = _P2
-    elif _misc.ndim(u) == 3:
-        P = _P3
+    if workspace is None:
+        erosions = cp.empty(((len(footprints),) + u.shape), dtype=u.dtype)
     else:
-        raise ValueError("u has an invalid number of dimensions "
-                         "(should be 2 or 3)")
+        erosions = workspace
+    for i, footprint in enumerate(footprints):
+        erosions[i, ...] = ndi.binary_erosion(u, footprint)
+    return erosions.max(0)
 
-    erosions = []
-    for P_i in P:
-        e = ndi.binary_erosion(u, cp.asarray(P_i)).astype(np.int8, copy=False)
-        erosions.append(e)
 
-    return cp.stack(erosions, axis=0).max(0)
-
-
-def inf_sup(u):
+def inf_sup(u, footprints, workspace=None):
     """IS operator."""
-
-    if _misc.ndim(u) == 2:
-        P = _P2
-    elif _misc.ndim(u) == 3:
-        P = _P3
+    if workspace is None:
+        dilations = cp.empty(((len(footprints),) + u.shape), dtype=u.dtype)
     else:
-        raise ValueError("u has an invalid number of dimensions "
-                         "(should be 2 or 3)")
-
-    dilations = []
-    for P_i in P:
-        d = ndi.binary_dilation(u, cp.asarray(P_i)).astype(np.int8,
-                                                           copy=False)
-        dilations.append(d)
-
-    return cp.stack(dilations, axis=0).min(0)
+        dilations = workspace
+    for i, footprint in enumerate(footprints):
+        dilations[i, ...] = ndi.binary_dilation(u, footprint)
+    return dilations.min(0)
 
 
-_curvop = _fcycle([lambda u: sup_inf(inf_sup(u)),   # SIoIS
-                   lambda u: inf_sup(sup_inf(u))])  # ISoSI
+_curvop = _fcycle([lambda u, f, w: sup_inf(inf_sup(u, f, w), f, w),   # SIoIS
+                   lambda u, f, w: inf_sup(sup_inf(u, f, w), f, w)])  # ISoSI
 
 
 def _check_input(image, init_level_set):
@@ -187,6 +178,11 @@ def checkerboard_level_set(image_shape, square_size=5):
     return res
 
 
+@cp.fuse()
+def _fused_inverse_kernel(gradnorm, alpha):
+    return rsqrt(1.0 + alpha * gradnorm)
+
+
 def inverse_gaussian_gradient(image, alpha=100.0, sigma=5.0):
     """Inverse of gradient magnitude.
 
@@ -216,11 +212,35 @@ def inverse_gaussian_gradient(image, alpha=100.0, sigma=5.0):
         `morphological_geodesic_active_contour`.
     """
     gradnorm = ndi.gaussian_gradient_magnitude(image, sigma, mode='nearest')
-    return 1.0 / cp.sqrt(1.0 + alpha * gradnorm)
+    return _fused_inverse_kernel(gradnorm, alpha)
+
+
+@cp.fuse()
+def _abs_grad_kernel(gx, gy):
+    return cp.abs(gx) + cp.abs(gy)
 
 
-@deprecate_kwarg({'iterations': 'num_iter'}, removed_version="1.0",
-                 deprecated_version="0.19")
+@cp.fuse()
+def _fused_variance_kernel(
+    image, c1, c2, lam1, lam2, abs_du,
+):
+    difference_term = image - c1
+    difference_term *= difference_term
+    difference_term *= lam1
+    term2 = image - c2
+    term2 *= term2
+    term2 *= lam2
+    difference_term -= term2
+
+    aux = abs_du * difference_term
+    aux_lt0 = aux < 0
+    aux_gt0 = aux > 0
+    return aux_lt0, aux_gt0
+
+
+@deprecate_kwarg({'iterations': 'num_iter'},
+                 removed_version="23.02.00",
+                 deprecated_version="22.02.00")
 def morphological_chan_vese(image, num_iter, init_level_set='checkerboard',
                             smoothing=1, lambda1=1, lambda2=1,
                             iter_callback=lambda x: None):
@@ -298,36 +318,46 @@ def morphological_chan_vese(image, num_iter, init_level_set='checkerboard',
 
     u = (init_level_set > 0).astype(cp.int8)
 
-    iter_callback(u)
+    if _misc.ndim(u) == 2:
+        footprints = _get_P2()
+    elif _misc.ndim(u) == 3:
+        footprints = _get_P3()
+    else:
+        raise ValueError("u has an invalid number of dimensions "
+                         "(should be 2 or 3)")
+    workspace = cp.empty(((len(footprints),) + u.shape), dtype=u.dtype)
 
-    for _ in range(num_iter):
+    iter_callback(u)
+    for i in range(num_iter):
 
         # inside = u > 0
         # outside = u <= 0
-        c0 = (image * (1 - u)).sum() / float((1 - u).sum() + 1e-8)
-        c1 = (image * u).sum() / float(u.sum() + 1e-8)
+        c0 = (image * (1 - u)).sum()
+        c0 = c0 / float((1 - u).sum() + 1e-8)  # new array: int sum -> float
+        c1 = (image * u).sum()
+        c1 = c1 / float(u.sum() + 1e-8)  # new array: int sum -> float
 
         # Image attachment
-        du = cp.gradient(u)
-        abs_du = cp.abs(cp.stack(du, axis=0)).sum(0)
-        aux = abs_du * (
-            lambda1 * (image - c1) ** 2 - lambda2 * (image - c0) ** 2
+        du = gradient(u)
+        abs_du = sum((cp.abs(d) for d in du[2:]), _abs_grad_kernel(*du[:2]))
+        aux_lt0, aux_gt0 = _fused_variance_kernel(
+            image, c1, c0, lambda1, lambda2, abs_du
         )
-
-        u[aux < 0] = 1
-        u[aux > 0] = 0
+        u[aux_lt0] = 1
+        u[aux_gt0] = 0
 
         # Smoothing
         for _ in range(smoothing):
-            u = _curvop(u)
+            u = _curvop(u, footprints, workspace)
 
         iter_callback(u)
 
     return u
 
 
-@deprecate_kwarg({'iterations': 'num_iter'}, removed_version="1.0",
-                 deprecated_version="0.19")
+@deprecate_kwarg({'iterations': 'num_iter'},
+                 removed_version="23.02.00",
+                 deprecated_version="22.02.00")
 def morphological_geodesic_active_contour(gimage, num_iter,
                                           init_level_set='disk', smoothing=1,
                                           threshold='auto', balloon=0,
@@ -418,13 +448,22 @@ def morphological_geodesic_active_contour(gimage, num_iter,
         threshold = cp.percentile(image, 40)
 
     structure = cp.ones((3,) * len(image.shape), dtype=cp.int8)
-    dimage = cp.gradient(image)
+    dimage = gradient(image)
     # threshold_mask = image > threshold
     if balloon != 0:
         threshold_mask_balloon = image > threshold / cp.abs(balloon)
 
     u = (init_level_set > 0).astype(cp.int8)
 
+    if _misc.ndim(u) == 2:
+        footprints = _get_P2()
+    elif _misc.ndim(u) == 3:
+        footprints = _get_P3()
+    else:
+        raise ValueError("u has an invalid number of dimensions "
+                         "(should be 2 or 3)")
+    workspace = cp.empty(((len(footprints),) + u.shape), dtype=u.dtype)
+
     iter_callback(u)
 
     for _ in range(num_iter):
@@ -439,7 +478,7 @@ def morphological_geodesic_active_contour(gimage, num_iter,
 
         # Image attachment
         aux = cp.zeros_like(image)
-        du = cp.gradient(u)
+        du = gradient(u)
         for el1, el2 in zip(dimage, du):
             aux += el1 * el2
         u[aux > 0] = 1
@@ -447,7 +486,7 @@ def morphological_geodesic_active_contour(gimage, num_iter,
 
         # Smoothing
         for _ in range(smoothing):
-            u = _curvop(u)
+            u = _curvop(u, footprints, workspace)
 
         iter_callback(u)
 
diff --git a/python/cucim/src/cucim/skimage/segmentation/random_walker_segmentation.py b/python/cucim/src/cucim/skimage/segmentation/random_walker_segmentation.py
index 2acaaf8ba..0ec9ed408 100644
--- a/python/cucim/src/cucim/skimage/segmentation/random_walker_segmentation.py
+++ b/python/cucim/src/cucim/skimage/segmentation/random_walker_segmentation.py
@@ -9,7 +9,7 @@
 
 import cupy as cp
 import numpy as np
-from cupyx.scipy import ndimage as ndi
+import cucim.skimage._vendored.ndimage as ndi
 from cupyx.scipy import sparse
 from cupyx.scipy.sparse.linalg import cg, spsolve
 
@@ -343,12 +343,6 @@ def random_walker(data, labels, beta=130, mode='cg_j', tol=1.e-3, copy=True,
           probability that label `label_nb` reaches the pixel `(i, j)`
           first.
 
-    See Also
-    --------
-    skimage.morphology.watershed : watershed segmentation
-        A segmentation algorithm based on mathematical morphology
-        and "flooding" of regions from markers.
-
     Notes
     -----
     Multichannel inputs are scaled with all channel data combined. Ensure all
diff --git a/python/cucim/src/cucim/skimage/segmentation/tests/test_chan_vese.py b/python/cucim/src/cucim/skimage/segmentation/tests/test_chan_vese.py
new file mode 100644
index 000000000..15b991883
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/segmentation/tests/test_chan_vese.py
@@ -0,0 +1,105 @@
+import cupy as cp
+import pytest
+from cupy.testing import assert_array_equal
+
+from cucim.skimage._shared._warnings import expected_warnings
+from cucim.skimage._shared.utils import _supported_float_type
+from cucim.skimage.segmentation import chan_vese
+
+
+@pytest.mark.parametrize('dtype', [cp.float32, cp.float64])
+def test_chan_vese_flat_level_set(dtype):
+    # because the algorithm evolves the level set around the
+    # zero-level, if the level-set has no zero level, the algorithm
+    # will not produce results in theory. However, since a continuous
+    # approximation of the delta function is used, the algorithm
+    # still affects the entirety of the level-set. Therefore with
+    # infinite time, the segmentation will still converge.
+    img = cp.zeros((10, 10), dtype=dtype)
+    img[3:6, 3:6] = 1
+    ls = cp.full((10, 10), 1000, dtype=dtype)
+    result = chan_vese(img, mu=0.0, tol=1e-3, init_level_set=ls)
+    assert_array_equal(result.astype(float), cp.ones((10, 10)))
+    result = chan_vese(img, mu=0.0, tol=1e-3, init_level_set=-ls)
+    assert_array_equal(result.astype(float), cp.zeros((10, 10)))
+
+
+def test_chan_vese_small_disk_level_set():
+    img = cp.zeros((10, 10))
+    img[3:6, 3:6] = 1
+    result = chan_vese(img, mu=0.0, tol=1e-3, init_level_set="small disk")
+    assert_array_equal(result.astype(float), img)
+
+
+def test_chan_vese_simple_shape():
+    img = cp.zeros((10, 10))
+    img[3:6, 3:6] = 1
+    result = chan_vese(img, mu=0.0, tol=1e-8).astype(float)
+    assert_array_equal(result, img)
+
+
+@pytest.mark.parametrize(
+    'dtype', [cp.uint8, cp.float16, cp.float32, cp.float64]
+)
+def test_chan_vese_extended_output(dtype):
+    img = cp.zeros((10, 10), dtype=dtype)
+    img[3:6, 3:6] = 1
+    result = chan_vese(img, mu=0.0, tol=1e-8, extended_output=True)
+    assert len(result) == 3  # check the arity before indexing into result
+    float_dtype = _supported_float_type(dtype)
+    assert result[1].dtype == float_dtype
+    assert all(arr.dtype == float_dtype for arr in result[2])
+
+
+def test_chan_vese_remove_noise():
+    ref = cp.zeros((10, 10))
+    ref[1:6, 1:6] = cp.array([[0, 1, 1, 1, 0],
+                              [1, 1, 1, 1, 1],
+                              [1, 1, 1, 1, 1],
+                              [1, 1, 1, 1, 1],
+                              [0, 1, 1, 1, 0]])
+    img = ref.copy()
+    img[8, 3] = 1
+    result = chan_vese(img, mu=0.3, tol=1e-3, max_num_iter=100, dt=10,
+                       init_level_set="disk").astype(float)
+    assert_array_equal(result, ref)
+
+
+def test_chan_vese_incorrect_image_type():
+    img = cp.zeros((10, 10, 3))
+    ls = cp.zeros((10, 9))
+    with pytest.raises(ValueError):
+        chan_vese(img, mu=0.0, init_level_set=ls)
+
+
+def test_chan_vese_gap_closing():
+    ref = cp.zeros((20, 20))
+    ref[8:15, :] = cp.ones((7, 20))
+    img = ref.copy()
+    img[:, 6] = cp.zeros((20))
+    result = chan_vese(img, mu=0.7, tol=1e-3, max_num_iter=1000, dt=1000,
+                       init_level_set="disk").astype(float)
+    assert_array_equal(result, ref)
+
+
+def test_chan_vese_max_iter_deprecation():
+    img = cp.zeros((20, 20))
+    with expected_warnings(["`max_iter` is a deprecated argument"]):
+        chan_vese(img, max_iter=10)
+
+
+def test_chan_vese_incorrect_level_set():
+    img = cp.zeros((10, 10))
+    ls = cp.zeros((10, 9))
+    with pytest.raises(ValueError):
+        chan_vese(img, mu=0.0, init_level_set=ls)
+    with pytest.raises(ValueError):
+        chan_vese(img, mu=0.0, init_level_set="a")
+
+
+def test_chan_vese_blank_image():
+    img = cp.zeros((10, 10))
+    level_set = cp.random.rand(10, 10)
+    ref = level_set > 0
+    result = chan_vese(img, mu=0.0, tol=0.0, init_level_set=level_set)
+    assert_array_equal(result, ref)
diff --git a/python/cucim/src/cucim/skimage/segmentation/tests/test_expand_labels.py b/python/cucim/src/cucim/skimage/segmentation/tests/test_expand_labels.py
new file mode 100644
index 000000000..2fe1df30c
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/segmentation/tests/test_expand_labels.py
@@ -0,0 +1,226 @@
+import cupy as cp
+import pytest
+from cupy.testing import assert_array_equal
+
+from cucim.core.operations.morphology import distance_transform_edt
+from cucim.skimage import data, measure
+from cucim.skimage.segmentation import expand_labels
+
+SAMPLE1D = cp.array([0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0])
+SAMPLE1D_EXPANDED_3 = cp.array(
+    [4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
+)
+
+# Some pixels are important edge cases with undefined behaviour:
+# these are the pixels that are at the same distance from
+# multiple labels. Ideally the label would be chosen at random
+# to avoid bias, but as we are relying on the index map returned
+# by the scipy.ndimage distance transform, what actually happens
+# is determined by the upstream implementation of the distance
+# transform, thus we don't give any guarantees for the edge case pixels.
+#
+# Regardless, it seems prudent to have a test including an edge case
+# so we can detect whether future upstream changes in scipy.ndimage
+# modify the behaviour.
+
+EDGECASE1D = cp.array([0, 0, 4, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0])
+EDGECASE1D_EXPANDED_3 = cp.array(
+    [4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
+)
+
+SAMPLE2D = cp.array(
+    [
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
+        [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
+        [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0],
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0],
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+    ]
+)
+
+SAMPLE2D_EXPANDED_3 = cp.array(
+    [
+        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
+        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
+        [1, 1, 1, 1, 1, 1, 1, 0, 0, 2, 0],
+        [1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2],
+        [1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2],
+        [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
+        [1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2],
+        [1, 1, 1, 1, 1, 0, 0, 2, 2, 2, 2],
+        [0, 0, 1, 0, 0, 0, 0, 2, 2, 2, 2],
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0],
+    ]
+)
+
+# non-integer expansion
+SAMPLE2D_EXPANDED_1_5 = cp.array(
+    [
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
+        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
+        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
+        [1, 1, 1, 1, 1, 0, 0, 0, 2, 2, 2],
+        [1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2],
+        [0, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2],
+        [0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2],
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+    ]
+)
+
+
+EDGECASE2D = cp.array(
+    [
+        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0],
+        [0, 0, 1, 1, 0, 2, 2, 0, 0, 0, 0],
+        [0, 1, 1, 1, 0, 2, 0, 0, 0, 0, 0],
+        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
+    ]
+)
+
+EDGECASE2D_EXPANDED_4 = cp.array(
+    [
+        [1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0],
+        [1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2],
+        [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
+        [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0],
+        [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0],
+    ]
+)
+
+SAMPLE3D = cp.array(
+    [
+        [
+            [0, 0, 0, 0],
+            [0, 3, 0, 0],
+            [0, 0, 0, 0],
+            [0, 0, 0, 0],
+        ],
+
+        [
+            [0, 0, 0, 0],
+            [0, 3, 3, 0],
+            [0, 0, 0, 0],
+            [0, 0, 0, 0],
+        ],
+
+        [
+            [0, 0, 0, 0],
+            [0, 3, 0, 0],
+            [0, 0, 0, 0],
+            [0, 0, 5, 0],
+        ],
+
+        [
+            [0, 0, 0, 0],
+            [0, 0, 0, 0],
+            [0, 0, 0, 0],
+            [0, 0, 5, 0],
+        ]
+    ]
+)
+
+SAMPLE3D_EXPANDED_2 = cp.array(
+    [
+        [
+            [3, 3, 3, 3],
+            [3, 3, 3, 3],
+            [3, 3, 3, 3],
+            [0, 3, 5, 0],
+        ],
+
+        [
+            [3, 3, 3, 3],
+            [3, 3, 3, 3],
+            [3, 3, 3, 3],
+            [0, 5, 5, 5],
+        ],
+
+        [
+            [3, 3, 3, 3],
+            [3, 3, 3, 3],
+            [3, 3, 5, 5],
+            [5, 5, 5, 5],
+        ],
+
+        [
+            [3, 3, 3, 0],
+            [3, 3, 3, 0],
+            [3, 3, 5, 5],
+            [5, 5, 5, 5],
+        ]
+    ]
+)
+
+SAMPLE_EDGECASE_BEHAVIOUR = cp.array(
+    [[0, 1, 0, 0], [2, 0, 0, 0], [0, 3, 0, 0]]
+)
+
+
+@pytest.mark.parametrize(
+    "input_array, expected_output, expand_distance",
+    [
+        (SAMPLE1D, SAMPLE1D_EXPANDED_3, 3),
+        (SAMPLE2D, SAMPLE2D_EXPANDED_3, 3),
+        (SAMPLE2D, SAMPLE2D_EXPANDED_1_5, 1.5),
+        (EDGECASE1D, EDGECASE1D_EXPANDED_3, 3),
+        (EDGECASE2D, EDGECASE2D_EXPANDED_4, 4),
+        (SAMPLE3D, SAMPLE3D_EXPANDED_2, 2)
+    ]
+)
+def test_expand_labels(input_array, expected_output, expand_distance):
+    if input_array.ndim == 1:
+        with pytest.raises(NotImplementedError):
+            expand_labels(input_array, expand_distance)
+    else:
+        expanded = expand_labels(input_array, expand_distance)
+        assert_array_equal(expanded, expected_output)
+
+
+@pytest.mark.parametrize('ndim', [2, 3])
+@pytest.mark.parametrize('distance', range(6))
+def test_binary_blobs(ndim, distance):
+    """Check some invariants with label expansion.
+
+    - New labels array should exactly contain the original labels array.
+    - Distance to old labels array within new labels should never exceed input
+      distance.
+    - Distance beyond the expanded labels should always exceed the input
+      distance.
+    """
+    img = data.binary_blobs(length=64, blob_size_fraction=0.05, n_dim=ndim)
+    labels = measure.label(img)
+    expanded = expand_labels(labels, distance=distance)
+    original_mask = labels != 0
+    assert_array_equal(labels[original_mask], expanded[original_mask])
+    expanded_only_mask = (expanded - labels).astype(bool)
+    distance_map = distance_transform_edt(~original_mask)
+    expanded_distances = distance_map[expanded_only_mask]
+    if expanded_distances.size > 0:
+        assert cp.all(expanded_distances <= distance)
+    beyond_expanded_distances = distance_map[~expanded.astype(bool)]
+    if beyond_expanded_distances.size > 0:
+        assert cp.all(beyond_expanded_distances > distance)
+
+
+def test_edge_case_behaviour():
+    """Check edge case behaviour to detect upstream changes.
+
+    For edge cases where a pixel has the same distance to several regions,
+    lexicographical order seems to determine which region gets to expand
+    into this pixel given the current upstream behaviour in
+    scipy.ndimage.distance_transform_edt.
+
+    As a result, we expect different results when transposing the array.
+    If this test fails, something has changed upstream.
+    """
+    expanded = expand_labels(SAMPLE_EDGECASE_BEHAVIOUR, 1)
+    expanded_transpose = expand_labels(SAMPLE_EDGECASE_BEHAVIOUR.T, 1)
+    assert not cp.all(expanded == expanded_transpose.T)
diff --git a/python/cucim/src/cucim/skimage/transform/_geometric.py b/python/cucim/src/cucim/skimage/transform/_geometric.py
index 372cf3b84..91211834b 100644
--- a/python/cucim/src/cucim/skimage/transform/_geometric.py
+++ b/python/cucim/src/cucim/skimage/transform/_geometric.py
@@ -1421,10 +1421,8 @@ def __init__(self, matrix=None, scale=None, rotation=None,
             if translation is None:
                 translation = (0,) * dimensionality
             if dimensionality == 2:
-                ax = (0, 1)
                 c, s = _cos(rotation), _sin(rotation)
-                matrix[ax, ax] = c
-                matrix[ax, ax[::-1]] = -s, s
+                matrix[:2, :2] = xp.array([[c, -s], [s, c]], dtype=float)
             else:  # 3D rotation
                 matrix[:3, :3] = xp.asarray(_euler_rotation_matrix(rotation))
 
diff --git a/python/cucim/src/cucim/skimage/transform/_warps.py b/python/cucim/src/cucim/skimage/transform/_warps.py
index 05aea7f24..7f2f69b2b 100644
--- a/python/cucim/src/cucim/skimage/transform/_warps.py
+++ b/python/cucim/src/cucim/skimage/transform/_warps.py
@@ -2,7 +2,7 @@
 
 import cupy as cp
 import numpy as np
-from cupyx.scipy import ndimage as ndi
+import cucim.skimage._vendored.ndimage as ndi
 
 from .._shared.utils import (_to_ndimage_mode, _validate_interpolation_order,
                              channel_as_last_axis, convert_to_float,
@@ -260,8 +260,8 @@ def rescale(image, scale, order=None, mode='reflect', cval=0, clip=True,
         Otherwise, this parameter indicates which axis of the array corresponds
         to channels.
 
-        .. versionadded:: 0.19
-           ``channel_axis`` was added in 0.19.
+        .. versionadded:: 22.02.00
+           ``channel_axis`` was added in 22.02.00.
 
     Notes
     -----
@@ -766,19 +766,33 @@ def _clip_warp_output(input_image, output_image, mode, cval, clip):
     """
     if clip:
         min_val = input_image.min().item()
-        max_val = input_image.max().item()
-
-        preserve_cval = (mode == 'constant' and not
-                         (min_val <= cval <= max_val))
+        if np.isnan(min_val):
+            # NaNs detected, use NaN-safe min/max
+            min_func = cp.nanmin
+            max_func = cp.nanmax
+            min_val = min_func(input_image).item()
+        else:
+            min_func = cp.min
+            max_func = cp.max
+        max_val = max_func(input_image).item()
+
+        # Check if cval has been used such that it expands the effective input
+        # range
+        preserve_cval = (
+            mode == 'constant'
+            and not min_val <= cval <= max_val
+            and min_func(output_image) <= cval <= max_func(output_image)
+        )
 
+        # expand min/max range to account for cval
         if preserve_cval:
-            cval_mask = output_image == cval
+            # cast cval to the same dtype as the input image
+            cval = input_image.dtype.type(cval)
+            min_val = min(min_val, cval)
+            max_val = max(max_val, cval)
 
         cp.clip(output_image, min_val, max_val, out=output_image)
 
-        if preserve_cval:
-            output_image[cval_mask] = cval
-
 
 def warp(image, inverse_map, map_args={}, output_shape=None, order=None,
          mode='constant', cval=0., clip=True, preserve_range=False):
@@ -1080,8 +1094,8 @@ def warp_polar(image, center=None, *, radius=None, output_shape=None,
         Otherwise, this parameter indicates which axis of the array corresponds
         to channels.
 
-        .. versionadded:: 0.19
-           ``channel_axis`` was added in 0.19.
+        .. versionadded:: 22.02.00
+           ``channel_axis`` was added in 22.02.00.
     **kwargs : keyword arguments
         Passed to `transform.warp`.
 
diff --git a/python/cucim/src/cucim/skimage/transform/tests/test_warps.py b/python/cucim/src/cucim/skimage/transform/tests/test_warps.py
index 6504c082d..8c8717f51 100644
--- a/python/cucim/src/cucim/skimage/transform/tests/test_warps.py
+++ b/python/cucim/src/cucim/skimage/transform/tests/test_warps.py
@@ -118,6 +118,78 @@ def test_warp_clip():
     assert_almost_equal(float(outx.max()), 1)
 
 
+@pytest.mark.parametrize('order', [0, 1])
+def test_warp_clip_image_containing_nans(order):
+    # Test that clipping works as intended on an image with NaNs
+    # Orders >1 do not produce good output when the input image has
+    # NaNs, so those orders are not tested.
+
+    x = cp.ones((15, 15), dtype=cp.float64)
+    x[7, 7] = cp.nan
+
+    outx = rotate(x, 45, order=order, cval=2, resize=True, clip=True)
+
+    assert_almost_equal(cp.nanmin(outx).item(), 1)
+    assert_almost_equal(cp.nanmax(outx).item(), 2)
+
+
+@pytest.mark.parametrize('order', [0, 1])
+def test_warp_clip_cval_is_nan(order):
+    # Test that clipping works as intended when cval is NaN
+    # Orders > 1 do not produce good output when cval is NaN, so those
+    # orders are not tested.
+
+    x = cp.ones((15, 15), dtype=cp.float64)
+    x[5:-5, 5:-5] = 2
+
+    outx = rotate(x, 45, order=order, cval=cp.nan, resize=True, clip=True)
+
+    assert_almost_equal(cp.nanmin(outx).item(), 1)
+    assert_almost_equal(cp.nanmax(outx).item(), 2)
+
+
+@pytest.mark.parametrize('order', range(6))
+def test_warp_clip_cval_outside_input_range(order):
+    # Test that clipping behavior considers cval part of the input range
+
+    x = cp.ones((15, 15), dtype=cp.float64)
+
+    # Specify a cval that is outside the input range to check clipping
+    outx = rotate(x, 45, order=order, cval=2, resize=True, clip=True)
+
+    # The corners should be cval for all interpolation orders
+    outx = cp.asnumpy(outx)
+    assert_array_almost_equal([outx[0, 0], outx[0, -1],
+                               outx[-1, 0], outx[-1, -1]], 2)
+
+    # For all interpolation orders other than nearest-neighbor, the clipped
+    # output should have some pixels with values between the input (1) and
+    # cval (2) (i.e., clipping should not set them to 1)
+    if order > 0:
+        assert np.sum(np.less(1, outx) * np.less(outx, 2)) > 0
+
+
+@pytest.mark.parametrize('order', range(6))
+def test_warp_clip_cval_not_used(order):
+    # Test that clipping does not consider cval part of the input range if it
+    # is not used in the output image
+
+    x = cp.ones((15, 15), dtype=cp.float64)
+    x[5:-5, 5:-5] = 2
+
+    # Transform the image by stretching it out by one pixel on each side so
+    # that cval will not actually be used
+    scale = 15 / (15 + 2)
+    transform = AffineTransform(scale=scale, translation=(1, 1))
+    outx = warp(x, transform, mode='constant', order=order, cval=0, clip=True)
+
+    # At higher orders of interpolation, the transformed image has overshoots
+    # beyond the input range that should be clipped to the range 1 to 2.  Even
+    # though cval=0, the minimum value of the clipped output image should be
+    # 1 and not affected by the unused cval.
+    assert_array_almost_equal(outx.min(), 1)
+
+
 def test_homography():
     x = cp.zeros((5, 5), dtype=cp.double)
     x[1, 1] = 1
diff --git a/python/cucim/tests/fixtures/testimage.py b/python/cucim/tests/fixtures/testimage.py
index c50b8eb6f..343698676 100644
--- a/python/cucim/tests/fixtures/testimage.py
+++ b/python/cucim/tests/fixtures/testimage.py
@@ -21,9 +21,9 @@
 from ..util.gen_image import ImageGenerator
 
 
-def gen_image(tmpdir_factory, recipe):
+def gen_image(tmpdir_factory, recipe, resolution=None):
     dataset_path = tmpdir_factory.mktemp('datasets').strpath
-    dataset_gen = ImageGenerator(dataset_path, [recipe])
+    dataset_gen = ImageGenerator(dataset_path, [recipe], [resolution])
     image_path = dataset_gen.gen()
     return (dataset_path, image_path[0])
 
@@ -63,9 +63,8 @@ def testimg_tiff_stripe_32x24_16_raw(tmpdir_factory):
 def testimg_tiff_stripe_32x24_16(request):
     return request.param
 
-# tiff_stripe_4096x4096_256
-
 
+# tiff_stripe_4096x4096_256
 @pytest.fixture(scope='session')
 def testimg_tiff_stripe_4096x4096_256_jpeg(tmpdir_factory):
     dataset_path, image_path = gen_image(
@@ -137,3 +136,45 @@ def testimg_tiff_stripe_100000x100000_256_raw(tmpdir_factory):
 ])
 def testimg_tiff_stripe_100000x100000_256(request):
     return request.param
+
+
+# testimg_tiff_stripe_4096_4096_256_jpeg_resolution
+@pytest.fixture(scope='session')
+def testimg_tiff_stripe_4096_4096_256_jpeg_resolution_3_5_centimeter(
+        tmpdir_factory):
+    resolution = (0.3, 0.5, "CENTIMETER")
+    dataset_path, image_path = gen_image(
+        tmpdir_factory, 'tiff::stripe:4096x4096:256:jpeg', resolution)
+    yield image_path, resolution
+    # Clean up fake dataset folder
+    shutil.rmtree(dataset_path)
+
+
+@pytest.fixture(scope='session')
+def testimg_tiff_stripe_4096_4096_256_jpeg_resolution_4_7_inch(tmpdir_factory):
+    resolution = (0.4, 0.7, "INCH")
+    dataset_path, image_path = gen_image(
+        tmpdir_factory, 'tiff::stripe:4096x4096:256:jpeg', resolution)
+    yield image_path, resolution
+    # Clean up fake dataset folder
+    shutil.rmtree(dataset_path)
+
+
+@pytest.fixture(scope='session')
+def testimg_tiff_stripe_4096_4096_256_jpeg_resolution_9_1_none(tmpdir_factory):
+    resolution = (9, 1, "NONE")
+    dataset_path, image_path = gen_image(
+        tmpdir_factory, 'tiff::stripe:4096x4096:256:jpeg', resolution)
+    yield image_path, resolution
+    # Clean up fake dataset folder
+    shutil.rmtree(dataset_path)
+
+
+@pytest.fixture(scope='session', params=[
+    lazy_fixture(
+        'testimg_tiff_stripe_4096_4096_256_jpeg_resolution_3_5_centimeter'),
+    lazy_fixture('testimg_tiff_stripe_4096_4096_256_jpeg_resolution_4_7_inch'),
+    lazy_fixture('testimg_tiff_stripe_4096_4096_256_jpeg_resolution_9_1_none'),
+])
+def testimg_tiff_stripe_4096_4096_256_jpeg_resolution(request):
+    return request.param
diff --git a/python/cucim/tests/unit/clara/test_load_image_metadata.py b/python/cucim/tests/unit/clara/test_load_image_metadata.py
index 65e7d926b..2452d4324 100644
--- a/python/cucim/tests/unit/clara/test_load_image_metadata.py
+++ b/python/cucim/tests/unit/clara/test_load_image_metadata.py
@@ -14,6 +14,7 @@
 #
 
 from ...util.io import open_image_cucim
+import math
 
 
 def test_load_image_metadata(testimg_tiff_stripe_32x24_16):
@@ -45,7 +46,7 @@ def test_load_image_metadata(testimg_tiff_stripe_32x24_16):
     # Returns physical size in tuple.
     assert img.spacing() == [1.0, 1.0, 1.0]
     # Units for each spacing element (size is same with `ndim`).
-    assert img.spacing_units() == ['micrometer', 'micrometer', 'color']
+    assert img.spacing_units() == ['', '', 'color']
     # Physical location of (0, 0, 0) (size is always 3).
     assert img.origin == [0.0, 0.0, 0.0]
     # Direction cosines (size is always 3x3).
@@ -71,6 +72,54 @@ def test_load_image_metadata(testimg_tiff_stripe_32x24_16):
     assert img.raw_metadata == '{"axes": "YXC", "shape": [24, 32, 3]}'
 
 
+def test_load_image_resolution_metadata(testimg_tiff_stripe_4096_4096_256_jpeg_resolution):  # noqa: E501
+    image, resolution = testimg_tiff_stripe_4096_4096_256_jpeg_resolution
+    img = open_image_cucim(image)
+
+    x_resolution, y_resolution, resolution_unit = resolution
+    # Expected spacing in micrometers (1 cm = 10000 um, 1 inch = 25400 um).
+    if resolution_unit == "CENTIMETER":
+        x_spacing = 10000.0 / x_resolution
+        y_spacing = 10000.0 / y_resolution
+        spacing_unit = "micrometer"
+    elif resolution_unit == "INCH":
+        x_spacing = 25400.0 / x_resolution
+        y_spacing = 25400.0 / y_resolution
+        spacing_unit = "micrometer"
+    else:  # e.g. "NONE": raw resolution values, empty unit string
+        x_spacing = x_resolution
+        y_spacing = y_resolution
+        spacing_unit = ""
+
+    # Spacing is ordered (y, x, c); compared with a 10% relative tolerance.
+    assert all(map(lambda a, b: math.isclose(a, b, rel_tol=0.1),
+                   img.spacing(), (y_spacing, x_spacing, 1.0)))
+    # Units for each spacing element (same length as `ndim`).
+    assert img.spacing_units() == [spacing_unit, spacing_unit, 'color']
+
+    # A metadata object as `dict`
+    metadata = img.metadata
+    print(metadata)
+    assert isinstance(metadata, dict)
+    assert len(metadata) == 2  # 'cucim' and 'tiff'
+    assert math.isclose(metadata['tiff']['x_resolution'],
+                        x_resolution, rel_tol=0.00001)
+    assert math.isclose(metadata['tiff']['y_resolution'],
+                        y_resolution, rel_tol=0.00001)
+    unit_value = resolution_unit.lower() if resolution_unit != "NONE" else ""
+    assert metadata['tiff']['resolution_unit'] == unit_value
+
+    # Each level's region spacing must be base spacing / level downsample.
+    num_levels = img.resolutions['level_count']
+    for level in range(num_levels):
+        lowres_img = img.read_region((0, 0), (100, 100), level=level)
+        lowres_downsample = img.resolutions["level_downsamples"][level]
+        assert all(map(lambda a, b: math.isclose(a, b, rel_tol=0.1),
+                       lowres_img.spacing(),
+                       (y_spacing / lowres_downsample,
+                        x_spacing / lowres_downsample, 1.0)))
+
+
 def test_load_rgba_image_metadata(tmpdir):
     """Test accessing RGBA image's metadata.
 
diff --git a/python/cucim/tests/unit/core/test_stain_normalizer.py b/python/cucim/tests/unit/core/test_stain_normalizer.py
index 4b50a4121..ee66ecf67 100644
--- a/python/cucim/tests/unit/core/test_stain_normalizer.py
+++ b/python/cucim/tests/unit/core/test_stain_normalizer.py
@@ -22,15 +22,15 @@
 
 class TestStainExtractorMacenko():
     @pytest.mark.parametrize(
-        'image',
+        'image, ErrorClass',
         [
-            cp.full((3, 2, 4), -1),   # negative value
-            cp.full((3, 2, 4), 256),  # out of range value
-            None,
-            cp.full((3, 2, 4), 240),  # uniformly below the beta threshold
-        ]
+            (cp.full((3, 2, 4), -1), ValueError),   # negative value
+            (cp.full((3, 2, 4), 256), ValueError),  # out of range value
+            (None, TypeError),
+            (cp.full((3, 2, 4), 240), ValueError),  # uniformly below the beta threshold  # noqa
+        ],
     )
-    def test_transparent_image(self, image):
+    def test_transparent_image(self, image, ErrorClass):
         """
         Test HE stain extraction on an image that comprises
         only transparent pixels - pixels with absorbance below the
@@ -38,12 +38,8 @@ def test_transparent_image(self, image):
         since once the transparent pixels are removed, there are no
         remaining pixels to compute eigenvectors.
         """
-        if image is None:
-            with pytest.raises(TypeError):
-                stain_extraction_pca(image)
-        else:
-            with pytest.raises(ValueError):
-                stain_extraction_pca(image)
+        with pytest.raises(ErrorClass):
+            stain_extraction_pca(image)
 
     @pytest.mark.parametrize(
         'image',
diff --git a/python/cucim/tests/util/gen_image.py b/python/cucim/tests/util/gen_image.py
index 8eea5dc81..645db1f79 100644
--- a/python/cucim/tests/util/gen_image.py
+++ b/python/cucim/tests/util/gen_image.py
@@ -16,6 +16,7 @@
 import argparse
 import logging
 import os
+import tifffile
 
 try:
     from .gen_tiff import TiffGenerator
@@ -28,16 +29,23 @@
 
 
 class ImageGenerator:
-    def __init__(self, dest, recipes, logger=None):
+    def __init__(self, dest, recipes, resolutions=None, logger=None):
         self.logger = logger or logging.getLogger(__name__)
         self.dest = dest
         self.recipes = recipes
 
+        if resolutions is None:  # default: (1, 1, cm) per recipe
+            resolutions = [(1, 1, "CENTIMETER")] * len(recipes)
+        if len(resolutions) != len(recipes):  # gen() zips the two lists
+            raise RuntimeError(
+                'Number of resolutions must be equal to number of recipes')
+        self.resolutions = resolutions
+
     def gen(self):
 
         results = []
 
-        for recipe in self.recipes:
+        for recipe, resolution in zip(self.recipes, self.resolutions):
             items = recipe.split(':')
             item_len = len(items)
             if not (1 <= item_len <= 6):
@@ -69,10 +77,16 @@ def gen(self):
                 raise RuntimeError(
                     f'No data generated from [pattern={pattern},'
                     + f' image_size={image_size}, tile_size={tile_size},'
-                    + f' compression={compression}].')
+                    + f' compression={compression}, resolution={resolution}].')
 
             file_name = f'{kind}_{pattern}_{image_size_str}_{tile_size}'
-
+            # Accept None, (x, y) or (x, y, unit); never reuse a stale unit.
+            if resolution is not None and len(resolution) not in (2, 3):
+                raise RuntimeError(f'Invalid resolution: {resolution}')
+            unit = None
+            if resolution is not None and len(resolution) == 3:
+                resolution, unit = resolution[:2], resolution[2]
+            unit = tifffile.RESUNIT.NONE if unit is None else unit
             image_path = generator_obj.save_image(image_data,
                                                   dest_folder,
                                                   file_name=file_name,
@@ -81,7 +95,9 @@ def gen(self):
                                                   pattern=pattern,
                                                   image_size=image_size,
                                                   tile_size=tile_size,
-                                                  compression=compression)
+                                                  compression=compression,
+                                                  resolution=resolution,
+                                                  resolutionunit=unit)
             self.logger.info('  Generated %s...', image_path)
             results.append(image_path)
 
diff --git a/python/cucim/tests/util/gen_tiff.py b/python/cucim/tests/util/gen_tiff.py
index 713679657..324f8db14 100644
--- a/python/cucim/tests/util/gen_tiff.py
+++ b/python/cucim/tests/util/gen_tiff.py
@@ -38,7 +38,8 @@ def get_image(self, pattern, image_size):
         return None
 
     def save_image(self, image_data, dest_folder, file_name, kind, subpath,
-                   pattern, image_size, tile_size, compression):
+                   pattern, image_size, tile_size, compression, resolution,
+                   resolutionunit):
         # You can add pyramid images (0: largest resolution)
         if isinstance(image_data, list):
             arr_stack = image_data
@@ -55,10 +56,15 @@ def save_image(self, image_data, dest_folder, file_name, kind, subpath,
         tiff_file_name = str(
             (Path(dest_folder) / f'{file_name}.tif').absolute())
 
+        level_resolution = None
         with TiffWriter(tiff_file_name, bigtiff=True) as tif:
             for level in range(len(arr_stack)):  # save from the largest image
                 src_arr = arr_stack[level]
 
+                if resolution:
+                    level_resolution = (resolution[0] / (level + 1),
+                                        resolution[1] / (level + 1))
+
                 tif.write(
                     src_arr,
                     software="tifffile",
@@ -68,6 +74,8 @@ def save_image(self, image_data, dest_folder, file_name, kind, subpath,
                     planarconfig="CONTIG",
                     compression=compression,  # requires imagecodecs
                     subfiletype=1 if level else 0,
+                    resolution=level_resolution,
+                    resolutionunit=resolutionunit,
                 )
         return tiff_file_name
 
diff --git a/run b/run
index 8033eaf18..6fe8ce919 100755
--- a/run
+++ b/run
@@ -806,7 +806,9 @@ test() {
 
 install_python_test_deps_() {
     if [ -n "${CONDA_PREFIX}" ]; then
-        run_command conda install -c conda-forge -y \
+        # https://github.com/rapidsai/cucim/pull/349#issuecomment-1203335731
+        # Do not update or change already-installed dependencies.
+        run_command conda install -c conda-forge -y --freeze-installed \
             --file ${TOP}/python/cucim/requirements-test.txt
     else
         if [ -n "${VIRTUAL_ENV}" ]; then