From ceccac559b5d40891b66fc0405828fce9e5c8d57 Mon Sep 17 00:00:00 2001
From: Dale Black <djblack@uci.edu>
Date: Thu, 26 Dec 2024 09:25:37 -0800
Subject: [PATCH 1/5] replace (gpu) boolean_indicator with AK.jl version

---
 src/utils.jl | 33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/src/utils.jl b/src/utils.jl
index 9e086d5..0fe2dbd 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -1,5 +1,8 @@
 using KernelAbstractions
 using GPUArraysCore: AbstractGPUArray
+import AcceleratedKernels as AK
+
+export boolean_indicator
 
 """
 ## `boolean_indicator`
@@ -46,25 +49,17 @@ function boolean_indicator(f)
 end
 
 function boolean_indicator(f::BitArray)
-    f_new = similar(f, Float32)
-    for i in CartesianIndices(f_new)
-        @inbounds f_new[i] = f[i] ? 0.0f0 : 1.0f10
-    end
-    return f_new
-end
-
-@kernel function boolean_indicator_kernel(f, output)
-    i = @index(Global)
-    output[i] = ifelse(f[i] == 0, 1.0f10, 0.0f0)
+    f_new = similar(f, Float32)
+    for i in eachindex(f)
+        @inbounds f_new[i] = f[i] ? 0.0f0 : 1.0f10
+    end
+    return f_new
 end
 
 function boolean_indicator(f::AbstractGPUArray)
-    backend = get_backend(f)
-    output = similar(f, Float32)
-    kernel = boolean_indicator_kernel(backend)
-    kernel(f, output, ndrange=size(f))
-    KernelAbstractions.synchronize(backend)
-    return output
-end
-
-export boolean_indicator
+    f_new = similar(f, Float32)
+    AK.foreachindex(f) do i
+        @inbounds f_new[i] = f[i] == false ? 1.0f10 : 0.0f0
+    end
+    return f_new
+end

From 6c3e8929beb84c1fac36970ef66cf00d9c15c0b2 Mon Sep 17 00:00:00 2001
From: Dale Black <djblack@uci.edu>
Date: Thu, 26 Dec 2024 09:25:46 -0800
Subject: [PATCH 2/5] add AK.jl

---
 Project.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Project.toml b/Project.toml
index 4259a10..d8ac364 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,11 +4,13 @@ authors = ["Dale <djblack@uci.edu> and contributors"]
 version = "0.2.2"
 
 [deps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 
 [compat]
+AcceleratedKernels = "0.2"
 GPUArraysCore = "0.1"
 KernelAbstractions = "0.9"
 julia = "1.10"

From d2f67c7e8c48b50df9723b3f036b30a07bab602b Mon Sep 17 00:00:00 2001
From: Dale Black <djblack@uci.edu>
Date: Thu, 26 Dec 2024 10:46:02 -0800
Subject: [PATCH 3/5] replace with AcceleratedKernels.jl

---
 src/transform.jl  | 447 ++++++++++++++++++----------------------------
 src/utils.jl      |   1 -
 test/runtests.jl  |   8 +
 test/transform.jl | 402 ++++++++++++++++++++---------------------
 4 files changed, 383 insertions(+), 475 deletions(-)

diff --git a/src/transform.jl b/src/transform.jl
index ec3ee8d..8bc32bd 100644
--- a/src/transform.jl
+++ b/src/transform.jl
@@ -1,5 +1,8 @@
+import KernelAbstractions as KA
+import AcceleratedKernels as AK
 using GPUArraysCore: AbstractGPUVector, AbstractGPUMatrix, AbstractGPUArray
-using KernelAbstractions
+
+export transform!, transform
 
 """
 ## `transform!`
@@ -37,267 +40,153 @@ transform!(f_bool, output, v, z)
 ```
 """
 function transform!(f::AbstractVector, output, v, z)
-	z[1] = -Inf32
-	z[2] = Inf32
-
-	k = 1
-	@inbounds for q in 2:length(f)
-		s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2 * q - 2 * v[k])
-		while s ≤ z[k]
-			k -= 1
-			s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2 * q - 2 * v[k])
-		end
-		k += 1
-		v[k] = q
-		z[k] = s
-		z[k+1] = Inf32
-	end
-
-	k = 1
-	@inbounds for q in 1:length(f)
-		while z[k+1] < q
-			k += 1
-		end
-		output[q] = (q - v[k])^2 + f[v[k]]
-	end
+    z[1] = -Inf32  # z[k] and z[k+1] bound the positions q that are served by index v[k]
+    z[2] = Inf32
+
+    k = 1  # v[1] is read before it is written here, so the caller must pre-fill it
+    @inbounds for q in 2:length(f)
+        s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2 * q - 2 * v[k])
+        while s ≤ z[k]  # s is not past z[k]: discard entry k and retry with the previous one
+            k -= 1
+            s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2 * q - 2 * v[k])
+        end
+        k += 1
+        v[k] = q  # q becomes the newest retained index
+        z[k] = s  # it serves positions from s onwards
+        z[k+1] = Inf32
+    end
+
+    k = 1
+    @inbounds for q in 1:length(f)
+        while z[k+1] < q  # advance to the entry whose range contains q
+            k += 1
+        end
+        output[q] = (q - v[k])^2 + f[v[k]]  # squared offset to v[k] plus f at v[k]
+    end
 end
 
 # 2D
-function transform!(img::AbstractMatrix, output, v, z; threaded = true)
-	if threaded
-		Threads.@threads for i in CartesianIndices(@view(img[:, 1]))
-			@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
-		end
-
-		copyto!(img, output)
-		Threads.@threads for j in CartesianIndices(@view(img[1, :]))
-			@views transform!(
-				img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1),
-			)
-		end
-	else
-		for i in CartesianIndices(@view(img[:, 1]))
-			@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
-		end
-
-		copyto!(img, output)
-		for j in CartesianIndices(@view(img[1, :]))
-			@views transform!(
-				img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1),
-			)
-		end
-	end
-end
-
-# 3D
-function transform!(vol::AbstractArray{<:Real, 3}, output, v, z; threaded = true)
-	if threaded
-		Threads.@threads for i in CartesianIndices(@view(vol[:, 1, 1]))
-			@views transform!(vol[i, :, :], output[i, :, :], v[i, :, :], z[i, :, :])
-		end
-
-		copyto!(vol, output)
-		Threads.@threads for idx in CartesianIndices(@view(vol[1, :, :]))
-			j, k = Tuple(idx)
-			@views transform!(
-				vol[:, j, k], output[:, j, k], fill!(v[:, j, k], 1), fill!(z[:, j, k], 1),
-			)
-		end
-	else
-		for i in CartesianIndices(@view(vol[:, 1, 1]))
-			@views transform!(vol[i, :, :], output[i, :, :], v[i, :, :], z[i, :, :])
-		end
-
-		copyto!(vol, output)
-		for idx in CartesianIndices(@view(vol[1, :, :]))
-			j, k = Tuple(idx)
-			@views transform!(
-				vol[:, j, k], output[:, j, k], fill!(v[:, j, k], 1), fill!(z[:, j, k], 1),
-			)
-		end
-	end
-end
-
-# GPU (2D)
-@kernel function _first_pass_2D!(f, out, s2)
-	row, col = @index(Global, NTuple)
-
-	if f[row, col] < 0.5f0
-		ct = 1
-		curr_l = min(col - 1, s2 - col)
-		finished = false
-		while !finished && ct <= curr_l
-			if f[row, col-ct] > 0.5f0 || f[row, col+ct] > 0.5f0
-				out[row, col] = ct * ct
-				finished = true
-			end
-			ct += 1
-		end
-		while !finished && ct < col
-			if f[row, col-ct] > 0.5f0
-				out[row, col] = ct * ct
-				finished = true
-			end
-			ct += 1
-		end
-		while !finished && col + ct <= s2
-			if f[row, col+ct] > 0.5f0
-				out[row, col] = ct * ct
-				finished = true
-			end
-			ct += 1
-		end
-		if !finished
-			out[row, col] = 1.0f10
-		end
-	else
-		out[row, col] = 0.0f0
-	end
-end
-
-@kernel function _second_pass_2D!(org, out, s1, s2)
-	row, col = @index(Global, NTuple)
-
-	ct = 1
-	curr_l = sqrt(out[row, col])
-	while ct < curr_l && row + ct <= s1
-		temp = muladd(ct, ct, org[row+ct, col])
-		if temp < out[row, col]
-			out[row, col] = temp
-			curr_l = sqrt(temp)
-		end
-		ct += 1
-	end
-
-	ct = 1
-	while ct < curr_l && row > ct
-		temp = muladd(ct, ct, org[row-ct, col])
-		if temp < out[row, col]
-			out[row, col] = temp
-			curr_l = sqrt(temp)
-		end
-		ct += 1
-	end
+function transform!(img::AbstractMatrix, output, v, z; threaded=true)
+    if threaded
+        Threads.@threads for i in axes(img, 1)  # pass 1: 1D transform of each row
+            @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
+        end
+
+        copyto!(img, output)  # pass 2 reads the pass-1 result from img (img is overwritten)
+        # Pass 2: 1D transform of each column; the v and z slices are reset to 1 first.
+        Threads.@threads for j in axes(img, 2)
+            @views transform!(
+                img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
+            )
+        end
+    else
+        for i in axes(img, 1)  # pass 1: 1D transform of each row
+            @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
+        end
+
+        copyto!(img, output)  # pass 2 reads the pass-1 result from img (img is overwritten)
+        # Pass 2: 1D transform of each column; the v and z slices are reset to 1 first.
+        for j in axes(img, 2)
+            @views transform!(
+                img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
+            )
+        end
+    end
 end
 
-function transform!(img::AbstractGPUMatrix, output)
-	s1, s2 = size(img)
-	backend = get_backend(img)
-	kernel1! = _first_pass_2D!(backend)
-	kernel2! = _second_pass_2D!(backend)
-
-	kernel1!(img, output, s2, ndrange = (s1, s2))
-	copyto!(img, output)
-
-	kernel2!(img, output, s1, s2, ndrange = (s1, s2))
-	KernelAbstractions.synchronize(backend)
+function transform!(vol::AbstractArray{<:Real,3}, output, v, z; threaded=true)
+    if threaded
+        # Pass 1: 1D transforms along dimension 3 (slices vol[i, j, :])
+        Threads.@threads for i in axes(vol, 1)
+            for j in axes(vol, 2)
+                @views transform!(vol[i, j, :], output[i, j, :], v[i, j, :], z[i, j, :])
+            end
+        end
+
+        copyto!(vol, output)  # make this pass's result the input of the next pass
+
+        # Pass 2: along dimension 2 (slices vol[i, :, k]); v and z slices are reset to 1
+        Threads.@threads for i in axes(vol, 1)
+            for k in axes(vol, 3)
+                @views transform!(vol[i, :, k], output[i, :, k], fill!(v[i, :, k], 1), fill!(z[i, :, k], 1))
+            end
+        end
+
+        copyto!(vol, output)  # make this pass's result the input of the next pass
+
+        # Pass 3: along dimension 1 (slices vol[:, j, k]); v and z slices are reset to 1
+        Threads.@threads for j in axes(vol, 2)
+            for k in axes(vol, 3)
+                @views transform!(vol[:, j, k], output[:, j, k], fill!(v[:, j, k], 1), fill!(z[:, j, k], 1))
+            end
+        end
+    else
+        # Pass 1: 1D transforms along dimension 3 (slices vol[i, j, :])
+        for i in axes(vol, 1)
+            for j in axes(vol, 2)
+                @views transform!(vol[i, j, :], output[i, j, :], v[i, j, :], z[i, j, :])
+            end
+        end
+
+        copyto!(vol, output)  # make this pass's result the input of the next pass
+
+        # Pass 2: along dimension 2 (slices vol[i, :, k]); v and z slices are reset to 1
+        for i in axes(vol, 1)
+            for k in axes(vol, 3)
+                @views transform!(vol[i, :, k], output[i, :, k], fill!(v[i, :, k], 1), fill!(z[i, :, k], 1))
+            end
+        end
+
+        copyto!(vol, output)  # make this pass's result the input of the next pass
+
+        # Pass 3: along dimension 1 (slices vol[:, j, k]); v and z slices are reset to 1
+        for j in axes(vol, 2)
+            for k in axes(vol, 3)
+                @views transform!(vol[:, j, k], output[:, j, k], fill!(v[:, j, k], 1), fill!(z[:, j, k], 1))
+            end
+        end
+    end
 end
 
-# GPU (3D)
-@kernel function _first_pass_3D!(f, out, s2)
-	dim1, dim2, dim3 = @index(Global, NTuple)
-	# 1D along dimension 2
-	if f[dim1, dim2, dim3] < 0.5f0
-		ct = 1
-		curr_l = min(dim2 - 1, s2 - dim2)
-		finished = false
-		while !finished && ct <= curr_l
-			if f[dim1, dim2-ct, dim3] > 0.5f0 || f[dim1, dim2+ct, dim3] > 0.5f0
-				out[dim1, dim2, dim3] = ct * ct
-				finished = true
-			end
-			ct += 1
-		end
-		while !finished && ct < dim2
-			if f[dim1, dim2-ct, dim3] > 0.5f0
-				out[dim1, dim2, dim3] = ct * ct
-				finished = true
-			end
-			ct += 1
-		end
-		while !finished && dim2 + ct <= s2
-			if f[dim1, dim2+ct, dim3] > 0.5f0
-				out[dim1, dim2, dim3] = ct * ct
-				finished = true
-			end
-			ct += 1
-		end
-		if !finished
-			out[dim1, dim2, dim3] = 1.0f10
-		end
-	else
-		out[dim1, dim2, dim3] = 0.0f0
-	end
-end
+function transform!(img::AbstractGPUMatrix, output, v, z)
+    AK.foreachindex(@view(img[:, 1])) do i
+        @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
+    end
 
-@kernel function _second_pass_3D!(org, out, s1)
-	dim1, dim2, dim3 = @index(Global, NTuple)
-	# 2D along dimension 1
-	ct = 1
-	curr_l = sqrt(out[dim1, dim2, dim3])
-	while ct < curr_l && dim1 + ct <= s1
-		temp = muladd(ct, ct, org[dim1+ct, dim2, dim3])
-		if temp < out[dim1, dim2, dim3]
-			out[dim1, dim2, dim3] = temp
-			curr_l = sqrt(temp)
-		end
-		ct += 1
-	end
-	ct = 1
-	while ct < curr_l && dim1 - ct > 0
-		temp = muladd(ct, ct, org[dim1-ct, dim2, dim3])
-		if temp < out[dim1, dim2, dim3]
-			out[dim1, dim2, dim3] = temp
-			curr_l = sqrt(temp)
-		end
-		ct += 1
-	end
-end
+    copyto!(img, output)
 
-@kernel function _third_pass_3D!(org, out, s3)
-	dim1, dim2, dim3 = @index(Global, NTuple)
-	# 2D along dimension 3
-	ct = 1
-	curr_l = sqrt(out[dim1, dim2, dim3])
-	while ct < curr_l && dim3 + ct <= s3
-		temp = muladd(ct, ct, org[dim1, dim2, dim3+ct])
-		if temp < out[dim1, dim2, dim3]
-			out[dim1, dim2, dim3] = temp
-			curr_l = sqrt(temp)
-		end
-		ct += 1
-	end
-	ct = 1
-	while ct < curr_l && ct < dim3
-		temp = muladd(ct, ct, org[dim1, dim2, dim3-ct])
-		if temp < out[dim1, dim2, dim3]
-			out[dim1, dim2, dim3] = temp
-			curr_l = sqrt(temp)
-		end
-		ct += 1
-	end
+    AK.foreachindex(@view(img[1, :])) do j
+        @views transform!(img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
+    end
 end
 
-function transform!(img::AbstractGPUArray, output)
-	backend = get_backend(img)
-	s1, s2, s3 = size(img)
-	kernel1! = _first_pass_3D!(backend)
-	kernel2! = _second_pass_3D!(backend)
-	kernel3! = _third_pass_3D!(backend)
-
-	kernel1!(img, output, s2, ndrange = (s1, s2, s3))
-	copyto!(img, output)
-
-	kernel2!(img, output, s1, ndrange = (s1, s2, s3))
-	copyto!(img, output)
-
-	kernel3!(img, output, s3, ndrange = (s1, s2, s3))
-
-	KernelAbstractions.synchronize(backend)
+function transform!(vol::AbstractGPUArray{T,3}, output, v, z) where {T}
+    # Pass 1: 1D transforms along dimension 3 (slices vol[i, j, :]); foreachindex covers dimension 1
+    AK.foreachindex(@view(vol[:, 1, 1])) do i
+        for j in axes(vol, 2)
+            @views transform!(vol[i, j, :], output[i, j, :], v[i, j, :], z[i, j, :])
+        end
+    end
+
+    copyto!(vol, output)  # make this pass's result the input of the next pass
+
+    # Pass 2: along dimension 2 (slices vol[i, :, k]); v and z slices are reset to 1
+    AK.foreachindex(@view(vol[:, 1, 1])) do i
+        for k in axes(vol, 3)
+            @views transform!(vol[i, :, k], output[i, :, k], fill!(v[i, :, k], 1), fill!(z[i, :, k], 1))
+        end
+    end
+
+    copyto!(vol, output)  # make this pass's result the input of the next pass
+
+    # Pass 3: along dimension 1 (slices vol[:, j, k]); foreachindex covers dimension 2
+    AK.foreachindex(@view(vol[1, :, 1])) do j
+        for k in axes(vol, 3)
+            @views transform!(vol[:, j, k], output[:, j, k], fill!(v[:, j, k], 1), fill!(z[:, j, k], 1))
+        end
+    end
 end
 
-export transform!
-
 """
 ## `transform`
 
@@ -328,46 +217,54 @@ f_tfm = transform(f_bool)
 ```
 """
 function transform(f::AbstractVector)
-	output = similar(f, eltype(f))
-	v = ones(Int32, length(f))
-	z = ones(eltype(f), length(f) + 1)
+    output = similar(f, eltype(f))
+    v = ones(Int32, length(f))
+    z = ones(eltype(f), length(f) + 1)
 
-	transform!(f, output, v, z)
-	return output
+    transform!(f, output, v, z)
+    return output
 end
 
 # 2D
-function transform(img::AbstractMatrix; threaded = true)
-	output = similar(img, eltype(img))
-	v = ones(Int32, size(img))
-	z = ones(eltype(img), size(img) .+ 1)
+function transform(img::AbstractMatrix; threaded=true)
+    output = similar(img, eltype(img))
+    v = ones(Int32, size(img))
+    z = ones(eltype(img), size(img) .+ 1)
 
-	transform!(img, output, v, z; threaded = threaded)
-	return output
+    transform!(img, output, v, z; threaded=threaded)
+    return output
 end
 
 # 3D
-function transform(vol::AbstractArray{<:Real, 3}; threaded = true)
-	output = similar(vol, eltype(vol))
-	v = ones(Int32, size(vol))
-	z = ones(eltype(vol), size(vol) .+ 1)
+function transform(vol::AbstractArray{<:Real,3}; threaded=true)
+    output = similar(vol, eltype(vol))
+    v = ones(Int32, size(vol))
+    z = ones(eltype(vol), size(vol) .+ 1)
 
-	transform!(vol, output, v, z; threaded = threaded)
-	return output
+    transform!(vol, output, v, z; threaded=threaded)
+    return output
 end
 
 # GPU (2D)
 function transform(img::AbstractGPUMatrix)
-	output = similar(img, Float32)
-	transform!(img, output)
-	return output
+    backend = KA.get_backend(img)
+
+    output = similar(img, Float32)
+    v = KA.ones(backend, Int32, size(img))
+    z = KA.ones(backend, eltype(img), size(img) .+ 1)
+
+    transform!(img, output, v, z)
+    return output
 end
 
 # GPU (3D)
-function transform(img::AbstractGPUArray)
-	output = similar(img, Float32)
-	transform!(img, output)
-	return output
-end
+function transform(vol::AbstractGPUArray{T,3}) where {T}
+    backend = KA.get_backend(vol)
+
+    output = similar(vol, Float32)
+    v = KA.ones(backend, Int32, size(vol))
+    z = KA.ones(backend, eltype(vol), size(vol) .+ 1)
 
-export transform
+    transform!(vol, output, v, z)
+    return output
+end
diff --git a/src/utils.jl b/src/utils.jl
index 0fe2dbd..f4c00aa 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -1,4 +1,3 @@
-using KernelAbstractions
 using GPUArraysCore: AbstractGPUArray
 import AcceleratedKernels as AK
 
diff --git a/test/runtests.jl b/test/runtests.jl
index aab6648..a2f1deb 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,6 +3,14 @@ using Test
 using KernelAbstractions
 using Random
 
+#= 
+To run the tests locally, and still test a GPU backend (e.g. Metal), use the following command:
+```
+using Pkg
+Pkg.test("DistanceTransforms", test_args=["Metal"])
+```
+=#
+
 AVAILABLE_GPU_BACKENDS = ["CUDA", "AMDGPU", "Metal", "oneAPI"]
 TEST_BACKENDS = filter(x->x in [AVAILABLE_GPU_BACKENDS; "CPU"], ARGS)
 
diff --git a/test/transform.jl b/test/transform.jl
index 7f45af5..027b185 100644
--- a/test/transform.jl
+++ b/test/transform.jl
@@ -3,206 +3,210 @@ using ImageMorphology: distance_transform, feature_transform
 test_transform(img) = distance_transform(feature_transform(Bool.(img)))
 
 @testset "transform!" begin
-	@testset "1D transform!" begin
-		for n in [10, 50, 100]
-			for test_idx in 1:20
-				img = rand([0f0, 1f0], n)
-				
-				img_bool = boolean_indicator(img)
-				output = similar(img, Float32)
-				v = ones(Int32, size(img))
-				z = ones(Float32, size(img) .+ 1)
-				
-				transform!(img_bool, output, v, z)
-				img_test = test_transform(img) .^ 2
-				
-				@test Array(output) ≈ img_test
-			end
-		end
-	end
-	
-	@testset "2D transform!" begin
-		for n in [10, 50, 100]
-			for test_idx in 1:20
-				img = rand([0f0, 1f0], n, n)
-				
-				img_bool = boolean_indicator(img)
-				output = similar(img, Float32)
-				v = ones(Int32, size(img))
-				z = ones(Float32, size(img) .+ 1)
-				
-				transform!(img_bool, output, v, z)
-				img_test = test_transform(img) .^ 2
-				
-				@test Array(output) ≈ img_test
-			end
-
-			# non-threaded
-			for test_idx in 1:20
-				img = rand([0f0, 1f0], n, n)
-				
-				img_bool = boolean_indicator(img)
-				output = similar(img, Float32)
-				v = ones(Int32, size(img))
-				z = ones(Float32, size(img) .+ 1)
-				
-				transform!(img_bool, output, v, z; threaded = false)
-				img_test = test_transform(img) .^ 2
-				
-				@test Array(output) ≈ img_test
-			end
-		end
-	end
-	
-	@testset "3D transform!" begin
-		for n in [10, 100]
-			for test_idx in 1:5
-				img = rand([0f0, 1f0], n, n, n)
-				
-				img_bool = boolean_indicator(img)
-				output = similar(img, Float32)
-				v = ones(Int32, size(img))
-				z = ones(Float32, size(img) .+ 1)
-				
-				transform!(img_bool, output, v, z)
-				img_test = test_transform(img) .^ 2
-				
-				@test Array(output) ≈ img_test
-			end
-			
-			# non-threaded
-			for test_idx in 1:5
-				img = rand([0f0, 1f0], n, n, n)
-				
-				img_bool = boolean_indicator(img)
-				output = similar(img, Float32)
-				v = ones(Int32, size(img))
-				z = ones(Float32, size(img) .+ 1)
-				
-				transform!(img_bool, output, v, z; threaded = false)
-				img_test = test_transform(img) .^ 2
-				
-				@test Array(output) ≈ img_test
-			end
-		end
-	end
-
-	if dev != Array
-		@testset "2D GPU transform!" begin
-			for n in [10, 50, 100]
-				for test_idx in 1:20
-					img = rand([0f0, 1f0], n, n)
-					
-					img_gpu = dev(copy(img))
-					output = similar(img_gpu)
-					
-					transform!(img_gpu, output)
-					img_test = test_transform(img) .^ 2
-					
-					@test Array(output) ≈ img_test
-				end
-			end
-		end
-		@testset "3D GPU transform!" begin
-			for n in [10, 50, 100]
-				for test_idx in 1:20
-					img = rand([0f0, 1f0], n, n, n)
-					
-					img_gpu = dev(copy(img))
-					output = similar(img_gpu)
-					
-					transform!(img_gpu, output)
-					img_test = test_transform(img) .^ 2
-					
-					@test Array(output) ≈ img_test
-				end
-			end
-		end
-	else
-		@info "No GPU available, skipping tests"
-	end
+    @testset "1D transform!" begin
+        for n in [10, 50, 100]
+            for test_idx in 1:20
+                img = rand([0.0f0, 1.0f0], n)
+
+                img_bool = boolean_indicator(img)
+                output = similar(img, Float32)
+                v = ones(Int32, size(img))
+                z = ones(Float32, size(img) .+ 1)
+
+                transform!(img_bool, output, v, z)
+                img_test = test_transform(img) .^ 2
+
+                @test Array(output) ≈ img_test
+            end
+        end
+    end
+
+    @testset "2D transform!" begin
+        for n in [10, 50, 100]
+            for test_idx in 1:20
+                img = rand([0.0f0, 1.0f0], n, n)
+
+                img_bool = boolean_indicator(img)
+                output = similar(img, Float32)
+                v = ones(Int32, size(img))
+                z = ones(Float32, size(img) .+ 1)
+
+                transform!(img_bool, output, v, z)
+                img_test = test_transform(img) .^ 2
+
+                @test Array(output) ≈ img_test
+            end
+
+            # non-threaded
+            for test_idx in 1:20
+                img = rand([0.0f0, 1.0f0], n, n)
+
+                img_bool = boolean_indicator(img)
+                output = similar(img, Float32)
+                v = ones(Int32, size(img))
+                z = ones(Float32, size(img) .+ 1)
+
+                transform!(img_bool, output, v, z; threaded=false)
+                img_test = test_transform(img) .^ 2
+
+                @test Array(output) ≈ img_test
+            end
+        end
+    end
+
+    @testset "3D transform!" begin
+        for n in [10, 100]
+            for test_idx in 1:5
+                img = rand([0.0f0, 1.0f0], n, n, n)
+
+                img_bool = boolean_indicator(img)
+                output = similar(img, Float32)
+                v = ones(Int32, size(img))
+                z = ones(Float32, size(img) .+ 1)
+
+                transform!(img_bool, output, v, z)
+                img_test = test_transform(img) .^ 2
+
+                @test Array(output) ≈ img_test
+            end
+
+            # non-threaded
+            for test_idx in 1:5
+                img = rand([0.0f0, 1.0f0], n, n, n)
+
+                img_bool = boolean_indicator(img)
+                output = similar(img, Float32)
+                v = ones(Int32, size(img))
+                z = ones(Float32, size(img) .+ 1)
+
+                transform!(img_bool, output, v, z; threaded=false)
+                img_test = test_transform(img) .^ 2
+
+                @test Array(output) ≈ img_test
+            end
+        end
+    end
+
+    if dev != Array
+        @testset "2D GPU transform!" begin
+            for n in [10, 50, 100]
+                for test_idx in 1:20
+                    img = dev(rand([0.0f0, 1.0f0], n, n))
+
+                    img_bool = boolean_indicator(img)
+                    output = similar(img, Float32)
+                    v = dev(ones(Int32, size(img)))
+                    z = dev(ones(Float32, size(img) .+ 1))
+
+                    transform!(img_bool, output, v, z)
+                    img_test = test_transform(Array(img)) .^ 2
+
+                    @test Array(output) ≈ img_test
+                end
+            end
+        end
+        @testset "3D GPU transform!" begin
+            for n in [10, 50, 100]
+                for test_idx in 1:20
+                    img = dev(rand([0.0f0, 1.0f0], n, n, n))
+
+                    img_bool = boolean_indicator(img)
+                    output = similar(img, Float32)
+                    v = dev(ones(Int32, size(img)))
+                    z = dev(ones(Float32, size(img) .+ 1))
+
+                    transform!(img_bool, output, v, z)
+                    img_test = test_transform(Array(img)) .^ 2
+
+                    @test Array(output) ≈ img_test
+                end
+            end
+        end
+    else
+        @info "No GPU available, skipping tests"
+    end
 end
 
 @testset "transform" begin
-	@testset "1D transform" begin
-		for n in [10, 50, 100]
-			for test_idx in 1:20
-				img = rand([0.0f0, 1.0f0], n)
-				img_bool = boolean_indicator(img)
-				output = transform(img_bool)
-				img_test = test_transform(img) .^ 2
-				@test Array(output) ≈ img_test
-			end
-		end
-	end
-	
-	@testset "2D transform" begin
-		for n in [10, 50, 100]
-			for test_idx in 1:20
-				img = rand([0.0f0, 1.0f0], n, n)
-				img_bool = boolean_indicator(img)
-				output = transform(img_bool)
-				img_test = test_transform(img) .^ 2
-				@test Array(output) ≈ img_test
-			end
-
-			# non-threaded
-			for test_idx in 1:20
-				img = rand([0.0f0, 1.0f0], n, n)
-				img_bool = boolean_indicator(img)
-				output = transform(img_bool; threaded = false)
-				img_test = test_transform(img) .^ 2
-				@test Array(output) ≈ img_test
-			end
-		end
-	end
-	
-	@testset "3D transform" begin
-		for n in [10, 100]
-			for test_idx in 1:5
-				img = rand([0.0f0, 1.0f0], n, n, n)
-				img_bool = boolean_indicator(img)
-				output = transform(img_bool)
-				img_test = test_transform(img) .^ 2
-				@test Array(output) ≈ img_test
-			end
-			
-			# non-threaded
-			for test_idx in 1:5
-				img = rand([0.0f0, 1.0f0], n, n, n)
-				img_bool = boolean_indicator(img)
-				output = transform(img_bool; threaded = false)
-				img_test = test_transform(img) .^ 2
-				@test Array(output) ≈ img_test
-			end
-		end
-	end
-	
-	if dev != Array
-		@testset "2D GPU transform" begin
-			for n in [10, 50, 100]
-				for test_idx in 1:20
-					img = rand([0.0f0, 1.0f0], n, n)
-					img_gpu = dev(img)
-					output = transform(img_gpu)
-					img_test = test_transform(img) .^ 2
-					@test Array(output) ≈ img_test
-				end
-			end
-		end
-		
-		@testset "3D GPU transform" begin
-			for n in [10, 50, 100]
-				for test_idx in 1:20
-					img = rand([0.0f0, 1.0f0], n, n, n)
-					img_gpu = dev(img)
-					output = transform(img_gpu)
-					img_test = test_transform(img) .^ 2
-					@test Array(output) ≈ img_test
-				end
-			end
-		end
-	else
-		@info "No GPU available, skipping tests"
-	end
+    @testset "1D transform" begin
+        for n in [10, 50, 100]
+            for test_idx in 1:20
+                img = rand([0.0f0, 1.0f0], n)
+                img_bool = boolean_indicator(img)
+                output = transform(img_bool)
+                img_test = test_transform(img) .^ 2
+                @test Array(output) ≈ img_test
+            end
+        end
+    end
+
+    @testset "2D transform" begin
+        for n in [10, 50, 100]
+            for test_idx in 1:20
+                img = rand([0.0f0, 1.0f0], n, n)
+                img_bool = boolean_indicator(img)
+                output = transform(img_bool)
+                img_test = test_transform(img) .^ 2
+                @test Array(output) ≈ img_test
+            end
+
+            # non-threaded
+            for test_idx in 1:20
+                img = rand([0.0f0, 1.0f0], n, n)
+                img_bool = boolean_indicator(img)
+                output = transform(img_bool; threaded=false)
+                img_test = test_transform(img) .^ 2
+                @test Array(output) ≈ img_test
+            end
+        end
+    end
+
+    @testset "3D transform" begin
+        for n in [10, 100]
+            for test_idx in 1:5
+                img = rand([0.0f0, 1.0f0], n, n, n)
+                img_bool = boolean_indicator(img)
+                output = transform(img_bool)
+                img_test = test_transform(img) .^ 2
+                @test Array(output) ≈ img_test
+            end
+
+            # non-threaded
+            for test_idx in 1:5
+                img = rand([0.0f0, 1.0f0], n, n, n)
+                img_bool = boolean_indicator(img)
+                output = transform(img_bool; threaded=false)
+                img_test = test_transform(img) .^ 2
+                @test Array(output) ≈ img_test
+            end
+        end
+    end
+
+    if dev != Array
+        @testset "2D GPU transform" begin
+            for n in [10, 50, 100]
+                for test_idx in 1:20
+                    img = dev(rand([0.0f0, 1.0f0], n, n))
+                    img_bool = boolean_indicator(img)
+                    output = transform(img_bool)
+                    img_test = test_transform(Array(img)) .^ 2
+                    @test Array(output) ≈ img_test
+                end
+            end
+        end
+
+        @testset "3D GPU transform" begin
+            for n in [10, 50, 100]
+                for test_idx in 1:20
+                    img = dev(rand([0.0f0, 1.0f0], n, n, n))
+                    img_bool = boolean_indicator(img)
+                    output = transform(img_bool)
+                    img_test = test_transform(Array(img)) .^ 2
+                    @test Array(output) ≈ img_test
+                end
+            end
+        end
+    else
+        @info "No GPU available, skipping tests"
+    end
 end

From 0944221c03b741134c1789bcc5483fcdca07676d Mon Sep 17 00:00:00 2001
From: Dale Black <djblack@uci.edu>
Date: Thu, 26 Dec 2024 10:49:15 -0800
Subject: [PATCH 4/5] up CI

---
 .github/workflows/CI.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index aa6a86b..f9255e4 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -12,8 +12,8 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - "1.9"
           - "1.10"
+          - "1.11"
         os:
           - ubuntu-latest
           # - macOS-latest (can't use until GitHub upgrades to m-series macs)

From c3a6f45777f764e6e35ebf3deb05ef7ff1492c76 Mon Sep 17 00:00:00 2001
From: Dale Black <djblack@uci.edu>
Date: Thu, 26 Dec 2024 10:50:08 -0800
Subject: [PATCH 5/5] up CI cache

---
 .github/workflows/CI.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index f9255e4..8ee4f39 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -26,7 +26,7 @@ jobs:
         with:
           version: ${{ matrix.version }}
           arch: ${{ matrix.arch }}
-      - uses: actions/cache@v1
+      - uses: actions/cache@v4
         env:
           cache-name: cache-artifacts
         with: