diff --git a/src/transform.jl b/src/transform.jl
index ec3ee8d..8bc32bd 100644
--- a/src/transform.jl
+++ b/src/transform.jl
@@ -1,5 +1,8 @@
+import KernelAbstractions as KA
+import AcceleratedKernels as AK
 using GPUArraysCore: AbstractGPUVector, AbstractGPUMatrix, AbstractGPUArray
-using KernelAbstractions
+
+export transform!, transform
 
 """
 ## `transform!`
@@ -37,267 +40,153 @@ transform!(f_bool, output, v, z)
 ```
 """
 function transform!(f::AbstractVector, output, v, z)
-	z[1] = -Inf32
-	z[2] = Inf32
-
-	k = 1
-	@inbounds for q in 2:length(f)
-		s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2 * q - 2 * v[k])
-		while s ≤ z[k]
-			k -= 1
-			s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2 * q - 2 * v[k])
-		end
-		k += 1
-		v[k] = q
-		z[k] = s
-		z[k+1] = Inf32
-	end
-
-	k = 1
-	@inbounds for q in 1:length(f)
-		while z[k+1] < q
-			k += 1
-		end
-		output[q] = (q - v[k])^2 + f[v[k]]
-	end
+    z[1] = -Inf32
+    z[2] = Inf32
+
+    k = 1
+    @inbounds for q in 2:length(f)
+        s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2 * q - 2 * v[k])
+        while s ≤ z[k]
+            k -= 1
+            s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2 * q - 2 * v[k])
+        end
+        k += 1
+        v[k] = q
+        z[k] = s
+        z[k+1] = Inf32
+    end
+
+    k = 1
+    @inbounds for q in 1:length(f)
+        while z[k+1] < q
+            k += 1
+        end
+        output[q] = (q - v[k])^2 + f[v[k]]
+    end
 end
 
 # 2D
-function transform!(img::AbstractMatrix, output, v, z; threaded = true)
-	if threaded
-		Threads.@threads for i in CartesianIndices(@view(img[:, 1]))
-			@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
-		end
-
-		copyto!(img, output)
-		Threads.@threads for j in CartesianIndices(@view(img[1, :]))
-			@views transform!(
-				img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1),
-			)
-		end
-	else
-		for i in CartesianIndices(@view(img[:, 1]))
-			@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
-		end
-
-		copyto!(img, output)
-		for j in CartesianIndices(@view(img[1, :]))
-			@views transform!(
-				img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1),
-			)
-		end
-	end
-end
-
-# 3D
-function transform!(vol::AbstractArray{<:Real, 3}, output, v, z; threaded = true)
-	if threaded
-		Threads.@threads for i in CartesianIndices(@view(vol[:, 1, 1]))
-			@views transform!(vol[i, :, :], output[i, :, :], v[i, :, :], z[i, :, :])
-		end
-
-		copyto!(vol, output)
-		Threads.@threads for idx in CartesianIndices(@view(vol[1, :, :]))
-			j, k = Tuple(idx)
-			@views transform!(
-				vol[:, j, k], output[:, j, k], fill!(v[:, j, k], 1), fill!(z[:, j, k], 1),
-			)
-		end
-	else
-		for i in CartesianIndices(@view(vol[:, 1, 1]))
-			@views transform!(vol[i, :, :], output[i, :, :], v[i, :, :], z[i, :, :])
-		end
-
-		copyto!(vol, output)
-		for idx in CartesianIndices(@view(vol[1, :, :]))
-			j, k = Tuple(idx)
-			@views transform!(
-				vol[:, j, k], output[:, j, k], fill!(v[:, j, k], 1), fill!(z[:, j, k], 1),
-			)
-		end
-	end
-end
-
-# GPU (2D)
-@kernel function _first_pass_2D!(f, out, s2)
-	row, col = @index(Global, NTuple)
-
-	if f[row, col] < 0.5f0
-		ct = 1
-		curr_l = min(col - 1, s2 - col)
-		finished = false
-		while !finished && ct <= curr_l
-			if f[row, col-ct] > 0.5f0 || f[row, col+ct] > 0.5f0
-				out[row, col] = ct * ct
-				finished = true
-			end
-			ct += 1
-		end
-		while !finished && ct < col
-			if f[row, col-ct] > 0.5f0
-				out[row, col] = ct * ct
-				finished = true
-			end
-			ct += 1
-		end
-		while !finished && col + ct <= s2
-			if f[row, col+ct] > 0.5f0
-				out[row, col] = ct * ct
-				finished = true
-			end
-			ct += 1
-		end
-		if !finished
-			out[row, col] = 1.0f10
-		end
-	else
-		out[row, col] = 0.0f0
-	end
-end
-
-@kernel function _second_pass_2D!(org, out, s1, s2)
-	row, col = @index(Global, NTuple)
-
-	ct = 1
-	curr_l = sqrt(out[row, col])
-	while ct < curr_l && row + ct <= s1
-		temp = muladd(ct, ct, org[row+ct, col])
-		if temp < out[row, col]
-			out[row, col] = temp
-			curr_l = sqrt(temp)
-		end
-		ct += 1
-	end
-
-	ct = 1
-	while ct < curr_l && row > ct
-		temp = muladd(ct, ct, org[row-ct, col])
-		if temp < out[row, col]
-			out[row, col] = temp
-			curr_l = sqrt(temp)
-		end
-		ct += 1
-	end
+function transform!(img::AbstractMatrix, output, v, z; threaded=true)
+    if threaded
+        Threads.@threads for i in eachindex(@view(img[:, 1]))
+            @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
+        end
+
+        copyto!(img, output)
+
+        Threads.@threads for j in eachindex(@view(img[1, :]))
+            @views transform!(
+                img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
+            )
+        end
+    else
+        for i in eachindex(@view(img[:, 1]))
+            @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
+        end
+
+        copyto!(img, output)
+		
+        for j in eachindex(@view(img[1, :]))
+            @views transform!(
+                img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
+            )
+        end
+    end
 end
 
-function transform!(img::AbstractGPUMatrix, output)
-	s1, s2 = size(img)
-	backend = get_backend(img)
-	kernel1! = _first_pass_2D!(backend)
-	kernel2! = _second_pass_2D!(backend)
-
-	kernel1!(img, output, s2, ndrange = (s1, s2))
-	copyto!(img, output)
-
-	kernel2!(img, output, s1, s2, ndrange = (s1, s2))
-	KernelAbstractions.synchronize(backend)
+function transform!(vol::AbstractArray{<:Real,3}, output, v, z; threaded=true)
+    if threaded
+        # X dimension
+        Threads.@threads for i in axes(vol, 1)
+            for j in axes(vol, 2)
+                @views transform!(vol[i, j, :], output[i, j, :], v[i, j, :], z[i, j, :])
+            end
+        end
+
+        copyto!(vol, output)
+
+        # Y dimension 
+        Threads.@threads for i in axes(vol, 1)
+            for k in axes(vol, 3)
+                @views transform!(vol[i, :, k], output[i, :, k], fill!(v[i, :, k], 1), fill!(z[i, :, k], 1))
+            end
+        end
+
+        copyto!(vol, output)
+
+        # Z dimension
+        Threads.@threads for j in axes(vol, 2)
+            for k in axes(vol, 3)
+                @views transform!(vol[:, j, k], output[:, j, k], fill!(v[:, j, k], 1), fill!(z[:, j, k], 1))
+            end
+        end
+    else
+        # X dimension
+        for i in axes(vol, 1)
+            for j in axes(vol, 2)
+                @views transform!(vol[i, j, :], output[i, j, :], v[i, j, :], z[i, j, :])
+            end
+        end
+
+        copyto!(vol, output)
+
+        # Y dimension
+        for i in axes(vol, 1)
+            for k in axes(vol, 3)
+                @views transform!(vol[i, :, k], output[i, :, k], fill!(v[i, :, k], 1), fill!(z[i, :, k], 1))
+            end
+        end
+
+        copyto!(vol, output)
+
+        # Z dimension
+        for j in axes(vol, 2)
+            for k in axes(vol, 3)
+                @views transform!(vol[:, j, k], output[:, j, k], fill!(v[:, j, k], 1), fill!(z[:, j, k], 1))
+            end
+        end
+    end
 end
 
-# GPU (3D)
-@kernel function _first_pass_3D!(f, out, s2)
-	dim1, dim2, dim3 = @index(Global, NTuple)
-	# 1D along dimension 2
-	if f[dim1, dim2, dim3] < 0.5f0
-		ct = 1
-		curr_l = min(dim2 - 1, s2 - dim2)
-		finished = false
-		while !finished && ct <= curr_l
-			if f[dim1, dim2-ct, dim3] > 0.5f0 || f[dim1, dim2+ct, dim3] > 0.5f0
-				out[dim1, dim2, dim3] = ct * ct
-				finished = true
-			end
-			ct += 1
-		end
-		while !finished && ct < dim2
-			if f[dim1, dim2-ct, dim3] > 0.5f0
-				out[dim1, dim2, dim3] = ct * ct
-				finished = true
-			end
-			ct += 1
-		end
-		while !finished && dim2 + ct <= s2
-			if f[dim1, dim2+ct, dim3] > 0.5f0
-				out[dim1, dim2, dim3] = ct * ct
-				finished = true
-			end
-			ct += 1
-		end
-		if !finished
-			out[dim1, dim2, dim3] = 1.0f10
-		end
-	else
-		out[dim1, dim2, dim3] = 0.0f0
-	end
-end
+function transform!(img::AbstractGPUMatrix, output, v, z)
+    AK.foreachindex(@view(img[:, 1])) do i
+        @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
+    end
 
-@kernel function _second_pass_3D!(org, out, s1)
-	dim1, dim2, dim3 = @index(Global, NTuple)
-	# 2D along dimension 1
-	ct = 1
-	curr_l = sqrt(out[dim1, dim2, dim3])
-	while ct < curr_l && dim1 + ct <= s1
-		temp = muladd(ct, ct, org[dim1+ct, dim2, dim3])
-		if temp < out[dim1, dim2, dim3]
-			out[dim1, dim2, dim3] = temp
-			curr_l = sqrt(temp)
-		end
-		ct += 1
-	end
-	ct = 1
-	while ct < curr_l && dim1 - ct > 0
-		temp = muladd(ct, ct, org[dim1-ct, dim2, dim3])
-		if temp < out[dim1, dim2, dim3]
-			out[dim1, dim2, dim3] = temp
-			curr_l = sqrt(temp)
-		end
-		ct += 1
-	end
-end
+    copyto!(img, output)
 
-@kernel function _third_pass_3D!(org, out, s3)
-	dim1, dim2, dim3 = @index(Global, NTuple)
-	# 2D along dimension 3
-	ct = 1
-	curr_l = sqrt(out[dim1, dim2, dim3])
-	while ct < curr_l && dim3 + ct <= s3
-		temp = muladd(ct, ct, org[dim1, dim2, dim3+ct])
-		if temp < out[dim1, dim2, dim3]
-			out[dim1, dim2, dim3] = temp
-			curr_l = sqrt(temp)
-		end
-		ct += 1
-	end
-	ct = 1
-	while ct < curr_l && ct < dim3
-		temp = muladd(ct, ct, org[dim1, dim2, dim3-ct])
-		if temp < out[dim1, dim2, dim3]
-			out[dim1, dim2, dim3] = temp
-			curr_l = sqrt(temp)
-		end
-		ct += 1
-	end
+    AK.foreachindex(@view(img[1, :])) do j
+        @views transform!(img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
+    end
 end
 
-function transform!(img::AbstractGPUArray, output)
-	backend = get_backend(img)
-	s1, s2, s3 = size(img)
-	kernel1! = _first_pass_3D!(backend)
-	kernel2! = _second_pass_3D!(backend)
-	kernel3! = _third_pass_3D!(backend)
-
-	kernel1!(img, output, s2, ndrange = (s1, s2, s3))
-	copyto!(img, output)
-
-	kernel2!(img, output, s1, ndrange = (s1, s2, s3))
-	copyto!(img, output)
-
-	kernel3!(img, output, s3, ndrange = (s1, s2, s3))
-
-	KernelAbstractions.synchronize(backend)
+function transform!(vol::AbstractGPUArray{T,3}, output, v, z) where {T}
+    # X dimension
+    AK.foreachindex(@view(vol[:, 1, 1])) do i
+        for j in axes(vol, 2)
+            @views transform!(vol[i, j, :], output[i, j, :], v[i, j, :], z[i, j, :])
+        end
+    end
+
+    copyto!(vol, output)
+
+    # Y dimension
+    AK.foreachindex(@view(vol[:, 1, 1])) do i
+        for k in axes(vol, 3)
+            @views transform!(vol[i, :, k], output[i, :, k], fill!(v[i, :, k], 1), fill!(z[i, :, k], 1))
+        end
+    end
+
+    copyto!(vol, output)
+
+    # Z dimension 
+    AK.foreachindex(@view(vol[1, :, 1])) do j
+        for k in axes(vol, 3)
+            @views transform!(vol[:, j, k], output[:, j, k], fill!(v[:, j, k], 1), fill!(z[:, j, k], 1))
+        end
+    end
 end
 
-export transform!
-
 """
 ## `transform`
 
@@ -328,46 +217,54 @@ f_tfm = transform(f_bool)
 ```
 """
 function transform(f::AbstractVector)
-	output = similar(f, eltype(f))
-	v = ones(Int32, length(f))
-	z = ones(eltype(f), length(f) + 1)
+    output = similar(f, eltype(f))
+    v = ones(Int32, length(f))
+    z = ones(eltype(f), length(f) + 1)
 
-	transform!(f, output, v, z)
-	return output
+    transform!(f, output, v, z)
+    return output
 end
 
 # 2D
-function transform(img::AbstractMatrix; threaded = true)
-	output = similar(img, eltype(img))
-	v = ones(Int32, size(img))
-	z = ones(eltype(img), size(img) .+ 1)
+function transform(img::AbstractMatrix; threaded=true)
+    output = similar(img, eltype(img))
+    v = ones(Int32, size(img))
+    z = ones(eltype(img), size(img) .+ 1)
 
-	transform!(img, output, v, z; threaded = threaded)
-	return output
+    transform!(img, output, v, z; threaded=threaded)
+    return output
 end
 
 # 3D
-function transform(vol::AbstractArray{<:Real, 3}; threaded = true)
-	output = similar(vol, eltype(vol))
-	v = ones(Int32, size(vol))
-	z = ones(eltype(vol), size(vol) .+ 1)
+function transform(vol::AbstractArray{<:Real,3}; threaded=true)
+    output = similar(vol, eltype(vol))
+    v = ones(Int32, size(vol))
+    z = ones(eltype(vol), size(vol) .+ 1)
 
-	transform!(vol, output, v, z; threaded = threaded)
-	return output
+    transform!(vol, output, v, z; threaded=threaded)
+    return output
 end
 
 # GPU (2D)
 function transform(img::AbstractGPUMatrix)
-	output = similar(img, Float32)
-	transform!(img, output)
-	return output
+    backend = KA.get_backend(img)
+
+    output = similar(img, Float32)
+    v = KA.ones(backend, Int32, size(img))
+    z = KA.ones(backend, eltype(img), size(img) .+ 1)
+
+    transform!(img, output, v, z)
+    return output
 end
 
 # GPU (3D)
-function transform(img::AbstractGPUArray)
-	output = similar(img, Float32)
-	transform!(img, output)
-	return output
-end
+function transform(vol::AbstractGPUArray{T,3}) where {T}
+    backend = KA.get_backend(vol)
+
+    output = similar(vol, Float32)
+    v = KA.ones(backend, Int32, size(vol))
+    z = KA.ones(backend, eltype(vol), size(vol) .+ 1)
 
-export transform
+    transform!(vol, output, v, z)
+    return output
+end
diff --git a/src/utils.jl b/src/utils.jl
index 0fe2dbd..f4c00aa 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -1,4 +1,3 @@
-using KernelAbstractions
 using GPUArraysCore: AbstractGPUArray
 import AcceleratedKernels as AK
 
diff --git a/test/runtests.jl b/test/runtests.jl
index aab6648..a2f1deb 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,6 +3,14 @@ using Test
 using KernelAbstractions
 using Random
 
+#= 
+To run the tests locally, and still test a GPU backend (e.g. Metal), use the following command:
+```
+using Pkg
+Pkg.test("DistanceTransforms", test_args=["Metal"])
+```
+=#
+
 AVAILABLE_GPU_BACKENDS = ["CUDA", "AMDGPU", "Metal", "oneAPI"]
 TEST_BACKENDS = filter(x->x in [AVAILABLE_GPU_BACKENDS; "CPU"], ARGS)
 
diff --git a/test/transform.jl b/test/transform.jl
index 7f45af5..027b185 100644
--- a/test/transform.jl
+++ b/test/transform.jl
@@ -3,206 +3,210 @@ using ImageMorphology: distance_transform, feature_transform
 test_transform(img) = distance_transform(feature_transform(Bool.(img)))
 
 @testset "transform!" begin
-	@testset "1D transform!" begin
-		for n in [10, 50, 100]
-			for test_idx in 1:20
-				img = rand([0f0, 1f0], n)
-				
-				img_bool = boolean_indicator(img)
-				output = similar(img, Float32)
-				v = ones(Int32, size(img))
-				z = ones(Float32, size(img) .+ 1)
-				
-				transform!(img_bool, output, v, z)
-				img_test = test_transform(img) .^ 2
-				
-				@test Array(output) ≈ img_test
-			end
-		end
-	end
-	
-	@testset "2D transform!" begin
-		for n in [10, 50, 100]
-			for test_idx in 1:20
-				img = rand([0f0, 1f0], n, n)
-				
-				img_bool = boolean_indicator(img)
-				output = similar(img, Float32)
-				v = ones(Int32, size(img))
-				z = ones(Float32, size(img) .+ 1)
-				
-				transform!(img_bool, output, v, z)
-				img_test = test_transform(img) .^ 2
-				
-				@test Array(output) ≈ img_test
-			end
-
-			# non-threaded
-			for test_idx in 1:20
-				img = rand([0f0, 1f0], n, n)
-				
-				img_bool = boolean_indicator(img)
-				output = similar(img, Float32)
-				v = ones(Int32, size(img))
-				z = ones(Float32, size(img) .+ 1)
-				
-				transform!(img_bool, output, v, z; threaded = false)
-				img_test = test_transform(img) .^ 2
-				
-				@test Array(output) ≈ img_test
-			end
-		end
-	end
-	
-	@testset "3D transform!" begin
-		for n in [10, 100]
-			for test_idx in 1:5
-				img = rand([0f0, 1f0], n, n, n)
-				
-				img_bool = boolean_indicator(img)
-				output = similar(img, Float32)
-				v = ones(Int32, size(img))
-				z = ones(Float32, size(img) .+ 1)
-				
-				transform!(img_bool, output, v, z)
-				img_test = test_transform(img) .^ 2
-				
-				@test Array(output) ≈ img_test
-			end
-			
-			# non-threaded
-			for test_idx in 1:5
-				img = rand([0f0, 1f0], n, n, n)
-				
-				img_bool = boolean_indicator(img)
-				output = similar(img, Float32)
-				v = ones(Int32, size(img))
-				z = ones(Float32, size(img) .+ 1)
-				
-				transform!(img_bool, output, v, z; threaded = false)
-				img_test = test_transform(img) .^ 2
-				
-				@test Array(output) ≈ img_test
-			end
-		end
-	end
-
-	if dev != Array
-		@testset "2D GPU transform!" begin
-			for n in [10, 50, 100]
-				for test_idx in 1:20
-					img = rand([0f0, 1f0], n, n)
-					
-					img_gpu = dev(copy(img))
-					output = similar(img_gpu)
-					
-					transform!(img_gpu, output)
-					img_test = test_transform(img) .^ 2
-					
-					@test Array(output) ≈ img_test
-				end
-			end
-		end
-		@testset "3D GPU transform!" begin
-			for n in [10, 50, 100]
-				for test_idx in 1:20
-					img = rand([0f0, 1f0], n, n, n)
-					
-					img_gpu = dev(copy(img))
-					output = similar(img_gpu)
-					
-					transform!(img_gpu, output)
-					img_test = test_transform(img) .^ 2
-					
-					@test Array(output) ≈ img_test
-				end
-			end
-		end
-	else
-		@info "No GPU available, skipping tests"
-	end
+    @testset "1D transform!" begin
+        for n in [10, 50, 100]
+            for test_idx in 1:20
+                img = rand([0.0f0, 1.0f0], n)
+
+                img_bool = boolean_indicator(img)
+                output = similar(img, Float32)
+                v = ones(Int32, size(img))
+                z = ones(Float32, size(img) .+ 1)
+
+                transform!(img_bool, output, v, z)
+                img_test = test_transform(img) .^ 2
+
+                @test Array(output) ≈ img_test
+            end
+        end
+    end
+
+    @testset "2D transform!" begin
+        for n in [10, 50, 100]
+            for test_idx in 1:20
+                img = rand([0.0f0, 1.0f0], n, n)
+
+                img_bool = boolean_indicator(img)
+                output = similar(img, Float32)
+                v = ones(Int32, size(img))
+                z = ones(Float32, size(img) .+ 1)
+
+                transform!(img_bool, output, v, z)
+                img_test = test_transform(img) .^ 2
+
+                @test Array(output) ≈ img_test
+            end
+
+            # non-threaded
+            for test_idx in 1:20
+                img = rand([0.0f0, 1.0f0], n, n)
+
+                img_bool = boolean_indicator(img)
+                output = similar(img, Float32)
+                v = ones(Int32, size(img))
+                z = ones(Float32, size(img) .+ 1)
+
+                transform!(img_bool, output, v, z; threaded=false)
+                img_test = test_transform(img) .^ 2
+
+                @test Array(output) ≈ img_test
+            end
+        end
+    end
+
+    @testset "3D transform!" begin
+        for n in [10, 100]
+            for test_idx in 1:5
+                img = rand([0.0f0, 1.0f0], n, n, n)
+
+                img_bool = boolean_indicator(img)
+                output = similar(img, Float32)
+                v = ones(Int32, size(img))
+                z = ones(Float32, size(img) .+ 1)
+
+                transform!(img_bool, output, v, z)
+                img_test = test_transform(img) .^ 2
+
+                @test Array(output) ≈ img_test
+            end
+
+            # non-threaded
+            for test_idx in 1:5
+                img = rand([0.0f0, 1.0f0], n, n, n)
+
+                img_bool = boolean_indicator(img)
+                output = similar(img, Float32)
+                v = ones(Int32, size(img))
+                z = ones(Float32, size(img) .+ 1)
+
+                transform!(img_bool, output, v, z; threaded=false)
+                img_test = test_transform(img) .^ 2
+
+                @test Array(output) ≈ img_test
+            end
+        end
+    end
+
+    if dev != Array
+        @testset "2D GPU transform!" begin
+            for n in [10, 50, 100]
+                for test_idx in 1:20
+                    img = dev(rand([0.0f0, 1.0f0], n, n))
+
+                    img_bool = boolean_indicator(img)
+                    output = similar(img, Float32)
+                    v = dev(ones(Int32, size(img)))
+                    z = dev(ones(Float32, size(img) .+ 1))
+
+                    transform!(img_bool, output, v, z)
+                    img_test = test_transform(Array(img)) .^ 2
+
+                    @test Array(output) ≈ img_test
+                end
+            end
+        end
+        @testset "3D GPU transform!" begin
+            for n in [10, 50, 100]
+                for test_idx in 1:20
+                    img = dev(rand([0.0f0, 1.0f0], n, n, n))
+
+                    img_bool = boolean_indicator(img)
+                    output = similar(img, Float32)
+                    v = dev(ones(Int32, size(img)))
+                    z = dev(ones(Float32, size(img) .+ 1))
+
+                    transform!(img_bool, output, v, z)
+                    img_test = test_transform(Array(img)) .^ 2
+
+                    @test Array(output) ≈ img_test
+                end
+            end
+        end
+    else
+        @info "No GPU available, skipping tests"
+    end
 end
 
 @testset "transform" begin
-	@testset "1D transform" begin
-		for n in [10, 50, 100]
-			for test_idx in 1:20
-				img = rand([0.0f0, 1.0f0], n)
-				img_bool = boolean_indicator(img)
-				output = transform(img_bool)
-				img_test = test_transform(img) .^ 2
-				@test Array(output) ≈ img_test
-			end
-		end
-	end
-	
-	@testset "2D transform" begin
-		for n in [10, 50, 100]
-			for test_idx in 1:20
-				img = rand([0.0f0, 1.0f0], n, n)
-				img_bool = boolean_indicator(img)
-				output = transform(img_bool)
-				img_test = test_transform(img) .^ 2
-				@test Array(output) ≈ img_test
-			end
-
-			# non-threaded
-			for test_idx in 1:20
-				img = rand([0.0f0, 1.0f0], n, n)
-				img_bool = boolean_indicator(img)
-				output = transform(img_bool; threaded = false)
-				img_test = test_transform(img) .^ 2
-				@test Array(output) ≈ img_test
-			end
-		end
-	end
-	
-	@testset "3D transform" begin
-		for n in [10, 100]
-			for test_idx in 1:5
-				img = rand([0.0f0, 1.0f0], n, n, n)
-				img_bool = boolean_indicator(img)
-				output = transform(img_bool)
-				img_test = test_transform(img) .^ 2
-				@test Array(output) ≈ img_test
-			end
-			
-			# non-threaded
-			for test_idx in 1:5
-				img = rand([0.0f0, 1.0f0], n, n, n)
-				img_bool = boolean_indicator(img)
-				output = transform(img_bool; threaded = false)
-				img_test = test_transform(img) .^ 2
-				@test Array(output) ≈ img_test
-			end
-		end
-	end
-	
-	if dev != Array
-		@testset "2D GPU transform" begin
-			for n in [10, 50, 100]
-				for test_idx in 1:20
-					img = rand([0.0f0, 1.0f0], n, n)
-					img_gpu = dev(img)
-					output = transform(img_gpu)
-					img_test = test_transform(img) .^ 2
-					@test Array(output) ≈ img_test
-				end
-			end
-		end
-		
-		@testset "3D GPU transform" begin
-			for n in [10, 50, 100]
-				for test_idx in 1:20
-					img = rand([0.0f0, 1.0f0], n, n, n)
-					img_gpu = dev(img)
-					output = transform(img_gpu)
-					img_test = test_transform(img) .^ 2
-					@test Array(output) ≈ img_test
-				end
-			end
-		end
-	else
-		@info "No GPU available, skipping tests"
-	end
+    @testset "1D transform" begin
+        for n in [10, 50, 100]
+            for test_idx in 1:20
+                img = rand([0.0f0, 1.0f0], n)
+                img_bool = boolean_indicator(img)
+                output = transform(img_bool)
+                img_test = test_transform(img) .^ 2
+                @test Array(output) ≈ img_test
+            end
+        end
+    end
+
+    @testset "2D transform" begin
+        for n in [10, 50, 100]
+            for test_idx in 1:20
+                img = rand([0.0f0, 1.0f0], n, n)
+                img_bool = boolean_indicator(img)
+                output = transform(img_bool)
+                img_test = test_transform(img) .^ 2
+                @test Array(output) ≈ img_test
+            end
+
+            # non-threaded
+            for test_idx in 1:20
+                img = rand([0.0f0, 1.0f0], n, n)
+                img_bool = boolean_indicator(img)
+                output = transform(img_bool; threaded=false)
+                img_test = test_transform(img) .^ 2
+                @test Array(output) ≈ img_test
+            end
+        end
+    end
+
+    @testset "3D transform" begin
+        for n in [10, 100]
+            for test_idx in 1:5
+                img = rand([0.0f0, 1.0f0], n, n, n)
+                img_bool = boolean_indicator(img)
+                output = transform(img_bool)
+                img_test = test_transform(img) .^ 2
+                @test Array(output) ≈ img_test
+            end
+
+            # non-threaded
+            for test_idx in 1:5
+                img = rand([0.0f0, 1.0f0], n, n, n)
+                img_bool = boolean_indicator(img)
+                output = transform(img_bool; threaded=false)
+                img_test = test_transform(img) .^ 2
+                @test Array(output) ≈ img_test
+            end
+        end
+    end
+
+    if dev != Array
+        @testset "2D GPU transform" begin
+            for n in [10, 50, 100]
+                for test_idx in 1:20
+                    img = dev(rand([0.0f0, 1.0f0], n, n))
+                    img_bool = boolean_indicator(img)
+                    output = transform(img_bool)
+                    img_test = test_transform(Array(img)) .^ 2
+                    @test Array(output) ≈ img_test
+                end
+            end
+        end
+
+        @testset "3D GPU transform" begin
+            for n in [10, 50, 100]
+                for test_idx in 1:20
+                    img = dev(rand([0.0f0, 1.0f0], n, n, n))
+                    img_bool = boolean_indicator(img)
+                    output = transform(img_bool)
+                    img_test = test_transform(Array(img)) .^ 2
+                    @test Array(output) ≈ img_test
+                end
+            end
+        end
+    else
+        @info "No GPU available, skipping tests"
+    end
 end