diff --git a/src/transform.jl b/src/transform.jl index ec3ee8d..8bc32bd 100644 --- a/src/transform.jl +++ b/src/transform.jl @@ -1,5 +1,8 @@ +import KernelAbstractions as KA +import AcceleratedKernels as AK using GPUArraysCore: AbstractGPUVector, AbstractGPUMatrix, AbstractGPUArray -using KernelAbstractions + +export transform!, transform """ ## `transform!` @@ -37,267 +40,153 @@ transform!(f_bool, output, v, z) ``` """ function transform!(f::AbstractVector, output, v, z) - z[1] = -Inf32 - z[2] = Inf32 - - k = 1 - @inbounds for q in 2:length(f) - s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2 * q - 2 * v[k]) - while s ≤ z[k] - k -= 1 - s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2 * q - 2 * v[k]) - end - k += 1 - v[k] = q - z[k] = s - z[k+1] = Inf32 - end - - k = 1 - @inbounds for q in 1:length(f) - while z[k+1] < q - k += 1 - end - output[q] = (q - v[k])^2 + f[v[k]] - end + z[1] = -Inf32 + z[2] = Inf32 + + k = 1 + @inbounds for q in 2:length(f) + s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2 * q - 2 * v[k]) + while s ≤ z[k] + k -= 1 + s = ((f[q] + q^2) - (f[v[k]] + v[k]^2)) / (2 * q - 2 * v[k]) + end + k += 1 + v[k] = q + z[k] = s + z[k+1] = Inf32 + end + + k = 1 + @inbounds for q in 1:length(f) + while z[k+1] < q + k += 1 + end + output[q] = (q - v[k])^2 + f[v[k]] + end end # 2D -function transform!(img::AbstractMatrix, output, v, z; threaded = true) - if threaded - Threads.@threads for i in CartesianIndices(@view(img[:, 1])) - @views transform!(img[i, :], output[i, :], v[i, :], z[i, :]) - end - - copyto!(img, output) - Threads.@threads for j in CartesianIndices(@view(img[1, :])) - @views transform!( - img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1), - ) - end - else - for i in CartesianIndices(@view(img[:, 1])) - @views transform!(img[i, :], output[i, :], v[i, :], z[i, :]) - end - - copyto!(img, output) - for j in CartesianIndices(@view(img[1, :])) - @views transform!( - img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1), - ) - end - end -end - -# 3D -function transform!(vol::AbstractArray{<:Real, 3}, output, v, z; threaded = true) - if threaded - Threads.@threads for i in CartesianIndices(@view(vol[:, 1, 1])) - @views transform!(vol[i, :, :], output[i, :, :], v[i, :, :], z[i, :, :]) - end - - copyto!(vol, output) - Threads.@threads for idx in CartesianIndices(@view(vol[1, :, :])) - j, k = Tuple(idx) - @views transform!( - vol[:, j, k], output[:, j, k], fill!(v[:, j, k], 1), fill!(z[:, j, k], 1), - ) - end - else - for i in CartesianIndices(@view(vol[:, 1, 1])) - @views transform!(vol[i, :, :], output[i, :, :], v[i, :, :], z[i, :, :]) - end - - copyto!(vol, output) - for idx in CartesianIndices(@view(vol[1, :, :])) - j, k = Tuple(idx) - @views transform!( - vol[:, j, k], output[:, j, k], fill!(v[:, j, k], 1), fill!(z[:, j, k], 1), - ) - end - end -end - -# GPU (2D) -@kernel function _first_pass_2D!(f, out, s2) - row, col = @index(Global, NTuple) - - if f[row, col] < 0.5f0 - ct = 1 - curr_l = min(col - 1, s2 - col) - finished = false - while !finished && ct <= curr_l - if f[row, col-ct] > 0.5f0 || f[row, col+ct] > 0.5f0 - out[row, col] = ct * ct - finished = true - end - ct += 1 - end - while !finished && ct < col - if f[row, col-ct] > 0.5f0 - out[row, col] = ct * ct - finished = true - end - ct += 1 - end - while !finished && col + ct <= s2 - if f[row, col+ct] > 0.5f0 - out[row, col] = ct * ct - finished = true - end - ct += 1 - end - if !finished - out[row, col] = 1.0f10 - end - else - out[row, col] = 0.0f0 - end -end - -@kernel function _second_pass_2D!(org, out, s1, s2) - row, col = @index(Global, NTuple) - - ct = 1 - curr_l = sqrt(out[row, col]) - while ct < curr_l && row + ct <= s1 - temp = muladd(ct, ct, org[row+ct, col]) - if temp < out[row, col] - out[row, col] = temp - curr_l = sqrt(temp) - end - ct += 1 - end - - ct = 1 - while ct < curr_l && row > ct - temp = muladd(ct, ct, org[row-ct, col]) - if temp < out[row, col] - out[row, col] = temp - curr_l = sqrt(temp) - end - ct += 1 - end +function transform!(img::AbstractMatrix, output, v, z; threaded=true) + if threaded + Threads.@threads for i in eachindex(@view(img[:, 1])) + @views transform!(img[i, :], output[i, :], v[i, :], z[i, :]) + end + + copyto!(img, output) + + Threads.@threads for j in eachindex(@view(img[1, :])) + @views transform!( + img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1) + ) + end + else + for i in eachindex(@view(img[:, 1])) + @views transform!(img[i, :], output[i, :], v[i, :], z[i, :]) + end + + copyto!(img, output) + + for j in eachindex(@view(img[1, :])) + @views transform!( + img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1) + ) + end + end end -function transform!(img::AbstractGPUMatrix, output) - s1, s2 = size(img) - backend = get_backend(img) - kernel1! = _first_pass_2D!(backend) - kernel2! = _second_pass_2D!(backend) - - kernel1!(img, output, s2, ndrange = (s1, s2)) - copyto!(img, output) - - kernel2!(img, output, s1, s2, ndrange = (s1, s2)) - KernelAbstractions.synchronize(backend) +function transform!(vol::AbstractArray{<:Real,3}, output, v, z; threaded=true) + if threaded + # X dimension + Threads.@threads for i in axes(vol, 1) + for j in axes(vol, 2) + @views transform!(vol[i, j, :], output[i, j, :], v[i, j, :], z[i, j, :]) + end + end + + copyto!(vol, output) + + # Y dimension + Threads.@threads for i in axes(vol, 1) + for k in axes(vol, 3) + @views transform!(vol[i, :, k], output[i, :, k], fill!(v[i, :, k], 1), fill!(z[i, :, k], 1)) + end + end + + copyto!(vol, output) + + # Z dimension + Threads.@threads for j in axes(vol, 2) + for k in axes(vol, 3) + @views transform!(vol[:, j, k], output[:, j, k], fill!(v[:, j, k], 1), fill!(z[:, j, k], 1)) + end + end + else + # X dimension + for i in axes(vol, 1) + for j in axes(vol, 2) + @views transform!(vol[i, j, :], output[i, j, :], v[i, j, :], z[i, j, :]) + end + end + + copyto!(vol, output) + + # Y dimension + for i in axes(vol, 1) + for k in axes(vol, 3) + @views transform!(vol[i, :, k], output[i, :, k], fill!(v[i, :, k], 1), fill!(z[i, :, k], 1)) + end + end + + copyto!(vol, output) + + # Z dimension + for j in axes(vol, 2) + for k in axes(vol, 3) + @views transform!(vol[:, j, k], output[:, j, k], fill!(v[:, j, k], 1), fill!(z[:, j, k], 1)) + end + end + end end -# GPU (3D) -@kernel function _first_pass_3D!(f, out, s2) - dim1, dim2, dim3 = @index(Global, NTuple) - # 1D along dimension 2 - if f[dim1, dim2, dim3] < 0.5f0 - ct = 1 - curr_l = min(dim2 - 1, s2 - dim2) - finished = false - while !finished && ct <= curr_l - if f[dim1, dim2-ct, dim3] > 0.5f0 || f[dim1, dim2+ct, dim3] > 0.5f0 - out[dim1, dim2, dim3] = ct * ct - finished = true - end - ct += 1 - end - while !finished && ct < dim2 - if f[dim1, dim2-ct, dim3] > 0.5f0 - out[dim1, dim2, dim3] = ct * ct - finished = true - end - ct += 1 - end - while !finished && dim2 + ct <= s2 - if f[dim1, dim2+ct, dim3] > 0.5f0 - out[dim1, dim2, dim3] = ct * ct - finished = true - end - ct += 1 - end - if !finished - out[dim1, dim2, dim3] = 1.0f10 - end - else - out[dim1, dim2, dim3] = 0.0f0 - end -end +function transform!(img::AbstractGPUMatrix, output, v, z) + AK.foreachindex(@view(img[:, 1])) do i + @views transform!(img[i, :], output[i, :], v[i, :], z[i, :]) + end -@kernel function _second_pass_3D!(org, out, s1) - dim1, dim2, dim3 = @index(Global, NTuple) - # 2D along dimension 1 - ct = 1 - curr_l = sqrt(out[dim1, dim2, dim3]) - while ct < curr_l && dim1 + ct <= s1 - temp = muladd(ct, ct, org[dim1+ct, dim2, dim3]) - if temp < out[dim1, dim2, dim3] - out[dim1, dim2, dim3] = temp - curr_l = sqrt(temp) - end - ct += 1 - end - ct = 1 - while ct < curr_l && dim1 - ct > 0 - temp = muladd(ct, ct, org[dim1-ct, dim2, dim3]) - if temp < out[dim1, dim2, dim3] - out[dim1, dim2, dim3] = temp - curr_l = sqrt(temp) - end - ct += 1 - end -end + copyto!(img, output) -@kernel function _third_pass_3D!(org, out, s3) - dim1, dim2, dim3 = @index(Global, NTuple) - # 2D along dimension 3 - ct = 1 - curr_l = sqrt(out[dim1, dim2, dim3]) - while ct < curr_l && dim3 + ct <= s3 - temp = muladd(ct, ct, org[dim1, dim2, dim3+ct]) - if temp < out[dim1, dim2, dim3] - out[dim1, dim2, dim3] = temp - curr_l = sqrt(temp) - end - ct += 1 - end - ct = 1 - while ct < curr_l && ct < dim3 - temp = muladd(ct, ct, org[dim1, dim2, dim3-ct]) - if temp < out[dim1, dim2, dim3] - out[dim1, dim2, dim3] = temp - curr_l = sqrt(temp) - end - ct += 1 - end + AK.foreachindex(@view(img[1, :])) do j + @views transform!(img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)) + end end -function transform!(img::AbstractGPUArray, output) - backend = get_backend(img) - s1, s2, s3 = size(img) - kernel1! = _first_pass_3D!(backend) - kernel2! = _second_pass_3D!(backend) - kernel3! = _third_pass_3D!(backend) - - kernel1!(img, output, s2, ndrange = (s1, s2, s3)) - copyto!(img, output) - - kernel2!(img, output, s1, ndrange = (s1, s2, s3)) - copyto!(img, output) - - kernel3!(img, output, s3, ndrange = (s1, s2, s3)) - - KernelAbstractions.synchronize(backend) +function transform!(vol::AbstractGPUArray{T,3}, output, v, z) where {T} + # X dimension + AK.foreachindex(@view(vol[:, 1, 1])) do i + for j in axes(vol, 2) + @views transform!(vol[i, j, :], output[i, j, :], v[i, j, :], z[i, j, :]) + end + end + + copyto!(vol, output) + + # Y dimension + AK.foreachindex(@view(vol[:, 1, 1])) do i + for k in axes(vol, 3) + @views transform!(vol[i, :, k], output[i, :, k], fill!(v[i, :, k], 1), fill!(z[i, :, k], 1)) + end + end + + copyto!(vol, output) + + # Z dimension + AK.foreachindex(@view(vol[1, :, 1])) do j + for k in axes(vol, 3) + @views transform!(vol[:, j, k], output[:, j, k], fill!(v[:, j, k], 1), fill!(z[:, j, k], 1)) + end + end end -export transform! - """ ## `transform` @@ -328,46 +217,54 @@ f_tfm = transform(f_bool) ``` """ function transform(f::AbstractVector) - output = similar(f, eltype(f)) - v = ones(Int32, length(f)) - z = ones(eltype(f), length(f) + 1) + output = similar(f, eltype(f)) + v = ones(Int32, length(f)) + z = ones(eltype(f), length(f) + 1) - transform!(f, output, v, z) - return output + transform!(f, output, v, z) + return output end # 2D -function transform(img::AbstractMatrix; threaded = true) - output = similar(img, eltype(img)) - v = ones(Int32, size(img)) - z = ones(eltype(img), size(img) .+ 1) +function transform(img::AbstractMatrix; threaded=true) + output = similar(img, eltype(img)) + v = ones(Int32, size(img)) + z = ones(eltype(img), size(img) .+ 1) - transform!(img, output, v, z; threaded = threaded) - return output + transform!(img, output, v, z; threaded=threaded) + return output end # 3D -function transform(vol::AbstractArray{<:Real, 3}; threaded = true) - output = similar(vol, eltype(vol)) - v = ones(Int32, size(vol)) - z = ones(eltype(vol), size(vol) .+ 1) +function transform(vol::AbstractArray{<:Real,3}; threaded=true) + output = similar(vol, eltype(vol)) + v = ones(Int32, size(vol)) + z = ones(eltype(vol), size(vol) .+ 1) - transform!(vol, output, v, z; threaded = threaded) - return output + transform!(vol, output, v, z; threaded=threaded) + return output end # GPU (2D) function transform(img::AbstractGPUMatrix) - output = similar(img, Float32) - transform!(img, output) - return output + backend = KA.get_backend(img) + + output = similar(img, Float32) + v = KA.ones(backend, Int32, size(img)) + z = KA.ones(backend, eltype(img), size(img) .+ 1) + + transform!(img, output, v, z) + return output end # GPU (3D) -function transform(img::AbstractGPUArray) - output = similar(img, Float32) - transform!(img, output) - return output -end +function transform(vol::AbstractGPUArray{T,3}) where {T} + backend = KA.get_backend(vol) + + output = similar(vol, Float32) + v = KA.ones(backend, Int32, size(vol)) + z = KA.ones(backend, eltype(vol), size(vol) .+ 1) -export transform + transform!(vol, output, v, z) + return output +end diff --git a/src/utils.jl b/src/utils.jl index 0fe2dbd..f4c00aa 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,4 +1,3 @@ -using KernelAbstractions using GPUArraysCore: AbstractGPUArray import AcceleratedKernels as AK diff --git a/test/runtests.jl b/test/runtests.jl index aab6648..a2f1deb 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,6 +3,14 @@ using Test using KernelAbstractions using Random +#= +To run the tests locally, and still test a GPU backend (e.g. Metal), use the following command: +``` +using Pkg +Pkg.test("DistanceTransforms", test_args=["Metal"]) +``` +=# + AVAILABLE_GPU_BACKENDS = ["CUDA", "AMDGPU", "Metal", "oneAPI"] TEST_BACKENDS = filter(x->x in [AVAILABLE_GPU_BACKENDS; "CPU"], ARGS) diff --git a/test/transform.jl b/test/transform.jl index 7f45af5..027b185 100644 --- a/test/transform.jl +++ b/test/transform.jl @@ -3,206 +3,210 @@ using ImageMorphology: distance_transform, feature_transform test_transform(img) = distance_transform(feature_transform(Bool.(img))) @testset "transform!" begin - @testset "1D transform!" begin - for n in [10, 50, 100] - for test_idx in 1:20 - img = rand([0f0, 1f0], n) - - img_bool = boolean_indicator(img) - output = similar(img, Float32) - v = ones(Int32, size(img)) - z = ones(Float32, size(img) .+ 1) - - transform!(img_bool, output, v, z) - img_test = test_transform(img) .^ 2 - - @test Array(output) ≈ img_test - end - end - end - - @testset "2D transform!" begin - for n in [10, 50, 100] - for test_idx in 1:20 - img = rand([0f0, 1f0], n, n) - - img_bool = boolean_indicator(img) - output = similar(img, Float32) - v = ones(Int32, size(img)) - z = ones(Float32, size(img) .+ 1) - - transform!(img_bool, output, v, z) - img_test = test_transform(img) .^ 2 - - @test Array(output) ≈ img_test - end - - # non-threaded - for test_idx in 1:20 - img = rand([0f0, 1f0], n, n) - - img_bool = boolean_indicator(img) - output = similar(img, Float32) - v = ones(Int32, size(img)) - z = ones(Float32, size(img) .+ 1) - - transform!(img_bool, output, v, z; threaded = false) - img_test = test_transform(img) .^ 2 - - @test Array(output) ≈ img_test - end - end - end - - @testset "3D transform!" begin - for n in [10, 100] - for test_idx in 1:5 - img = rand([0f0, 1f0], n, n, n) - - img_bool = boolean_indicator(img) - output = similar(img, Float32) - v = ones(Int32, size(img)) - z = ones(Float32, size(img) .+ 1) - - transform!(img_bool, output, v, z) - img_test = test_transform(img) .^ 2 - - @test Array(output) ≈ img_test - end - - # non-threaded - for test_idx in 1:5 - img = rand([0f0, 1f0], n, n, n) - - img_bool = boolean_indicator(img) - output = similar(img, Float32) - v = ones(Int32, size(img)) - z = ones(Float32, size(img) .+ 1) - - transform!(img_bool, output, v, z; threaded = false) - img_test = test_transform(img) .^ 2 - - @test Array(output) ≈ img_test - end - end - end - - if dev != Array - @testset "2D GPU transform!" begin - for n in [10, 50, 100] - for test_idx in 1:20 - img = rand([0f0, 1f0], n, n) - - img_gpu = dev(copy(img)) - output = similar(img_gpu) - - transform!(img_gpu, output) - img_test = test_transform(img) .^ 2 - - @test Array(output) ≈ img_test - end - end - end - @testset "3D GPU transform!" begin - for n in [10, 50, 100] - for test_idx in 1:20 - img = rand([0f0, 1f0], n, n, n) - - img_gpu = dev(copy(img)) - output = similar(img_gpu) - - transform!(img_gpu, output) - img_test = test_transform(img) .^ 2 - - @test Array(output) ≈ img_test - end - end - end - else - @info "No GPU available, skipping tests" - end + @testset "1D transform!" begin + for n in [10, 50, 100] + for test_idx in 1:20 + img = rand([0.0f0, 1.0f0], n) + + img_bool = boolean_indicator(img) + output = similar(img, Float32) + v = ones(Int32, size(img)) + z = ones(Float32, size(img) .+ 1) + + transform!(img_bool, output, v, z) + img_test = test_transform(img) .^ 2 + + @test Array(output) ≈ img_test + end + end + end + + @testset "2D transform!" begin + for n in [10, 50, 100] + for test_idx in 1:20 + img = rand([0.0f0, 1.0f0], n, n) + + img_bool = boolean_indicator(img) + output = similar(img, Float32) + v = ones(Int32, size(img)) + z = ones(Float32, size(img) .+ 1) + + transform!(img_bool, output, v, z) + img_test = test_transform(img) .^ 2 + + @test Array(output) ≈ img_test + end + + # non-threaded + for test_idx in 1:20 + img = rand([0.0f0, 1.0f0], n, n) + + img_bool = boolean_indicator(img) + output = similar(img, Float32) + v = ones(Int32, size(img)) + z = ones(Float32, size(img) .+ 1) + + transform!(img_bool, output, v, z; threaded=false) + img_test = test_transform(img) .^ 2 + + @test Array(output) ≈ img_test + end + end + end + + @testset "3D transform!" begin + for n in [10, 100] + for test_idx in 1:5 + img = rand([0.0f0, 1.0f0], n, n, n) + + img_bool = boolean_indicator(img) + output = similar(img, Float32) + v = ones(Int32, size(img)) + z = ones(Float32, size(img) .+ 1) + + transform!(img_bool, output, v, z) + img_test = test_transform(img) .^ 2 + + @test Array(output) ≈ img_test + end + + # non-threaded + for test_idx in 1:5 + img = rand([0.0f0, 1.0f0], n, n, n) + + img_bool = boolean_indicator(img) + output = similar(img, Float32) + v = ones(Int32, size(img)) + z = ones(Float32, size(img) .+ 1) + + transform!(img_bool, output, v, z; threaded=false) + img_test = test_transform(img) .^ 2 + + @test Array(output) ≈ img_test + end + end + end + + if dev != Array + @testset "2D GPU transform!" begin + for n in [10, 50, 100] + for test_idx in 1:20 + img = dev(rand([0.0f0, 1.0f0], n, n)) + + img_bool = boolean_indicator(img) + output = similar(img, Float32) + v = dev(ones(Int32, size(img))) + z = dev(ones(Float32, size(img) .+ 1)) + + transform!(img_bool, output, v, z) + img_test = test_transform(Array(img)) .^ 2 + + @test Array(output) ≈ img_test + end + end + end + @testset "3D GPU transform!" begin + for n in [10, 50, 100] + for test_idx in 1:20 + img = dev(rand([0.0f0, 1.0f0], n, n, n)) + + img_bool = boolean_indicator(img) + output = similar(img, Float32) + v = dev(ones(Int32, size(img))) + z = dev(ones(Float32, size(img) .+ 1)) + + transform!(img_bool, output, v, z) + img_test = test_transform(Array(img)) .^ 2 + + @test Array(output) ≈ img_test + end + end + end + else + @info "No GPU available, skipping tests" + end end @testset "transform" begin - @testset "1D transform" begin - for n in [10, 50, 100] - for test_idx in 1:20 - img = rand([0.0f0, 1.0f0], n) - img_bool = boolean_indicator(img) - output = transform(img_bool) - img_test = test_transform(img) .^ 2 - @test Array(output) ≈ img_test - end - end - end - - @testset "2D transform" begin - for n in [10, 50, 100] - for test_idx in 1:20 - img = rand([0.0f0, 1.0f0], n, n) - img_bool = boolean_indicator(img) - output = transform(img_bool) - img_test = test_transform(img) .^ 2 - @test Array(output) ≈ img_test - end - - # non-threaded - for test_idx in 1:20 - img = rand([0.0f0, 1.0f0], n, n) - img_bool = boolean_indicator(img) - output = transform(img_bool; threaded = false) - img_test = test_transform(img) .^ 2 - @test Array(output) ≈ img_test - end - end - end - - @testset "3D transform" begin - for n in [10, 100] - for test_idx in 1:5 - img = rand([0.0f0, 1.0f0], n, n, n) - img_bool = boolean_indicator(img) - output = transform(img_bool) - img_test = test_transform(img) .^ 2 - @test Array(output) ≈ img_test - end - - # non-threaded - for test_idx in 1:5 - img = rand([0.0f0, 1.0f0], n, n, n) - img_bool = boolean_indicator(img) - output = transform(img_bool; threaded = false) - img_test = test_transform(img) .^ 2 - @test Array(output) ≈ img_test - end - end - end - - if dev != Array - @testset "2D GPU transform" begin - for n in [10, 50, 100] - for test_idx in 1:20 - img = rand([0.0f0, 1.0f0], n, n) - img_gpu = dev(img) - output = transform(img_gpu) - img_test = test_transform(img) .^ 2 - @test Array(output) ≈ img_test - end - end - end - - @testset "3D GPU transform" begin - for n in [10, 50, 100] - for test_idx in 1:20 - img = rand([0.0f0, 1.0f0], n, n, n) - img_gpu = dev(img) - output = transform(img_gpu) - img_test = test_transform(img) .^ 2 - @test Array(output) ≈ img_test - end - end - end - else - @info "No GPU available, skipping tests" - end + @testset "1D transform" begin + for n in [10, 50, 100] + for test_idx in 1:20 + img = rand([0.0f0, 1.0f0], n) + img_bool = boolean_indicator(img) + output = transform(img_bool) + img_test = test_transform(img) .^ 2 + @test Array(output) ≈ img_test + end + end + end + + @testset "2D transform" begin + for n in [10, 50, 100] + for test_idx in 1:20 + img = rand([0.0f0, 1.0f0], n, n) + img_bool = boolean_indicator(img) + output = transform(img_bool) + img_test = test_transform(img) .^ 2 + @test Array(output) ≈ img_test + end + + # non-threaded + for test_idx in 1:20 + img = rand([0.0f0, 1.0f0], n, n) + img_bool = boolean_indicator(img) + output = transform(img_bool; threaded=false) + img_test = test_transform(img) .^ 2 + @test Array(output) ≈ img_test + end + end + end + + @testset "3D transform" begin + for n in [10, 100] + for test_idx in 1:5 + img = rand([0.0f0, 1.0f0], n, n, n) + img_bool = boolean_indicator(img) + output = transform(img_bool) + img_test = test_transform(img) .^ 2 + @test Array(output) ≈ img_test + end + + # non-threaded + for test_idx in 1:5 + img = rand([0.0f0, 1.0f0], n, n, n) + img_bool = boolean_indicator(img) + output = transform(img_bool; threaded=false) + img_test = test_transform(img) .^ 2 + @test Array(output) ≈ img_test + end + end + end + + if dev != Array + @testset "2D GPU transform" begin + for n in [10, 50, 100] + for test_idx in 1:20 + img = dev(rand([0.0f0, 1.0f0], n, n)) + img_bool = boolean_indicator(img) + output = transform(img_bool) + img_test = test_transform(Array(img)) .^ 2 + @test Array(output) ≈ img_test + end + end + end + + @testset "3D GPU transform" begin + for n in [10, 50, 100] + for test_idx in 1:20 + img = dev(rand([0.0f0, 1.0f0], n, n, n)) + img_bool = boolean_indicator(img) + output = transform(img_bool) + img_test = test_transform(Array(img)) .^ 2 + @test Array(output) ≈ img_test + end + end + end + else + @info "No GPU available, skipping tests" + end end