update to faster gpu kernel

MolloiLab · Nov 11, 2023 · b29c388 · b29c388
1 parent f44e0d9
commit b29c388
Show file tree

Hide file tree

Showing 6 changed files with 220 additions and 88 deletions.
diff --git a/Project.toml b/Project.toml
@@ -16,9 +16,9 @@ AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ImageMorphology = "787d08f9-d448-5407-9aad-5290dd7ab264"
 Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
-oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
+# oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["AMDGPU", "CUDA", "ImageMorphology", "Metal", "oneAPI", "Random", "Test"]
+test = ["AMDGPU", "CUDA", "ImageMorphology", "Metal", "Random", "Test"]
diff --git a/src/transform.jl b/src/transform.jl
@@ -19,14 +19,14 @@ In-place squared Euclidean distance transforms. These functions apply the transf
 
 The intermediate arrays `v` and `z` (and `temp` for 3D arrays) are used for computation. The `threaded` parameter enables parallel computation on the CPU.
 
-# Arguments
+#### Arguments
 - `f`: Input vector, matrix, or 3D array.
 - `output`: Preallocated array to store the result.
 - `v`: Preallocated array for indices, matching the dimensions of `f`.
 - `z`: Preallocated array for intermediate values, one element larger than `f`.
 - `temp`: Preallocated array for intermediate values when transforming 3D arrays, matching the dimensions of `output`.
 
-# Examples
+#### Examples
 ```julia
 f = rand([0f0, 1f0], 10)
 f_bool = boolean_indicator(f)
@@ -62,12 +62,14 @@ function transform!(f::AbstractVector, output, v, z)
     end
 end
 
+# 2D
 function transform!(img::AbstractMatrix, output, v, z; threaded=true)
     if threaded
         Threads.@threads for i in CartesianIndices(@view(img[:, 1]))
             @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
         end
 
+        copyto!(img, output)
         Threads.@threads for j in CartesianIndices(@view(img[1, :]))
             @views transform!(
                 output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
@@ -78,6 +80,7 @@ function transform!(img::AbstractMatrix, output, v, z; threaded=true)
             @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
         end
 
+        copyto!(img, output)
         for j in CartesianIndices(@view(img[1, :]))
             @views transform!(
                 output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
@@ -86,6 +89,81 @@ function transform!(img::AbstractMatrix, output, v, z; threaded=true)
     end
 end
 
+@kernel function first_pass_kernel!(f, out, s2)
+    row, col = @index(Global, NTuple)
+
+    if f[row, col] < 0.5f0
+        ct = 1
+        curr_l = min(col-1, s2-col)
+        finished = false
+        while !finished && ct <= curr_l
+            if f[row, col-ct] > 0.5f0 || f[row, col+ct] > 0.5f0
+                out[row, col] = ct*ct
+                finished = true
+            end
+            ct += 1
+        end
+        while !finished && ct < col
+            if f[row, col-ct] > 0.5f0
+                out[row, col] = ct*ct
+                finished = true
+            end
+            ct += 1
+        end
+        while !finished && col+ct <= s2
+            if f[row, col+ct] > 0.5f0
+                out[row, col] = ct*ct
+                finished = true
+            end
+            ct += 1
+        end
+        if !finished
+            out[row, col] = 1f10
+        end
+    else
+        out[row, col] = 0f0
+    end
+end
+
+@kernel function second_pass_kernel!(org, out, s1, s2)
+    row, col = @index(Global, NTuple)
+
+    ct = 1
+    curr_l = sqrt(out[row, col])
+    while ct < curr_l && row+ct <= s1
+        temp = muladd(ct, ct, org[row+ct, col])
+        if temp < out[row, col]
+            out[row, col] = temp
+            curr_l = sqrt(temp)
+        end
+        ct += 1
+    end
+
+    ct = 1
+    while ct < curr_l && row > ct
+        temp = muladd(ct, ct, org[row-ct, col])
+        if temp < out[row, col]
+            out[row, col] = temp
+            curr_l = sqrt(temp)
+        end
+        ct += 1
+    end
+end
+
+function transform!(img::AbstractGPUMatrix, output, v, z)
+    s1, s2 = size(img)
+    backend = get_backend(img)
+	kernel1! = first_pass_kernel!(backend)
+	kernel2! = second_pass_kernel!(backend)
+
+    kernel1!(img, output, s2, ndrange = (s1, s2))
+    copyto!(img, output)
+    kernel2!(img, output, s1, s2, ndrange = (s1, s2))
+
+    KernelAbstractions.synchronize(backend)
+end
+
+# 3D
 function transform!(vol::AbstractArray{<:Real,3}, output, v, z, temp; threaded=true)
     if threaded
         Threads.@threads for i in CartesianIndices(@view(vol[:, :, 1]))
@@ -94,12 +172,14 @@ function transform!(vol::AbstractArray{<:Real,3}, output, v, z, temp; threaded=t
             )
         end
 
+        copyto!(vol, output)
         Threads.@threads for j in CartesianIndices(@view(vol[1, :, :]))
             @views transform!(
                 output[:, j], temp[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
             )
         end
 
+        copyto!(output, temp)
         Threads.@threads for k in CartesianIndices(@view(vol[:, 1, :]))
             @views transform!(
                 temp[k[1], :, k[2]], output[k[1], :, k[2]], fill!(v[k[1], :, k[2]], 1), fill!(z[k[1], :, k[2]], 1)
@@ -112,12 +192,14 @@ function transform!(vol::AbstractArray{<:Real,3}, output, v, z, temp; threaded=t
             )
         end
 
+        copyto!(vol, output)
         for j in CartesianIndices(@view(vol[1, :, :]))
             @views transform!(
                 output[:, j], temp[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
             )
         end
 
+        copyto!(output, temp)
         for k in CartesianIndices(@view(vol[:, 1, :]))
             @views transform!(
                 temp[k[1], :, k[2]], output[k[1], :, k[2]], fill!(v[k[1], :, k[2]], 1), fill!(z[k[1], :, k[2]], 1)
@@ -126,29 +208,6 @@ function transform!(vol::AbstractArray{<:Real,3}, output, v, z, temp; threaded=t
     end
 end
 
-@kernel function transform_kernel_cols!(img, output, v, z)
-    i = @index(Global)
-    @views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
-end
-
-@kernel function transform_kernel_rows!(img, output, v, z)
-    j = @index(Global)
-    @views transform!(img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
-end
-
-function transform!(img::AbstractGPUMatrix, output, v, z)
-    backend = get_backend(img)
-    kernel_cols = transform_kernel_cols!(backend)
-    kernel_cols(img, output, v, z, ndrange=size(img, 1))
-
-    B = similar(output)
-    copyto!(B, output)
-
-    kernel_rows = transform_kernel_rows!(backend)
-    kernel_rows(B, output, v, z, ndrange=size(img, 2))
-    KernelAbstractions.synchronize(backend)
-end
-
 export transform!
 
 """
@@ -170,10 +229,10 @@ Non-in-place squared Euclidean distance transforms that return a new array with
 
 The `threaded` parameter can be used to enable or disable parallel computation on the CPU.
 
-# Arguments
+#### Arguments
 - `f/img/vol`: Input vector, matrix, or 3D array to be transformed.
 
-# Examples
+#### Examples
 ```julia
 f = rand([0f0, 1f0], 10)
 f_bool = boolean_indicator(f)
@@ -189,6 +248,7 @@ function transform(f::AbstractVector)
     return output
 end
 
+# 2D
 function transform(img::AbstractMatrix; threaded=true)
     output = similar(img, eltype(img))
     v = ones(Int32, size(img))
@@ -198,6 +258,16 @@ function transform(img::AbstractMatrix; threaded=true)
     return output
 end
 
+function transform(img::AbstractGPUMatrix)
+	backend = KernelAbstractions.get_backend(img)
+	output = similar(img, Float32)
+	v = KernelAbstractions.ones(backend, Int32, size(img))
+	z = KernelAbstractions.ones(backend, Float32, size(img) .+ 1)
+
+	transform!(img, output, v, z)
+	return output
+end
+
 function transform(vol::AbstractArray{<:Real,3}; threaded=true)
     output = similar(vol, eltype(vol))
     v = ones(Int32, size(vol))
@@ -208,14 +278,4 @@ function transform(vol::AbstractArray{<:Real,3}; threaded=true)
     return output
 end
 
-function transform(img::AbstractGPUMatrix)
-	backend = KernelAbstractions.get_backend(img)
-	output = similar(img, Float32)
-	v = KernelAbstractions.ones(backend, Int32, size(img))
-	z = KernelAbstractions.ones(backend, Float32, size(img) .+ 1)
-	transform!(img, output, v, z)
-
-	return output
-end
-
 export transform
diff --git a/src/utils.jl b/src/utils.jl
@@ -4,15 +4,48 @@ using LoopVectorization
 
 """
 ## `boolean_indicator`
+    
 ```julia
-boolean_indicator(f)
+boolean_indicator(f::AbstractArray)
+boolean_indicator(f::AbstractGPUArray)
 boolean_indicator(f::BitArray)
 ```
 
-If `f` is a boolean indicator where 0's correspond to background and 1s correspond to the foreground, then mark background pixels with a large number `1e10`.
-Uses LoopVectorization.jl for speed up if array is `BitArray`
+Create a float representation of a boolean indicator array where `0` represents the background and `1` represents the foreground.
+This function converts a logical array into a floating-point array where foreground values (logical `1`) are marked as `0.0f0` (float representation of `0`), and background values (logical `0`) are marked with a large float number `1.0f10`. This representation is useful in distance transform operations to differentiate between the foreground and the background.
+
+#### Arguments
+- `f`: An array of boolean values or an `AbstractGPUArray` of boolean values, where `true` indicates the foreground and `false` indicates the background.
+
+#### Returns
+- A floating-point array of the same dimensions as `f`, with foreground values set to `0.0f0` and background values set to `1.0f10`.
+
+#### Performance
+- If `f` is a `BitArray`, the conversion uses LoopVectorization.jl for a potential speedup. The warning check arguments are disabled for performance reasons.
+- If `f` is an `AbstractGPUArray`, the computation is offloaded to the GPU using a custom kernel, `boolean_indicator_kernel`, which is expected to yield a significant speedup on compatible hardware.
+
+#### Examples
+```julia
+f = BitArray([true, false, true, false])
+f_float = boolean_indicator(f)
+# f_float will be Float32[0.0f0, 1.0f10, 0.0f0, 1.0f10]
+
+f_gpu = CUDA.zeros(Bool, 10) # assuming CUDA.jl is used for GPU arrays
+f_gpu[5] = true
+f_float_gpu = boolean_indicator(f_gpu)
+# f_float_gpu will be a GPU array with the fifth element as 0.0f0 and others as 1.0f10
+```
+
+#### Notes
+- The choice of `1.0f10` as the large number is arbitrary and can be adjusted if needed for specific applications.
+- The `@turbo` macro from LoopVectorization.jl is used when `f` is a `BitArray` to unroll and vectorize the loop for optimal performance.
+- When `f` is an `AbstractGPUArray`, the `boolean_indicator_kernel` kernel function is used to perform the operation in parallel on the GPU.
+- The `KernelAbstractions.synchronize(backend)` call ensures that all GPU operations are completed before returning the result.
+
 """
-boolean_indicator(f) = @. ifelse(f < 0.5, 1.0f10, 0.0f0)
+function boolean_indicator(f)
+    return @. ifelse(f == 0, 1.0f10, 0.0f0)
+end
 
 function boolean_indicator(f::BitArray)
     f_new = similar(f, Float32)
@@ -37,5 +70,3 @@ function boolean_indicator(f::AbstractGPUArray)
 end
 
 export boolean_indicator
-
-
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,4 +1,36 @@
 using DistanceTransforms
 using Test
 
+using KernelAbstractions
+using CUDA, AMDGPU, Metal
+# using oneAPI
+using Random
+
+if CUDA.functional()
+	@info "Using CUDA"
+	CUDA.versioninfo()
+	backend = CUDABackend()
+	dev = CuArray
+elseif AMDGPU.functional()
+	@info "Using AMD"
+	AMDGPU.versioninfo()
+	backend = ROCBackend()
+	dev = ROCArray
+# elseif oneAPI.functional() ## not well supported
+# 	@info "Using oneAPI"
+# 	oneAPI.versioninfo()
+# 	backend = oneBackend()
+# 	dev = oneArray
+elseif Metal.functional()
+	@info "Using Metal"
+	Metal.versioninfo()
+	backend = MetalBackend()
+	dev = MtlArray
+else
+    @info "No GPU is available. Using CPU."
+	backend = CPU()
+	dev = Array
+end
+
 include("transform.jl")
+include("utils.jl")