Skip to content

Commit

Permalink
update to faster gpu kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
Dale-Black committed Nov 11, 2023
1 parent f44e0d9 commit b29c388
Show file tree
Hide file tree
Showing 6 changed files with 220 additions and 88 deletions.
4 changes: 2 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
ImageMorphology = "787d08f9-d448-5407-9aad-5290dd7ab264"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
# oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["AMDGPU", "CUDA", "ImageMorphology", "Metal", "oneAPI", "Random", "Test"]
test = ["AMDGPU", "CUDA", "ImageMorphology", "Metal", "Random", "Test"]
134 changes: 97 additions & 37 deletions src/transform.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ In-place squared Euclidean distance transforms. These functions apply the transf
The intermediate arrays `v` and `z` (and `temp` for 3D arrays) are used for computation. The `threaded` parameter enables parallel computation on the CPU.
# Arguments
#### Arguments
- `f`: Input vector, matrix, or 3D array.
- `output`: Preallocated array to store the result.
- `v`: Preallocated array for indices, matching the dimensions of `f`.
- `z`: Preallocated array for intermediate values, one element larger than `f`.
- `temp`: Preallocated array for intermediate values when transforming 3D arrays, matching the dimensions of `output`.
# Examples
#### Examples
```julia
f = rand([0f0, 1f0], 10)
f_bool = boolean_indicator(f)
Expand Down Expand Up @@ -62,12 +62,14 @@ function transform!(f::AbstractVector, output, v, z)
end
end

# 2D
function transform!(img::AbstractMatrix, output, v, z; threaded=true)
if threaded
Threads.@threads for i in CartesianIndices(@view(img[:, 1]))
@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end

copyto!(img, output)
Threads.@threads for j in CartesianIndices(@view(img[1, :]))
@views transform!(
output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
Expand All @@ -78,6 +80,7 @@ function transform!(img::AbstractMatrix, output, v, z; threaded=true)
@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end

copyto!(img, output)
for j in CartesianIndices(@view(img[1, :]))
@views transform!(
output[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
Expand All @@ -86,6 +89,81 @@ function transform!(img::AbstractMatrix, output, v, z; threaded=true)
end
end

@kernel function first_pass_kernel!(f, out, s2)
row, col = @index(Global, NTuple)

if f[row, col] < 0.5f0
ct = 1
curr_l = min(col-1, s2-col)
finished = false
while !finished && ct <= curr_l
if f[row, col-ct] > 0.5f0 || f[row, col+ct] > 0.5f0
out[row, col] = ct*ct
finished = true
end
ct += 1
end
while !finished && ct < col
if f[row, col-ct] > 0.5f0
out[row, col] = ct*ct
finished = true
end
ct += 1
end
while !finished && col+ct <= s2
if f[row, col+ct] > 0.5f0
out[row, col] = ct*ct
finished = true
end
ct += 1
end
if !finished
out[row, col] = 1f10
end
else
out[row, col] = 0f0
end
end

@kernel function second_pass_kernel!(org, out, s1, s2)
row, col = @index(Global, NTuple)

ct = 1
curr_l = sqrt(out[row, col])
while ct < curr_l && row+ct <= s1
temp = muladd(ct, ct, org[row+ct, col])
if temp < out[row, col]
out[row, col] = temp
curr_l = sqrt(temp)
end
ct += 1
end

ct = 1
while ct < curr_l && row > ct
temp = muladd(ct, ct, org[row-ct, col])
if temp < out[row, col]
out[row, col] = temp
curr_l = sqrt(temp)
end
ct += 1
end
end

function transform!(img::AbstractGPUMatrix, output, v, z)
s1, s2 = size(img)
backend = get_backend(img)
kernel1! = first_pass_kernel!(backend)
kernel2! = second_pass_kernel!(backend)

kernel1!(img, output, s2, ndrange = (s1, s2))
copyto!(img, output)
kernel2!(img, output, s1, s2, ndrange = (s1, s2))

KernelAbstractions.synchronize(backend)
end

# 3D
function transform!(vol::AbstractArray{<:Real,3}, output, v, z, temp; threaded=true)
if threaded
Threads.@threads for i in CartesianIndices(@view(vol[:, :, 1]))
Expand All @@ -94,12 +172,14 @@ function transform!(vol::AbstractArray{<:Real,3}, output, v, z, temp; threaded=t
)
end

copyto!(vol, output)
Threads.@threads for j in CartesianIndices(@view(vol[1, :, :]))
@views transform!(
output[:, j], temp[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
)
end

copyto!(output, temp)
Threads.@threads for k in CartesianIndices(@view(vol[:, 1, :]))
@views transform!(
temp[k[1], :, k[2]], output[k[1], :, k[2]], fill!(v[k[1], :, k[2]], 1), fill!(z[k[1], :, k[2]], 1)
Expand All @@ -112,12 +192,14 @@ function transform!(vol::AbstractArray{<:Real,3}, output, v, z, temp; threaded=t
)
end

copyto!(vol, output)
for j in CartesianIndices(@view(vol[1, :, :]))
@views transform!(
output[:, j], temp[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1)
)
end

copyto!(output, temp)
for k in CartesianIndices(@view(vol[:, 1, :]))
@views transform!(
temp[k[1], :, k[2]], output[k[1], :, k[2]], fill!(v[k[1], :, k[2]], 1), fill!(z[k[1], :, k[2]], 1)
Expand All @@ -126,29 +208,6 @@ function transform!(vol::AbstractArray{<:Real,3}, output, v, z, temp; threaded=t
end
end

@kernel function transform_kernel_cols!(img, output, v, z)
i = @index(Global)
@views transform!(img[i, :], output[i, :], v[i, :], z[i, :])
end

@kernel function transform_kernel_rows!(img, output, v, z)
j = @index(Global)
@views transform!(img[:, j], output[:, j], fill!(v[:, j], 1), fill!(z[:, j], 1))
end

function transform!(img::AbstractGPUMatrix, output, v, z)
backend = get_backend(img)
kernel_cols = transform_kernel_cols!(backend)
kernel_cols(img, output, v, z, ndrange=size(img, 1))

B = similar(output)
copyto!(B, output)

kernel_rows = transform_kernel_rows!(backend)
kernel_rows(B, output, v, z, ndrange=size(img, 2))
KernelAbstractions.synchronize(backend)
end

export transform!

"""
Expand All @@ -170,10 +229,10 @@ Non-in-place squared Euclidean distance transforms that return a new array with
The `threaded` parameter can be used to enable or disable parallel computation on the CPU.
# Arguments
#### Arguments
- `f/img/vol`: Input vector, matrix, or 3D array to be transformed.
# Examples
#### Examples
```julia
f = rand([0f0, 1f0], 10)
f_bool = boolean_indicator(f)
Expand All @@ -189,6 +248,7 @@ function transform(f::AbstractVector)
return output
end

# 2D
function transform(img::AbstractMatrix; threaded=true)
output = similar(img, eltype(img))
v = ones(Int32, size(img))
Expand All @@ -198,6 +258,16 @@ function transform(img::AbstractMatrix; threaded=true)
return output
end

function transform(img::AbstractGPUMatrix)
backend = KernelAbstractions.get_backend(img)
output = similar(img, Float32)
v = KernelAbstractions.ones(backend, Int32, size(img))
z = KernelAbstractions.ones(backend, Float32, size(img) .+ 1)

transform!(img, output, v, z)
return output
end

function transform(vol::AbstractArray{<:Real,3}; threaded=true)
output = similar(vol, eltype(vol))
v = ones(Int32, size(vol))
Expand All @@ -208,14 +278,4 @@ function transform(vol::AbstractArray{<:Real,3}; threaded=true)
return output
end

function transform(img::AbstractGPUMatrix)
backend = KernelAbstractions.get_backend(img)
output = similar(img, Float32)
v = KernelAbstractions.ones(backend, Int32, size(img))
z = KernelAbstractions.ones(backend, Float32, size(img) .+ 1)
transform!(img, output, v, z)

return output
end

export transform
43 changes: 37 additions & 6 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,48 @@ using LoopVectorization

"""
## `boolean_indicator`
```julia
boolean_indicator(f)
boolean_indicator(f::AbstractArray)
boolean_indicator(f::AbstractGPUArray)
boolean_indicator(f::BitArray)
```
If `f` is a boolean indicator where 0's correspond to background and 1s correspond to the foreground, then mark background pixels with a large number `1e10`.
Uses LoopVectorization.jl for speed up if array is `BitArray`
Create a float representation of a boolean indicator array where `0` represents the background and `1` represents the foreground.
This function converts a logical array into a floating-point array where foreground values (logical `1`) are marked as `0.0f0` (float representation of `0`), and background values (logical `0`) are marked with a large float number `1.0f10`. This representation is useful in distance transform operations to differentiate between the foreground and the background.
#### Arguments
- `f`: An array of boolean values or an `AbstractGPUArray` of boolean values, where `true` indicates the foreground and `false` indicates the background.
#### Returns
- A floating-point array of the same dimensions as `f`, with foreground values set to `0.0f0` and background values set to `1.0f10`.
#### Performance
- If `f` is a `BitArray`, the conversion uses LoopVectorization.jl for a potential speedup. The warning check arguments are disabled for performance reasons.
- If `f` is an `AbstractGPUArray`, the computation is offloaded to the GPU using a custom kernel, `boolean_indicator_kernel`, which is expected to yield a significant speedup on compatible hardware.
#### Examples
```julia
f = BitArray([true, false, true, false])
f_float = boolean_indicator(f)
# f_float will be Float32[0.0f0, 1.0f10, 0.0f0, 1.0f10]
f_gpu = CUDA.zeros(Bool, 10) # assuming CUDA.jl is used for GPU arrays
f_gpu[5] = true
f_float_gpu = boolean_indicator(f_gpu)
# f_float_gpu will be a GPU array with the fifth element as 0.0f0 and others as 1.0f10
```
#### Notes
- The choice of `1.0f10` as the large number is arbitrary and can be adjusted if needed for specific applications.
- The `@turbo` macro from LoopVectorization.jl is used when `f` is a `BitArray` to unroll and vectorize the loop for optimal performance.
- When `f` is an `AbstractGPUArray`, the `boolean_indicator_kernel` kernel function is used to perform the operation in parallel on the GPU.
- The `KernelAbstractions.synchronize(backend)` call ensures that all GPU operations are completed before returning the result.
"""
boolean_indicator(f) = @. ifelse(f < 0.5, 1.0f10, 0.0f0)
function boolean_indicator(f)
return @. ifelse(f == 0, 1.0f10, 0.0f0)
end

function boolean_indicator(f::BitArray)
f_new = similar(f, Float32)
Expand All @@ -37,5 +70,3 @@ function boolean_indicator(f::AbstractGPUArray)
end

export boolean_indicator


32 changes: 32 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,36 @@
using DistanceTransforms
using Test

using KernelAbstractions
using CUDA, AMDGPU, Metal
# using oneAPI
using Random

if CUDA.functional()
@info "Using CUDA"
CUDA.versioninfo()
backend = CUDABackend()
dev = CuArray
elseif AMDGPU.functional()
@info "Using AMD"
AMDGPU.versioninfo()
backend = ROCBackend()
dev = ROCArray
# elseif oneAPI.functional() ## not well supported
# @info "Using oneAPI"
# oneAPI.versioninfo()
# backend = oneBackend()
# dev = oneArray
elseif Metal.functional()
@info "Using Metal"
Metal.versioninfo()
backend = MetalBackend()
dev = MtlArray
else
@info "No GPU is available. Using CPU."
backend = CPU()
dev = Array
end

include("transform.jl")
include("utils.jl")
Loading

0 comments on commit b29c388

Please sign in to comment.