Skip to content

Commit

Permalink
Merge pull request #188 from PhilipFackler/kw-struct
Browse files Browse the repository at this point in the history
Add parallel_for API with keyword struct
  • Loading branch information
williamfgc authored Feb 14, 2025
2 parents 48383bb + ef6b464 commit 43ef00f
Show file tree
Hide file tree
Showing 8 changed files with 454 additions and 106 deletions.
122 changes: 112 additions & 10 deletions ext/JACCAMDGPU/JACCAMDGPU.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,42 @@ using .Experimental

JACC.get_backend(::Val{:amdgpu}) = AMDGPUBackend()

# Stream used when the caller does not supply one explicitly.
default_stream() = AMDGPU.stream()

JACC.default_stream(::Type{AMDGPUBackend}) = default_stream()

# 1D parallel_for entry point for the AMDGPU backend.
# Launch geometry comes from the occupancy query on the compiled kernel;
# shared memory defaults to two Float64 slots per thread. Always synchronizes.
# (Fixed: the diff residue here duplicated the old signature and the old,
# un-wrapped kernel launch line — the kernel would have been launched twice.)
function JACC.parallel_for(::AMDGPUBackend, N::Integer, f::Function, x...)
    kernel = @roc launch=false _parallel_for_amdgpu(N, f, x...)
    config = AMDGPU.launch_configuration(kernel)
    threads = min(N, config.groupsize)
    blocks = cld(N, threads)
    shmem_size = 2 * threads * sizeof(Float64)
    kernel(
        N, f, x...; groupsize = threads, gridsize = blocks, shmem = shmem_size)
    AMDGPU.synchronize()
end

# 1D parallel_for with an explicit LaunchSpec on the AMDGPU backend.
# Any launch parameter the caller left at its default (0) is filled in from
# the occupancy query; synchronization happens only when `spec.sync` is set.
function JACC.parallel_for(
        spec::LaunchSpec{AMDGPUBackend}, N::Integer, f::Function, x...)
    kernel = @roc launch=false _parallel_for_amdgpu(N, f, x...)
    if spec.threads == 0
        config = AMDGPU.launch_configuration(kernel)
        spec.threads = min(N, config.groupsize)
    end
    if spec.blocks == 0
        spec.blocks = cld(N, spec.threads)
    end
    # Fixed: was `spec.shmem_size == nothing`, which is always false for an
    # integer field, so the default shared-memory size was never applied.
    # Every sibling LaunchSpec method treats 0 as "unset".
    if spec.shmem_size == 0
        spec.shmem_size = 2 * spec.threads * sizeof(Float64)
    end
    kernel(
        N, f, x...; groupsize = spec.threads, gridsize = spec.blocks,
        shmem = spec.shmem_size, stream = spec.stream)
    if spec.sync
        AMDGPU.synchronize(spec.stream)
    end
end

# Strategy type: maps a (blockIdx, blockDim, threadIdx) triple to a 2D (i, j)
# index pair (see the functor methods below); concrete subtypes decide whether
# the x/y block axes are swapped.
abstract type BlockIndexer2D end

# Direct mapping: block x drives the first index, block y the second.
struct BlockIndexerBasic <: BlockIndexer2D end
Expand All @@ -46,8 +71,7 @@ function (blkIter::BlockIndexerSwapped)(blockIdx, blockDim, threadIdx)
end

function JACC.parallel_for(
::AMDGPUBackend, (M, N)::Tuple{I, I}, f::F, x...) where {
I <: Integer, F <: Function}
::AMDGPUBackend, (M, N)::NTuple{2, Integer}, f::Function, x...)
dev = AMDGPU.device()
props = AMDGPU.HIP.properties(dev)
maxBlocks = (x = props.maxGridSize[1], y = props.maxGridSize[2])
Expand All @@ -64,7 +88,7 @@ function JACC.parallel_for(
blockAttrs = (
max_x = props.maxThreadsDim[1],
max_y = props.maxThreadsDim[2],
total = props.maxThreadsPerBlock,
total = props.maxThreadsPerBlock
)
x_thr = min(
blockAttrs.max_x,
Expand All @@ -81,14 +105,65 @@ function JACC.parallel_for(
blocks = (cld(m, x_thr), cld(n, y_thr))

shmem_size = 2 * x_thr * y_thr * sizeof(Float64)
kernel(indexer, (M, N), f, x...; groupsize=threads, gridsize=blocks, shmem=shmem_size)
kernel(indexer, (M, N), f, x...; groupsize = threads,
gridsize = blocks, shmem = shmem_size)
AMDGPU.synchronize()
end

# 2D parallel_for with an explicit LaunchSpec on the AMDGPU backend.
# Derives any unset launch parameters (threads/blocks/shared memory) from the
# device properties and occupancy query, then launches on `spec.stream`.
# (Fixed: the diff residue here embedded the stale pre-merge 3D signature
# lines inside this function's header, making the span invalid.)
function JACC.parallel_for(
        spec::LaunchSpec{AMDGPUBackend}, (M, N)::NTuple{2, Integer}, f::Function, x...)
    dev = AMDGPU.device()
    props = AMDGPU.HIP.properties(dev)
    indexer = BlockIndexerBasic()
    m, n = (M, N)

    kernel = @roc launch=false _parallel_for_amdgpu_MN(indexer, (M, N), f, x...)

    if spec.threads == 0
        # Hoisted into this branch: the occupancy query is only needed when we
        # must derive the block shape ourselves.
        config = AMDGPU.launch_configuration(kernel)
        maxBlocks = (x = props.maxGridSize[1], y = props.maxGridSize[2])
        # Swap axes when the grid allows more blocks along x than y and the
        # problem is taller than it is wide.
        if M < N && maxBlocks.x > maxBlocks.y
            indexer = BlockIndexerSwapped()
            m, n = (N, M)
        end
        # NOTE(review): `kernel` above was compiled with BlockIndexerBasic;
        # launching it with BlockIndexerSwapped relies on AMDGPU.jl handling
        # the re-specialization — confirm against upstream behavior.
        maxThreads = config.groupsize
        blockAttrs = (
            max_x = props.maxThreadsDim[1],
            max_y = props.maxThreadsDim[2],
            total = props.maxThreadsPerBlock
        )
        x_thr = min(
            blockAttrs.max_x,
            nextpow(2, m / blockAttrs.total + 1),
            blockAttrs.total,
            maxThreads
        )
        y_thr = min(
            blockAttrs.max_y,
            cld(blockAttrs.total, x_thr),
            cld(maxThreads, x_thr)
        )
        spec.threads = (x_thr, y_thr)
    end

    if spec.blocks == 0
        spec.blocks = (cld(m, spec.threads[1]), cld(n, spec.threads[2]))
    end

    if spec.shmem_size == 0
        spec.shmem_size = 2 * spec.threads[1] * spec.threads[2] *
                          sizeof(Float64)
    end

    kernel(indexer, (M, N), f, x...; groupsize = spec.threads,
        gridsize = spec.blocks, shmem = spec.shmem_size, stream = spec.stream)
    if spec.sync
        AMDGPU.synchronize(spec.stream)
    end
end

function JACC.parallel_for(
::AMDGPUBackend, (L, M, N)::NTuple{3, Integer}, f::Function, x...)
numThreads = 32
Lthreads = min(L, numThreads)
Mthreads = min(M, numThreads)
Expand All @@ -105,6 +180,33 @@ function JACC.parallel_for(
AMDGPU.synchronize()
end

# 3D parallel_for with an explicit LaunchSpec on the AMDGPU backend.
# Fills in any launch parameter the caller left at its default (0) and
# synchronizes only when `spec.sync` is set.
function JACC.parallel_for(
        spec::LaunchSpec{AMDGPUBackend}, (L, M, N)::NTuple{3, Integer}, f::Function,
        x...)
    if spec.threads == 0
        # Default block shape: up to 32 threads along L and M, one along N.
        per_dim = 32
        spec.threads = (min(L, per_dim), min(M, per_dim), 1)
    end
    if spec.blocks == 0
        spec.blocks = map(
            (extent, thr) -> ceil(Int, extent / thr), (L, M, N), spec.threads)
    end
    if spec.shmem_size == 0
        # Two Float64 slots per thread, matching the 1D/2D defaults above.
        spec.shmem_size = 2 * prod(spec.threads) * sizeof(Float64)
    end
    @roc groupsize=spec.threads gridsize=spec.blocks shmem=spec.shmem_size stream=spec.stream _parallel_for_amdgpu_LMN(
        (L, M, N), f, x...)
    spec.sync && AMDGPU.synchronize(spec.stream)
end

function JACC.parallel_reduce(
::AMDGPUBackend, N::Integer, op, f::Function, x...; init)
numThreads = 512
Expand Down
135 changes: 121 additions & 14 deletions ext/JACCCUDA/JACCCUDA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ using .Experimental

JACC.get_backend(::Val{:cuda}) = CUDABackend()

# The stream JACC uses on the CUDA backend when the caller passes none.
function default_stream()
    return CUDA.stream()
end

JACC.default_stream(::Type{CUDABackend}) = default_stream()

# Block until all work queued on `stream` has completed.
JACC.synchronize(::CUDABackend; stream = default_stream()) =
    CUDA.synchronize(stream)

# Convert each argument to its device-side representation (tuple in, tuple out).
@inline function kernel_args(args...)
    return map(cudaconvert, args)
end

@inline function kernel_maxthreads(kernel_function, kargs)
Expand All @@ -23,15 +31,36 @@ JACC.get_backend(::Val{:cuda}) = CUDABackend()
return (p_kernel, CUDA.maxthreads(p_kernel))
end

# 1D parallel_for entry point for the CUDA backend.
# Threads come from the kernel's max-threads query; shared memory is sized to
# the device maximum. `CUDA.@sync` blocks until the launch completes.
# (Fixed: the diff residue here kept both the old signature and the old,
# un-synced launch line alongside the new ones — the kernel would have been
# launched twice.)
function JACC.parallel_for(::CUDABackend, N::Integer, f::Function, x...)
    kargs = kernel_args(N, f, x...)
    kernel, maxThreads = kernel_maxthreads(_parallel_for_cuda, kargs)
    threads = min(N, maxThreads)
    blocks = ceil(Int, N / threads)
    shmem_size = attribute(
        device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
    CUDA.@sync kernel(
        kargs...; threads = threads, blocks = blocks, shmem = shmem_size)
end

# 1D parallel_for with an explicit LaunchSpec on the CUDA backend.
# Unset launch parameters (0) are derived here; synchronization happens only
# when `spec.sync` is set.
function JACC.parallel_for(
        spec::LaunchSpec{CUDABackend}, N::Integer, f::Function, x...)
    args = kernel_args(N, f, x...)
    kernel, max_threads = kernel_maxthreads(_parallel_for_cuda, args)
    if spec.threads == 0
        spec.threads = min(N, max_threads)
    end
    if spec.blocks == 0
        spec.blocks = ceil(Int, N / spec.threads)
    end
    if spec.shmem_size == 0
        # Default to the device's per-block shared-memory maximum.
        spec.shmem_size = attribute(
            device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
    end
    kernel(args...; threads = spec.threads, blocks = spec.blocks,
        shmem = spec.shmem_size, stream = spec.stream)
    spec.sync && CUDA.synchronize(spec.stream)
end

abstract type BlockIndexer2D end
Expand All @@ -53,15 +82,14 @@ function (blkIter::BlockIndexerSwapped)(blockIdx, blockDim, threadIdx)
end

function JACC.parallel_for(
::CUDABackend, (M, N)::Tuple{I, I}, f::F, x...) where {
I <: Integer, F <: Function}
::CUDABackend, (M, N)::NTuple{2, Integer}, f::Function, x...)
#To use JACC.shared, it is recommended to use a high number of threads per block to maximize the
# potential benefit from using shared memory.

dev = CUDA.device()
maxBlocks = (
x = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_X),
y = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y),
y = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y)
)
indexer = BlockIndexerBasic()
m, n = (M, N)
Expand All @@ -85,20 +113,72 @@ function JACC.parallel_for(
y_thr = min(
blockAttrs.max_y,
ceil(Int, blockAttrs.total / x_thr),
ceil(Int, maxThreads / x_thr),
ceil(Int, maxThreads / x_thr)
)
threads = (x_thr, y_thr)
blocks = (cld(m, x_thr), cld(n, y_thr))

shmem_size = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
shmem_size = attribute(
dev, CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)

kernel(kargs...; threads = threads, blocks = blocks, shmem = shmem_size)
CUDA.@sync kernel(
kargs...; threads = threads, blocks = blocks, shmem = shmem_size)
end

# 2D parallel_for with an explicit LaunchSpec on the CUDA backend.
# Derives any unset launch parameters from device attributes and the kernel's
# max-threads query, then launches on `spec.stream`.
# (Fixed: the diff residue here embedded the stale pre-merge 3D signature
# lines inside this function's header, making the span invalid.)
function JACC.parallel_for(
        spec::LaunchSpec{CUDABackend}, (M, N)::NTuple{2, Integer}, f::Function, x...)
    dev = CUDA.device()
    indexer = BlockIndexerBasic()
    m, n = (M, N)

    kargs = kernel_args(indexer, (M, N), f, x...)
    kernel, maxThreads = kernel_maxthreads(_parallel_for_cuda_MN, kargs)

    if spec.threads == 0
        maxBlocks = (
            x = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_X),
            y = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y)
        )
        # Swap axes when the grid allows more blocks along x than y and the
        # problem is taller than it is wide.
        if M < N && maxBlocks.x > maxBlocks.y
            indexer = BlockIndexerSwapped()
            m, n = (N, M)
        end
        # NOTE(review): `kargs` above was built with BlockIndexerBasic and is
        # what the launch below passes, so the swapped indexer appears to only
        # affect the thread-shape math here — confirm this is intentional.
        blockAttrs = (
            max_x = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X),
            max_y = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y),
            total = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
        x_thr = min(
            blockAttrs.max_x,
            nextpow(2, m / blockAttrs.total + 1),
            blockAttrs.total,
            maxThreads
        )
        y_thr = min(
            blockAttrs.max_y,
            ceil(Int, blockAttrs.total / x_thr),
            ceil(Int, maxThreads / x_thr)
        )
        spec.threads = (x_thr, y_thr)
    end

    if spec.blocks == 0
        spec.blocks = (cld(m, spec.threads[1]), cld(n, spec.threads[2]))
    end

    if spec.shmem_size == 0
        spec.shmem_size = attribute(
            dev, CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
    end

    kernel(kargs...; threads = spec.threads, blocks = spec.blocks,
        shmem = spec.shmem_size, stream = spec.stream)
    if spec.sync
        CUDA.synchronize(spec.stream)
    end
end

function JACC.parallel_for(
::CUDABackend, (L, M, N)::NTuple{3, Integer}, f::Function, x...)
#To use JACC.shared, it is recommended to use a high number of threads per block to maximize the
# potential benefit from using shared memory.
numThreads = 32
Expand All @@ -115,6 +195,33 @@ function JACC.parallel_for(
(L, M, N), f, x...)
end

# 3D parallel_for with an explicit LaunchSpec on the CUDA backend.
# Fills in any launch parameter the caller left at its default (0) and
# synchronizes only when `spec.sync` is set.
function JACC.parallel_for(
        spec::LaunchSpec{CUDABackend}, (L, M, N)::NTuple{3, Integer}, f::Function,
        x...)
    if spec.threads == 0
        # Default block shape: up to 32 threads along L and M, one along N.
        per_dim = 32
        spec.threads = (min(L, per_dim), min(M, per_dim), 1)
    end
    if spec.blocks == 0
        spec.blocks = map(
            (extent, thr) -> ceil(Int, extent / thr), (L, M, N), spec.threads)
    end
    if spec.shmem_size == 0
        # Default to the device's per-block shared-memory maximum.
        spec.shmem_size = attribute(
            device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
    end
    @cuda threads=spec.threads blocks=spec.blocks shmem=spec.shmem_size stream=spec.stream _parallel_for_cuda_LMN(
        (L, M, N), f, x...)
    spec.sync && CUDA.synchronize(spec.stream)
end

function JACC.parallel_reduce(
::CUDABackend, N::Integer, op, f::Function, x...; init)
ret_inst = CUDA.CuArray{typeof(init)}(undef, 0)
Expand Down Expand Up @@ -249,7 +356,7 @@ function reduce_kernel_cuda(N, op, red, ret)
end

function _parallel_reduce_cuda_MN((M, N), op, ret, f, x...)
shared_mem = CuDynamicSharedArray(eltype(ret), 16*16)
shared_mem = CuDynamicSharedArray(eltype(ret), 16 * 16)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
ti = threadIdx().x
Expand Down Expand Up @@ -300,7 +407,7 @@ function _parallel_reduce_cuda_MN((M, N), op, ret, f, x...)
end

function reduce_kernel_cuda_MN((M, N), op, red, ret)
shared_mem = CuDynamicSharedArray(eltype(ret), 16*16)
shared_mem = CuDynamicSharedArray(eltype(ret), 16 * 16)
i = threadIdx().x
j = threadIdx().y
ii = i
Expand Down
Loading

0 comments on commit 43ef00f

Please sign in to comment.