Skip to content

Commit

Permalink
Merge pull request #188 from PhilipFackler/kw-struct
Browse files Browse the repository at this point in the history
Add parallel_for API with keyword struct
  • Loading branch information
williamfgc authored Feb 14, 2025
2 parents 48383bb + ef6b464 commit 43ef00f
Show file tree
Hide file tree
Showing 8 changed files with 454 additions and 106 deletions.
122 changes: 112 additions & 10 deletions ext/JACCAMDGPU/JACCAMDGPU.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,42 @@ using .Experimental

JACC.get_backend(::Val{:amdgpu}) = AMDGPUBackend()

# Stream used when the caller does not supply one explicitly.
default_stream() = AMDGPU.stream()

JACC.default_stream(::Type{AMDGPUBackend}) = default_stream()

# 1D parallel_for entry point for the AMDGPU backend.
# Launch geometry comes from the occupancy query on the compiled kernel;
# shared memory defaults to two Float64 slots per thread. Always synchronizes.
# (Fixed: the diff residue here duplicated the old signature and the old,
# un-wrapped kernel launch line — the kernel would have been launched twice.)
function JACC.parallel_for(::AMDGPUBackend, N::Integer, f::Function, x...)
    kernel = @roc launch=false _parallel_for_amdgpu(N, f, x...)
    config = AMDGPU.launch_configuration(kernel)
    threads = min(N, config.groupsize)
    blocks = cld(N, threads)
    shmem_size = 2 * threads * sizeof(Float64)
    kernel(
        N, f, x...; groupsize = threads, gridsize = blocks, shmem = shmem_size)
    AMDGPU.synchronize()
end

# 1D parallel_for with an explicit LaunchSpec on the AMDGPU backend.
# Any launch parameter the caller left at its default (0) is filled in from
# the occupancy query; synchronization happens only when `spec.sync` is set.
function JACC.parallel_for(
        spec::LaunchSpec{AMDGPUBackend}, N::Integer, f::Function, x...)
    kernel = @roc launch=false _parallel_for_amdgpu(N, f, x...)
    if spec.threads == 0
        config = AMDGPU.launch_configuration(kernel)
        spec.threads = min(N, config.groupsize)
    end
    if spec.blocks == 0
        spec.blocks = cld(N, spec.threads)
    end
    # Fixed: was `spec.shmem_size == nothing`, which is always false for an
    # integer field, so the default shared-memory size was never applied.
    # Every sibling LaunchSpec method treats 0 as "unset".
    if spec.shmem_size == 0
        spec.shmem_size = 2 * spec.threads * sizeof(Float64)
    end
    kernel(
        N, f, x...; groupsize = spec.threads, gridsize = spec.blocks,
        shmem = spec.shmem_size, stream = spec.stream)
    if spec.sync
        AMDGPU.synchronize(spec.stream)
    end
end

# Strategy type: maps a (blockIdx, blockDim, threadIdx) triple to a 2D (i, j)
# index pair (see the functor methods below); concrete subtypes decide whether
# the x/y block axes are swapped.
abstract type BlockIndexer2D end

# Direct mapping: block x drives the first index, block y the second.
struct BlockIndexerBasic <: BlockIndexer2D end
Expand All @@ -46,8 +71,7 @@ function (blkIter::BlockIndexerSwapped)(blockIdx, blockDim, threadIdx)
end

function JACC.parallel_for(
::AMDGPUBackend, (M, N)::Tuple{I, I}, f::F, x...) where {
I <: Integer, F <: Function}
::AMDGPUBackend, (M, N)::NTuple{2, Integer}, f::Function, x...)
dev = AMDGPU.device()
props = AMDGPU.HIP.properties(dev)
maxBlocks = (x = props.maxGridSize[1], y = props.maxGridSize[2])
Expand All @@ -64,7 +88,7 @@ function JACC.parallel_for(
blockAttrs = (
max_x = props.maxThreadsDim[1],
max_y = props.maxThreadsDim[2],
total = props.maxThreadsPerBlock,
total = props.maxThreadsPerBlock
)
x_thr = min(
blockAttrs.max_x,
Expand All @@ -81,14 +105,65 @@ function JACC.parallel_for(
blocks = (cld(m, x_thr), cld(n, y_thr))

shmem_size = 2 * x_thr * y_thr * sizeof(Float64)
kernel(indexer, (M, N), f, x...; groupsize=threads, gridsize=blocks, shmem=shmem_size)
kernel(indexer, (M, N), f, x...; groupsize = threads,
gridsize = blocks, shmem = shmem_size)
AMDGPU.synchronize()
end

# 2D parallel_for with an explicit LaunchSpec on the AMDGPU backend.
# Derives any unset launch parameters (threads/blocks/shared memory) from the
# device properties and occupancy query, then launches on `spec.stream`.
# (Fixed: the diff residue here embedded the stale pre-merge 3D signature
# lines inside this function's header, making the span invalid.)
function JACC.parallel_for(
        spec::LaunchSpec{AMDGPUBackend}, (M, N)::NTuple{2, Integer}, f::Function, x...)
    dev = AMDGPU.device()
    props = AMDGPU.HIP.properties(dev)
    indexer = BlockIndexerBasic()
    m, n = (M, N)

    kernel = @roc launch=false _parallel_for_amdgpu_MN(indexer, (M, N), f, x...)

    if spec.threads == 0
        # Hoisted into this branch: the occupancy query is only needed when we
        # must derive the block shape ourselves.
        config = AMDGPU.launch_configuration(kernel)
        maxBlocks = (x = props.maxGridSize[1], y = props.maxGridSize[2])
        # Swap axes when the grid allows more blocks along x than y and the
        # problem is taller than it is wide.
        if M < N && maxBlocks.x > maxBlocks.y
            indexer = BlockIndexerSwapped()
            m, n = (N, M)
        end
        # NOTE(review): `kernel` above was compiled with BlockIndexerBasic;
        # launching it with BlockIndexerSwapped relies on AMDGPU.jl handling
        # the re-specialization — confirm against upstream behavior.
        maxThreads = config.groupsize
        blockAttrs = (
            max_x = props.maxThreadsDim[1],
            max_y = props.maxThreadsDim[2],
            total = props.maxThreadsPerBlock
        )
        x_thr = min(
            blockAttrs.max_x,
            nextpow(2, m / blockAttrs.total + 1),
            blockAttrs.total,
            maxThreads
        )
        y_thr = min(
            blockAttrs.max_y,
            cld(blockAttrs.total, x_thr),
            cld(maxThreads, x_thr)
        )
        spec.threads = (x_thr, y_thr)
    end

    if spec.blocks == 0
        spec.blocks = (cld(m, spec.threads[1]), cld(n, spec.threads[2]))
    end

    if spec.shmem_size == 0
        spec.shmem_size = 2 * spec.threads[1] * spec.threads[2] *
                          sizeof(Float64)
    end

    kernel(indexer, (M, N), f, x...; groupsize = spec.threads,
        gridsize = spec.blocks, shmem = spec.shmem_size, stream = spec.stream)
    if spec.sync
        AMDGPU.synchronize(spec.stream)
    end
end

function JACC.parallel_for(
::AMDGPUBackend, (L, M, N)::NTuple{3, Integer}, f::Function, x...)
numThreads = 32
Lthreads = min(L, numThreads)
Mthreads = min(M, numThreads)
Expand All @@ -105,6 +180,33 @@ function JACC.parallel_for(
AMDGPU.synchronize()
end

# 3D parallel_for with an explicit LaunchSpec on the AMDGPU backend.
# Fills in any launch parameter the caller left at its default (0) and
# synchronizes only when `spec.sync` is set.
function JACC.parallel_for(
        spec::LaunchSpec{AMDGPUBackend}, (L, M, N)::NTuple{3, Integer}, f::Function,
        x...)
    if spec.threads == 0
        # Default block shape: up to 32 threads along L and M, one along N.
        per_dim = 32
        spec.threads = (min(L, per_dim), min(M, per_dim), 1)
    end
    if spec.blocks == 0
        spec.blocks = map(
            (extent, thr) -> ceil(Int, extent / thr), (L, M, N), spec.threads)
    end
    if spec.shmem_size == 0
        # Two Float64 slots per thread, matching the 1D/2D defaults above.
        spec.shmem_size = 2 * prod(spec.threads) * sizeof(Float64)
    end
    @roc groupsize=spec.threads gridsize=spec.blocks shmem=spec.shmem_size stream=spec.stream _parallel_for_amdgpu_LMN(
        (L, M, N), f, x...)
    spec.sync && AMDGPU.synchronize(spec.stream)
end

function JACC.parallel_reduce(
::AMDGPUBackend, N::Integer, op, f::Function, x...; init)
numThreads = 512
Expand Down
135 changes: 121 additions & 14 deletions ext/JACCCUDA/JACCCUDA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ using .Experimental

JACC.get_backend(::Val{:cuda}) = CUDABackend()

# The stream JACC uses on the CUDA backend when the caller passes none.
function default_stream()
    return CUDA.stream()
end

JACC.default_stream(::Type{CUDABackend}) = default_stream()

# Block until all work queued on `stream` has completed.
JACC.synchronize(::CUDABackend; stream = default_stream()) =
    CUDA.synchronize(stream)

# Convert each argument to its device-side representation (tuple in, tuple out).
@inline function kernel_args(args...)
    return map(cudaconvert, args)
end

@inline function kernel_maxthreads(kernel_function, kargs)
Expand All @@ -23,15 +31,36 @@ JACC.get_backend(::Val{:cuda}) = CUDABackend()
return (p_kernel, CUDA.maxthreads(p_kernel))
end

# 1D parallel_for entry point for the CUDA backend.
# Threads come from the kernel's max-threads query; shared memory is sized to
# the device maximum. `CUDA.@sync` blocks until the launch completes.
# (Fixed: the diff residue here kept both the old signature and the old,
# un-synced launch line alongside the new ones — the kernel would have been
# launched twice.)
function JACC.parallel_for(::CUDABackend, N::Integer, f::Function, x...)
    kargs = kernel_args(N, f, x...)
    kernel, maxThreads = kernel_maxthreads(_parallel_for_cuda, kargs)
    threads = min(N, maxThreads)
    blocks = ceil(Int, N / threads)
    shmem_size = attribute(
        device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
    CUDA.@sync kernel(
        kargs...; threads = threads, blocks = blocks, shmem = shmem_size)
end

# 1D parallel_for with an explicit LaunchSpec on the CUDA backend.
# Unset launch parameters (0) are derived here; synchronization happens only
# when `spec.sync` is set.
function JACC.parallel_for(
        spec::LaunchSpec{CUDABackend}, N::Integer, f::Function, x...)
    args = kernel_args(N, f, x...)
    kernel, max_threads = kernel_maxthreads(_parallel_for_cuda, args)
    if spec.threads == 0
        spec.threads = min(N, max_threads)
    end
    if spec.blocks == 0
        spec.blocks = ceil(Int, N / spec.threads)
    end
    if spec.shmem_size == 0
        # Default to the device's per-block shared-memory maximum.
        spec.shmem_size = attribute(
            device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
    end
    kernel(args...; threads = spec.threads, blocks = spec.blocks,
        shmem = spec.shmem_size, stream = spec.stream)
    spec.sync && CUDA.synchronize(spec.stream)
end

abstract type BlockIndexer2D end
Expand All @@ -53,15 +82,14 @@ function (blkIter::BlockIndexerSwapped)(blockIdx, blockDim, threadIdx)
end

function JACC.parallel_for(
::CUDABackend, (M, N)::Tuple{I, I}, f::F, x...) where {
I <: Integer, F <: Function}
::CUDABackend, (M, N)::NTuple{2, Integer}, f::Function, x...)
#To use JACC.shared, it is recommended to use a high number of threads per block to maximize the
# potential benefit from using shared memory.

dev = CUDA.device()
maxBlocks = (
x = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_X),
y = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y),
y = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y)
)
indexer = BlockIndexerBasic()
m, n = (M, N)
Expand All @@ -85,20 +113,72 @@ function JACC.parallel_for(
y_thr = min(
blockAttrs.max_y,
ceil(Int, blockAttrs.total / x_thr),
ceil(Int, maxThreads / x_thr),
ceil(Int, maxThreads / x_thr)
)
threads = (x_thr, y_thr)
blocks = (cld(m, x_thr), cld(n, y_thr))

shmem_size = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
shmem_size = attribute(
dev, CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)

kernel(kargs...; threads = threads, blocks = blocks, shmem = shmem_size)
CUDA.@sync kernel(
kargs...; threads = threads, blocks = blocks, shmem = shmem_size)
end

# 2D parallel_for with an explicit LaunchSpec on the CUDA backend.
# Derives any unset launch parameters from device attributes and the kernel's
# max-threads query, then launches on `spec.stream`.
# (Fixed: the diff residue here embedded the stale pre-merge 3D signature
# lines inside this function's header, making the span invalid.)
function JACC.parallel_for(
        spec::LaunchSpec{CUDABackend}, (M, N)::NTuple{2, Integer}, f::Function, x...)
    dev = CUDA.device()
    indexer = BlockIndexerBasic()
    m, n = (M, N)

    kargs = kernel_args(indexer, (M, N), f, x...)
    kernel, maxThreads = kernel_maxthreads(_parallel_for_cuda_MN, kargs)

    if spec.threads == 0
        maxBlocks = (
            x = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_X),
            y = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y)
        )
        # Swap axes when the grid allows more blocks along x than y and the
        # problem is taller than it is wide.
        if M < N && maxBlocks.x > maxBlocks.y
            indexer = BlockIndexerSwapped()
            m, n = (N, M)
        end
        # NOTE(review): `kargs` above was built with BlockIndexerBasic and is
        # what the launch below passes, so the swapped indexer appears to only
        # affect the thread-shape math here — confirm this is intentional.
        blockAttrs = (
            max_x = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X),
            max_y = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y),
            total = attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK))
        x_thr = min(
            blockAttrs.max_x,
            nextpow(2, m / blockAttrs.total + 1),
            blockAttrs.total,
            maxThreads
        )
        y_thr = min(
            blockAttrs.max_y,
            ceil(Int, blockAttrs.total / x_thr),
            ceil(Int, maxThreads / x_thr)
        )
        spec.threads = (x_thr, y_thr)
    end

    if spec.blocks == 0
        spec.blocks = (cld(m, spec.threads[1]), cld(n, spec.threads[2]))
    end

    if spec.shmem_size == 0
        spec.shmem_size = attribute(
            dev, CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
    end

    kernel(kargs...; threads = spec.threads, blocks = spec.blocks,
        shmem = spec.shmem_size, stream = spec.stream)
    if spec.sync
        CUDA.synchronize(spec.stream)
    end
end

function JACC.parallel_for(
::CUDABackend, (L, M, N)::NTuple{3, Integer}, f::Function, x...)
#To use JACC.shared, it is recommended to use a high number of threads per block to maximize the
# potential benefit from using shared memory.
numThreads = 32
Expand All @@ -115,6 +195,33 @@ function JACC.parallel_for(
(L, M, N), f, x...)
end

# 3D parallel_for with an explicit LaunchSpec on the CUDA backend.
# Fills in any launch parameter the caller left at its default (0) and
# synchronizes only when `spec.sync` is set.
function JACC.parallel_for(
        spec::LaunchSpec{CUDABackend}, (L, M, N)::NTuple{3, Integer}, f::Function,
        x...)
    if spec.threads == 0
        # Default block shape: up to 32 threads along L and M, one along N.
        per_dim = 32
        spec.threads = (min(L, per_dim), min(M, per_dim), 1)
    end
    if spec.blocks == 0
        spec.blocks = map(
            (extent, thr) -> ceil(Int, extent / thr), (L, M, N), spec.threads)
    end
    if spec.shmem_size == 0
        # Default to the device's per-block shared-memory maximum.
        spec.shmem_size = attribute(
            device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)
    end
    @cuda threads=spec.threads blocks=spec.blocks shmem=spec.shmem_size stream=spec.stream _parallel_for_cuda_LMN(
        (L, M, N), f, x...)
    spec.sync && CUDA.synchronize(spec.stream)
end

function JACC.parallel_reduce(
::CUDABackend, N::Integer, op, f::Function, x...; init)
ret_inst = CUDA.CuArray{typeof(init)}(undef, 0)
Expand Down Expand Up @@ -249,7 +356,7 @@ function reduce_kernel_cuda(N, op, red, ret)
end

function _parallel_reduce_cuda_MN((M, N), op, ret, f, x...)
shared_mem = CuDynamicSharedArray(eltype(ret), 16*16)
shared_mem = CuDynamicSharedArray(eltype(ret), 16 * 16)
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
ti = threadIdx().x
Expand Down Expand Up @@ -300,7 +407,7 @@ function _parallel_reduce_cuda_MN((M, N), op, ret, f, x...)
end

function reduce_kernel_cuda_MN((M, N), op, red, ret)
shared_mem = CuDynamicSharedArray(eltype(ret), 16*16)
shared_mem = CuDynamicSharedArray(eltype(ret), 16 * 16)
i = threadIdx().x
j = threadIdx().y
ii = i
Expand Down
Loading

0 comments on commit 43ef00f

Please sign in to comment.