From 6a31bce481d1b35242052bacaa7eb15a569e670b Mon Sep 17 00:00:00 2001
From: pedrovalerolara
Date: Thu, 9 May 2024 13:20:57 -0400
Subject: [PATCH 1/2] Added JACC BLAS module. Only dot and axpy for the moment. Added tests for JACC.BLAS to all backend test suites. Fixed a small bug in the accelerator backends' one-dimensional parallel_reduce implementations.

---
 Project.toml                 |  1 +
 ext/JACCAMDGPU/JACCAMDGPU.jl |  2 +-
 ext/JACCCUDA/JACCCUDA.jl     |  8 ++++----
 ext/JACCONEAPI/JACCONEAPI.jl |  2 +-
 src/JACC.jl                  |  3 +++
 src/JACCBLAS.jl              | 21 +++++++++++++++++++++
 test/tests_amdgpu.jl         | 33 +++++++++++++++++++++++++++++++++
 test/tests_cuda.jl           | 33 +++++++++++++++++++++++++++++++++
 test/tests_oneapi.jl         | 35 +++++++++++++++++++++++++++++++++++
 test/tests_threads.jl        | 33 +++++++++++++++++++++++++++++++++
 10 files changed, 165 insertions(+), 6 deletions(-)
 create mode 100644 src/JACCBLAS.jl

diff --git a/Project.toml b/Project.toml
index 5e9b0ac..add4aaa 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,6 +4,7 @@ authors = ["pedrovalerolara ", "williamfgc
diff --git a/ext/JACCCUDA/JACCCUDA.jl b/ext/JACCCUDA/JACCCUDA.jl
index 95c12dc..7751992 100644
--- a/ext/JACCCUDA/JACCCUDA.jl
+++ b/ext/JACCCUDA/JACCCUDA.jl
@@ -128,7 +128,7 @@ end
 
 function reduce_kernel_cuda(N, red, ret)
     shared_mem = @cuDynamicSharedMem(Float64, 512)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    i = threadIdx().x
     ii = i
     tmp::Float64 = 0.0
     if N > 512
@@ -136,10 +136,10 @@ function reduce_kernel_cuda(N, red, ret)
             tmp += @inbounds red[ii]
             ii += 512
         end
-    else
-        tmp = @inbounds red[i]
+    elseif (i <= N)
+        tmp = @inbounds red[i]
     end
-    shared_mem[i] = tmp
+    shared_mem[threadIdx().x] = tmp
     sync_threads()
     if (i <= 256)
         shared_mem[i] += shared_mem[i + 256]
diff --git a/ext/JACCONEAPI/JACCONEAPI.jl b/ext/JACCONEAPI/JACCONEAPI.jl
index 3867023..98de3c2 100644
--- a/ext/JACCONEAPI/JACCONEAPI.jl
+++ b/ext/JACCONEAPI/JACCONEAPI.jl
@@ -125,7 +125,7 @@ function reduce_kernel_oneapi(N, red, ret)
             tmp += @inbounds red[ii]
             ii += 256
         end
-    else
+    elseif (i <= N)
         tmp = @inbounds red[i]
     end
     shared_mem[i] = tmp
diff --git a/src/JACC.jl b/src/JACC.jl
index 89b23bd..e88c06e 100644
--- a/src/JACC.jl
+++ b/src/JACC.jl
@@ -8,6 +8,9 @@ include("helper.jl")
 # overloaded array functions
 include("array.jl")
 
+include("JACCBLAS.jl")
+using .BLAS
+
 export Array, @atomic
 
 export parallel_for
diff --git a/src/JACCBLAS.jl b/src/JACCBLAS.jl
new file mode 100644
index 0000000..387b2ad
--- /dev/null
+++ b/src/JACCBLAS.jl
@@ -0,0 +1,21 @@
+module BLAS
+
+using JACC
+
+function _axpy(i, alpha, x, y)
+    @inbounds x[i] += alpha * y[i]
+end
+
+function _dot(i, x, y)
+    return @inbounds x[i] * y[i]
+end
+
+function axpy(n::I, alpha, x, y) where {I<:Integer}
+    JACC.parallel_for(n, _axpy, alpha, x, y)
+end
+
+function dot(n::I, x, y) where {I<:Integer}
+    JACC.parallel_reduce(n, _dot, x, y)
+end
+
+end # module BLAS
diff --git a/test/tests_amdgpu.jl b/test/tests_amdgpu.jl
index 7987228..d1a495b 100644
--- a/test/tests_amdgpu.jl
+++ b/test/tests_amdgpu.jl
@@ -96,3 +96,36 @@ end
     JACC.parallel_for(N, minus_one, x)
     @test zeros(N)≈Array(x) rtol=1e-5
 end
+
+@testset "JACC.BLAS" begin
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    x = ones(1_000)
+    y = ones(1_000)
+    jx = JACC.ones(1_000)
+    jy = JACC.ones(1_000)
+    alpha = 2.0
+
+    seq_axpy(1_000, alpha, x, y)
+    ref_result = seq_dot(1_000, x, y)
+
+    JACC.BLAS.axpy(1_000, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(1_000, jx, jy)
+    result = Array(jresult)
+
+    @test result[1]≈ref_result rtol=1e-8
+
+end
diff --git a/test/tests_cuda.jl b/test/tests_cuda.jl
index b07536c..a1686be 100644
--- a/test/tests_cuda.jl
+++ b/test/tests_cuda.jl
@@ -133,3 +133,36 @@ end
 #     C[i] = A[i] + B[i]
 #   end
 # end
+
+@testset "JACC.BLAS" begin
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    x = ones(1_000)
+    y = ones(1_000)
+    jx = JACC.ones(1_000)
+    jy = JACC.ones(1_000)
+    alpha = 2.0
+
+    seq_axpy(1_000, alpha, x, y)
+    ref_result = seq_dot(1_000, x, y)
+
+    JACC.BLAS.axpy(1_000, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(1_000, jx, jy)
+    result = Array(jresult)
+
+    @test result[1]≈ref_result rtol=1e-8
+
+end
diff --git a/test/tests_oneapi.jl b/test/tests_oneapi.jl
index 1ba6f10..bc4af4f 100644
--- a/test/tests_oneapi.jl
+++ b/test/tests_oneapi.jl
@@ -48,3 +48,38 @@ end
 
     @test Array(x_device)≈x_expected rtol=1e-1
 end
+
+@testset "JACC.BLAS" begin
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    SIZE = Int32(1_000)
+    x = ones(Float32, SIZE)
+    y = ones(Float32, SIZE)
+    jx = JACC.ones(Float32, SIZE)
+    jy = JACC.ones(Float32, SIZE)
+    alpha = Float32(2.0)
+
+    seq_axpy(SIZE, alpha, x, y)
+    ref_result = seq_dot(SIZE, x, y)
+
+    JACC.BLAS.axpy(SIZE, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(SIZE, jx, jy)
+    result = Array(jresult)
+
+    @test result[1]≈ref_result rtol=1e-8
+
+end
+
diff --git a/test/tests_threads.jl b/test/tests_threads.jl
index bc5b944..80953a0 100644
--- a/test/tests_threads.jl
+++ b/test/tests_threads.jl
@@ -277,3 +277,36 @@ end
 
     @test f2≈df2 rtol=1e-1
 end
+
+@testset "JACC.BLAS" begin
+
+    x = ones(1_000)
+    y = ones(1_000)
+    jx = JACC.ones(1_000)
+    jy = JACC.ones(1_000)
+    alpha = 2.0
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    seq_axpy(1_000, alpha, x, y)
+    ref_result = seq_dot(1_000, x, y)
+
+    JACC.BLAS.axpy(1_000, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(1_000, jx, jy)
+    result = jresult[1]
+
+    @test result≈ref_result rtol=1e-8
+
+end

From 02845729caee11fe4aec5682293292c6127b3f1a Mon Sep 17 00:00:00 2001
From: pedrovalerolara
Date: Thu, 9 May 2024 13:20:57 -0400
Subject: [PATCH 2/2] Fixed previous commit: removed the line added to Project.toml and fixed a bug in the AMDGPU backend's parallel_reduce implementation. Previous commit was: Added JACC BLAS module ...

---
 ext/JACCAMDGPU/JACCAMDGPU.jl | 12 ++++++------
 ext/JACCCUDA/JACCCUDA.jl     |  8 ++++----
 ext/JACCONEAPI/JACCONEAPI.jl |  2 +-
 src/JACC.jl                  |  3 +++
 src/JACCBLAS.jl              | 21 +++++++++++++++++++++
 test/tests_amdgpu.jl         | 33 +++++++++++++++++++++++++++++++++
 test/tests_cuda.jl           | 33 +++++++++++++++++++++++++++++++++
 test/tests_oneapi.jl         | 35 +++++++++++++++++++++++++++++++++++
 test/tests_threads.jl        | 33 +++++++++++++++++++++++++++++++++
 9 files changed, 169 insertions(+), 11 deletions(-)
 create mode 100644 src/JACCBLAS.jl

diff --git a/ext/JACCAMDGPU/JACCAMDGPU.jl b/ext/JACCAMDGPU/JACCAMDGPU.jl
index 0fe2f4a..abf7e5d 100644
--- a/ext/JACCAMDGPU/JACCAMDGPU.jl
+++ b/ext/JACCAMDGPU/JACCAMDGPU.jl
@@ -34,7 +34,7 @@ function JACC.parallel_reduce(
     @roc groupsize=threads gridsize=blocks _parallel_reduce_amdgpu(
         N, ret, f, x...)
     AMDGPU.synchronize()
-    @roc groupsize=threads gridsize=threads reduce_kernel_amdgpu(
+    @roc groupsize=threads gridsize=1 reduce_kernel_amdgpu(
         blocks, ret, rret)
     AMDGPU.synchronize()
     return rret
@@ -52,7 +52,7 @@ function JACC.parallel_reduce(
     @roc groupsize=(Mthreads, Nthreads) gridsize=(Mblocks, Nblocks) _parallel_reduce_amdgpu_MN(
         (M, N), ret, f, x...)
     AMDGPU.synchronize()
-    @roc groupsize=(Mthreads, Nthreads) gridsize=(Mthreads, Nthreads) reduce_kernel_amdgpu_MN(
+    @roc groupsize=(Mthreads, Nthreads) gridsize=(1, 1) reduce_kernel_amdgpu_MN(
         (Mblocks, Nblocks), ret, rret)
     AMDGPU.synchronize()
     return rret
@@ -125,7 +125,7 @@ end
 
 function reduce_kernel_amdgpu(N, red, ret)
     shared_mem = @ROCStaticLocalArray(Float64, 512)
-    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+    i = workitemIdx().x
     ii = i
     tmp::Float64 = 0.0
     if N > 512
@@ -133,7 +133,7 @@
             tmp += @inbounds red[ii]
             ii += 512
         end
-    else
+    elseif (i <= N)
         tmp = @inbounds red[i]
     end
     shared_mem[i] = tmp
@@ -223,8 +223,8 @@ end
 
 function reduce_kernel_amdgpu_MN((M, N), red, ret)
     shared_mem = @ROCStaticLocalArray(Float64, 256)
-    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
-    j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y
+    i = workitemIdx().x
+    j = workitemIdx().y
     ii = i
     jj = j
 
diff --git a/ext/JACCCUDA/JACCCUDA.jl b/ext/JACCCUDA/JACCCUDA.jl
index 95c12dc..7751992 100644
--- a/ext/JACCCUDA/JACCCUDA.jl
+++ b/ext/JACCCUDA/JACCCUDA.jl
@@ -128,7 +128,7 @@ end
 
 function reduce_kernel_cuda(N, red, ret)
     shared_mem = @cuDynamicSharedMem(Float64, 512)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    i = threadIdx().x
     ii = i
     tmp::Float64 = 0.0
     if N > 512
@@ -136,10 +136,10 @@ function reduce_kernel_cuda(N, red, ret)
             tmp += @inbounds red[ii]
             ii += 512
         end
-    else
-        tmp = @inbounds red[i]
+    elseif (i <= N)
+        tmp = @inbounds red[i]
     end
-    shared_mem[i] = tmp
+    shared_mem[threadIdx().x] = tmp
     sync_threads()
     if (i <= 256)
         shared_mem[i] += shared_mem[i + 256]
diff --git a/ext/JACCONEAPI/JACCONEAPI.jl b/ext/JACCONEAPI/JACCONEAPI.jl
index 3867023..98de3c2 100644
--- a/ext/JACCONEAPI/JACCONEAPI.jl
+++ b/ext/JACCONEAPI/JACCONEAPI.jl
@@ -125,7 +125,7 @@ function reduce_kernel_oneapi(N, red, ret)
             tmp += @inbounds red[ii]
             ii += 256
         end
-    else
+    elseif (i <= N)
         tmp = @inbounds red[i]
     end
     shared_mem[i] = tmp
diff --git a/src/JACC.jl b/src/JACC.jl
index 89b23bd..e88c06e 100644
--- a/src/JACC.jl
+++ b/src/JACC.jl
@@ -8,6 +8,9 @@ include("helper.jl")
 # overloaded array functions
 include("array.jl")
 
+include("JACCBLAS.jl")
+using .BLAS
+
 export Array, @atomic
 
 export parallel_for
diff --git a/src/JACCBLAS.jl b/src/JACCBLAS.jl
new file mode 100644
index 0000000..387b2ad
--- /dev/null
+++ b/src/JACCBLAS.jl
@@ -0,0 +1,21 @@
+module BLAS
+
+using JACC
+
+function _axpy(i, alpha, x, y)
+    @inbounds x[i] += alpha * y[i]
+end
+
+function _dot(i, x, y)
+    return @inbounds x[i] * y[i]
+end
+
+function axpy(n::I, alpha, x, y) where {I<:Integer}
+    JACC.parallel_for(n, _axpy, alpha, x, y)
+end
+
+function dot(n::I, x, y) where {I<:Integer}
+    JACC.parallel_reduce(n, _dot, x, y)
+end
+
+end # module BLAS
diff --git a/test/tests_amdgpu.jl b/test/tests_amdgpu.jl
index 7987228..d1a495b 100644
--- a/test/tests_amdgpu.jl
+++ b/test/tests_amdgpu.jl
@@ -96,3 +96,36 @@ end
     JACC.parallel_for(N, minus_one, x)
     @test zeros(N)≈Array(x) rtol=1e-5
 end
+
+@testset "JACC.BLAS" begin
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    x = ones(1_000)
+    y = ones(1_000)
+    jx = JACC.ones(1_000)
+    jy = JACC.ones(1_000)
+    alpha = 2.0
+
+    seq_axpy(1_000, alpha, x, y)
+    ref_result = seq_dot(1_000, x, y)
+
+    JACC.BLAS.axpy(1_000, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(1_000, jx, jy)
+    result = Array(jresult)
+
+    @test result[1]≈ref_result rtol=1e-8
+
+end
diff --git a/test/tests_cuda.jl b/test/tests_cuda.jl
index b07536c..a1686be 100644
--- a/test/tests_cuda.jl
+++ b/test/tests_cuda.jl
@@ -133,3 +133,36 @@ end
 #     C[i] = A[i] + B[i]
 #   end
 # end
+
+@testset "JACC.BLAS" begin
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    x = ones(1_000)
+    y = ones(1_000)
+    jx = JACC.ones(1_000)
+    jy = JACC.ones(1_000)
+    alpha = 2.0
+
+    seq_axpy(1_000, alpha, x, y)
+    ref_result = seq_dot(1_000, x, y)
+
+    JACC.BLAS.axpy(1_000, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(1_000, jx, jy)
+    result = Array(jresult)
+
+    @test result[1]≈ref_result rtol=1e-8
+
+end
diff --git a/test/tests_oneapi.jl b/test/tests_oneapi.jl
index 1ba6f10..bc4af4f 100644
--- a/test/tests_oneapi.jl
+++ b/test/tests_oneapi.jl
@@ -48,3 +48,38 @@ end
 
     @test Array(x_device)≈x_expected rtol=1e-1
 end
+
+@testset "JACC.BLAS" begin
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    SIZE = Int32(1_000)
+    x = ones(Float32, SIZE)
+    y = ones(Float32, SIZE)
+    jx = JACC.ones(Float32, SIZE)
+    jy = JACC.ones(Float32, SIZE)
+    alpha = Float32(2.0)
+
+    seq_axpy(SIZE, alpha, x, y)
+    ref_result = seq_dot(SIZE, x, y)
+
+    JACC.BLAS.axpy(SIZE, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(SIZE, jx, jy)
+    result = Array(jresult)
+
+    @test result[1]≈ref_result rtol=1e-8
+
+end
+
diff --git a/test/tests_threads.jl b/test/tests_threads.jl
index bc5b944..80953a0 100644
--- a/test/tests_threads.jl
+++ b/test/tests_threads.jl
@@ -277,3 +277,36 @@ end
 
     @test f2≈df2 rtol=1e-1
 end
+
+@testset "JACC.BLAS" begin
+
+    x = ones(1_000)
+    y = ones(1_000)
+    jx = JACC.ones(1_000)
+    jy = JACC.ones(1_000)
+    alpha = 2.0
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    seq_axpy(1_000, alpha, x, y)
+    ref_result = seq_dot(1_000, x, y)
+
+    JACC.BLAS.axpy(1_000, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(1_000, jx, jy)
+    result = jresult[1]
+
+    @test result≈ref_result rtol=1e-8
+
+end
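
Usage sketch for the new JACC.BLAS module, mirroring the tests above. This is a minimal example, assuming the default Threads backend, where JACC arrays are plain host arrays; with the CUDA, AMDGPU, or oneAPI extension loaded, jx, jy, and the dot result live on the device instead.

    using JACC

    n = 1_000
    alpha = 2.0
    jx = JACC.ones(n)    # length-n vector of ones on the active backend
    jy = JACC.ones(n)

    # axpy updates its third argument in place: jx[i] += alpha * jy[i]
    JACC.BLAS.axpy(n, alpha, jx, jy)

    # dot is built on parallel_reduce and yields a one-element array rather
    # than a scalar, so the value is read with an index after copying back
    jresult = JACC.BLAS.dot(n, jx, jy)
    result = Array(jresult)[1]    # each jx[i] is now 3.0, so result == 3000.0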