diff --git a/Project.toml b/Project.toml
index 5e9b0ac..add4aaa 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,6 +4,7 @@
 authors = ["pedrovalerolara <...>", "williamfgc <...>"]
 ...
diff --git a/ext/JACCAMDGPU/JACCAMDGPU.jl b/ext/JACCAMDGPU/JACCAMDGPU.jl
--- a/ext/JACCAMDGPU/JACCAMDGPU.jl
+++ b/ext/JACCAMDGPU/JACCAMDGPU.jl
@@ ... @@ function reduce_kernel_amdgpu(N, red, ret)
 ...
     if N > 512
@@ -133,7 +133,7 @@ function reduce_kernel_amdgpu(N, red, ret)
             tmp += @inbounds red[ii]
             ii += 512
         end
-    else
+    elseif (i <= N)
         tmp = @inbounds red[i]
     end
     shared_mem[i] = tmp
@@ -223,8 +223,8 @@ end
 
 function reduce_kernel_amdgpu_MN((M, N), red, ret)
     shared_mem = @ROCStaticLocalArray(Float64, 256)
-    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
-    j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y
+    i = workitemIdx().x
+    j = workitemIdx().y
     ii = i
     jj = j
diff --git a/ext/JACCCUDA/JACCCUDA.jl b/ext/JACCCUDA/JACCCUDA.jl
index 95c12dc..7751992 100644
--- a/ext/JACCCUDA/JACCCUDA.jl
+++ b/ext/JACCCUDA/JACCCUDA.jl
@@ -128,7 +128,7 @@ end
 
 function reduce_kernel_cuda(N, red, ret)
     shared_mem = @cuDynamicSharedMem(Float64, 512)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    i = threadIdx().x
     ii = i
     tmp::Float64 = 0.0
     if N > 512
@@ -136,10 +136,10 @@ function reduce_kernel_cuda(N, red, ret)
             tmp += @inbounds red[ii]
             ii += 512
         end
-    else
-        tmp = @inbounds red[i]
+    elseif (i <= N)
+        tmp = @inbounds red[i]
     end
-    shared_mem[i] = tmp
+    shared_mem[threadIdx().x] = tmp
     sync_threads()
     if (i <= 256)
         shared_mem[i] += shared_mem[i + 256]
diff --git a/ext/JACCONEAPI/JACCONEAPI.jl b/ext/JACCONEAPI/JACCONEAPI.jl
index 3867023..98de3c2 100644
--- a/ext/JACCONEAPI/JACCONEAPI.jl
+++ b/ext/JACCONEAPI/JACCONEAPI.jl
@@ -125,7 +125,7 @@ function reduce_kernel_oneapi(N, red, ret)
             tmp += @inbounds red[ii]
             ii += 256
         end
-    else
+    elseif (i <= N)
         tmp = @inbounds red[i]
     end
     shared_mem[i] = tmp
diff --git a/src/JACC.jl b/src/JACC.jl
index 89b23bd..e88c06e 100644
--- a/src/JACC.jl
+++ b/src/JACC.jl
@@ -8,6 +8,9 @@ include("helper.jl")
 # overloaded array functions
 include("array.jl")
 
+include("JACCBLAS.jl")
+using .BLAS
+
 export Array, @atomic
 
 export parallel_for
diff --git a/src/JACCBLAS.jl b/src/JACCBLAS.jl
new file mode 100644
index 0000000..387b2ad
--- /dev/null
+++ b/src/JACCBLAS.jl
@@ -0,0 +1,21 @@
+module BLAS
+
+using JACC
+
+function _axpy(i, alpha, x, y)
+    @inbounds x[i] += alpha * y[i]
+end
+
+function _dot(i, x, y)
+    return @inbounds x[i] * y[i]
+end
+
+function axpy(n::I, alpha, x, y) where {I<:Integer}
+    JACC.parallel_for(n, _axpy, alpha, x, y)
+end
+
+function dot(n::I, x, y) where {I<:Integer}
+    JACC.parallel_reduce(n, _dot, x, y)
+end
+
+end # module BLAS
diff --git a/test/tests_amdgpu.jl b/test/tests_amdgpu.jl
index 7987228..d1a495b 100644
--- a/test/tests_amdgpu.jl
+++ b/test/tests_amdgpu.jl
@@ -96,3 +96,36 @@ end
     JACC.parallel_for(N, minus_one, x)
     @test zeros(N)≈Array(x) rtol=1e-5
 end
+
+@testset "JACC.BLAS" begin
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    x = ones(1_000)
+    y = ones(1_000)
+    jx = JACC.ones(1_000)
+    jy = JACC.ones(1_000)
+    alpha = 2.0
+
+    seq_axpy(1_000, alpha, x, y)
+    ref_result = seq_dot(1_000, x, y)
+
+    JACC.BLAS.axpy(1_000, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(1_000, jx, jy)
+    result = Array(jresult)
+
+    @test result[1]≈ref_result rtol=1e-8
+
+end
diff --git a/test/tests_cuda.jl b/test/tests_cuda.jl
index b07536c..a1686be 100644
--- a/test/tests_cuda.jl
+++ b/test/tests_cuda.jl
@@ -133,3 +133,36 @@ end
 # C[i] = A[i] + B[i]
 # end
 # end
+
+@testset "JACC.BLAS" begin
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    x = ones(1_000)
+    y = ones(1_000)
+    jx = JACC.ones(1_000)
+    jy = JACC.ones(1_000)
+    alpha = 2.0
+
+    seq_axpy(1_000, alpha, x, y)
+    ref_result = seq_dot(1_000, x, y)
+
+    JACC.BLAS.axpy(1_000, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(1_000, jx, jy)
+    result = Array(jresult)
+
+    @test result[1]≈ref_result rtol=1e-8
+
+end
diff --git a/test/tests_oneapi.jl b/test/tests_oneapi.jl
index 1ba6f10..bc4af4f 100644
--- a/test/tests_oneapi.jl
+++ b/test/tests_oneapi.jl
@@ -48,3 +48,38 @@ end
 
     @test Array(x_device)≈x_expected rtol=1e-1
 end
+
+@testset "JACC.BLAS" begin
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    SIZE = Int32(1_000)
+    x = ones(Float32, SIZE)
+    y = ones(Float32, SIZE)
+    jx = JACC.ones(Float32, SIZE)
+    jy = JACC.ones(Float32, SIZE)
+    alpha = Float32(2.0)
+
+    seq_axpy(SIZE, alpha, x, y)
+    ref_result = seq_dot(SIZE, x, y)
+
+    JACC.BLAS.axpy(SIZE, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(SIZE, jx, jy)
+    result = Array(jresult)
+
+    @test result[1]≈ref_result rtol=1e-8
+
+end
+
diff --git a/test/tests_threads.jl b/test/tests_threads.jl
index bc5b944..80953a0 100644
--- a/test/tests_threads.jl
+++ b/test/tests_threads.jl
@@ -277,3 +277,36 @@ end
 
     @test f2≈df2 rtol=1e-1
 end
+
+@testset "JACC.BLAS" begin
+
+    x = ones(1_000)
+    y = ones(1_000)
+    jx = JACC.ones(1_000)
+    jy = JACC.ones(1_000)
+    alpha = 2.0
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    seq_axpy(1_000, alpha, x, y)
+    ref_result = seq_dot(1_000, x, y)
+
+    JACC.BLAS.axpy(1_000, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(1_000, jx, jy)
+    result = jresult[1]
+
+    @test result≈ref_result rtol=1e-8
+
+end
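
Reviewer note: a minimal usage sketch of the new JACC.BLAS entry points added in src/JACCBLAS.jl, not part of the diff itself. It assumes the default Threads backend (the CUDA/AMDGPU/oneAPI extensions dispatch the same calls) and mirrors the harness in test/tests_threads.jl; only names that appear in this diff are used.

    using JACC

    n = 1_000
    jx = JACC.ones(n)   # backend-resident vector of ones
    jy = JACC.ones(n)
    alpha = 2.0

    # In-place x[i] += alpha * y[i], executed through JACC.parallel_for.
    JACC.BLAS.axpy(n, alpha, jx, jy)

    # Dot product through JACC.parallel_reduce; the result comes back as a
    # one-element array, hence the [1] indexing used in the tests.
    res = JACC.BLAS.dot(n, jx, jy)
    @assert Array(res)[1] ≈ 3.0 * n   # each x[i] is 1 + 2*1 = 3, each y[i] is 1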