Added JACC BLAS module. Only dot and axpy for the moment. Added test… #89

Closed
wants to merge 3 commits
1 change: 1 addition & 0 deletions Project.toml
@@ -4,6 +4,7 @@ authors = ["pedrovalerolara <[email protected]>", "williamfgc <williamfgc@yah
version = "0.0.4"

[deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
Collaborator: Remove this, as AMDGPU is a weak dependency.

Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"

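For context on that suggestion: in Julia's package format, a weak dependency is declared under [weakdeps] and tied to an extension module under [extensions], so AMDGPU is loaded only when the user has it installed. A minimal sketch of what that could look like here (the [extensions] entry is an assumption based on the ext/JACCAMDGPU layout, not part of this diff):

[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"

[extensions]
JACCAMDGPU = "AMDGPU"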
12 changes: 6 additions & 6 deletions ext/JACCAMDGPU/JACCAMDGPU.jl
@@ -34,7 +34,7 @@ function JACC.parallel_reduce(
@roc groupsize=threads gridsize=blocks _parallel_reduce_amdgpu(
N, ret, f, x...)
AMDGPU.synchronize()
-@roc groupsize=threads gridsize=threads reduce_kernel_amdgpu(
+@roc groupsize=threads gridsize=1 reduce_kernel_amdgpu(
blocks, ret, rret)
AMDGPU.synchronize()
return rret
@@ -52,7 +52,7 @@ function JACC.parallel_reduce(
@roc groupsize=(Mthreads, Nthreads) gridsize=(Mblocks, Nblocks) _parallel_reduce_amdgpu_MN(
(M, N), ret, f, x...)
AMDGPU.synchronize()
-@roc groupsize=(Mthreads, Nthreads) gridsize=(Mthreads, Nthreads) reduce_kernel_amdgpu_MN(
+@roc groupsize=(Mthreads, Nthreads) gridsize=(1, 1) reduce_kernel_amdgpu_MN(
(Mblocks, Nblocks), ret, rret)
AMDGPU.synchronize()
return rret
@@ -125,15 +125,15 @@ end

function reduce_kernel_amdgpu(N, red, ret)
shared_mem = @ROCStaticLocalArray(Float64, 512)
-i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+i = workitemIdx().x
ii = i
tmp::Float64 = 0.0
if N > 512
while ii <= N
tmp += @inbounds red[ii]
ii += 512
end
-else
+elseif (i <= N)
tmp = @inbounds red[i]
end
shared_mem[i] = tmp
@@ -223,8 +223,8 @@ end

function reduce_kernel_amdgpu_MN((M, N), red, ret)
shared_mem = @ROCStaticLocalArray(Float64, 256)
-i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
-j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y
+i = workitemIdx().x
+j = workitemIdx().y
ii = i
jj = j

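The changes in this file share one idea: the second-stage reduction kernel is now launched on a single workgroup (gridsize=1), so a work-item's global index is simply workitemIdx().x, and the new elseif guard keeps idle work-items from reading past the end of the partial-results array. A minimal CPU sketch of the corrected logic, with illustrative names that are not part of the PR:

function reduce_stage2_sketch(N, red)
    # `shared` stands in for the 512-slot shared-memory buffer.
    shared = zeros(Float64, 512)
    for i in 1:512                 # each iteration plays one work-item
        tmp = 0.0
        if N > 512
            ii = i
            while ii <= N          # stride-512 accumulation of partials
                tmp += red[ii]
                ii += 512
            end
        elseif i <= N              # the new guard: idle work-items
            tmp = red[i]           # must not read past red[N]
        end
        shared[i] = tmp
    end
    return sum(shared)             # stands in for the in-kernel tree reduction
end

With the old else branch, a work-item with i > N would read red[i] under @inbounds, i.e. out of bounds; the same guard is applied to the CUDA and oneAPI kernels below.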
8 changes: 4 additions & 4 deletions ext/JACCCUDA/JACCCUDA.jl
@@ -128,18 +128,18 @@ end

function reduce_kernel_cuda(N, red, ret)
shared_mem = @cuDynamicSharedMem(Float64, 512)
-i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+i = threadIdx().x
ii = i
tmp::Float64 = 0.0
if N > 512
while ii <= N
tmp += @inbounds red[ii]
ii += 512
end
-else
-tmp = @inbounds red[i]
+elseif (i <= N)
+tmp = @inbounds red[i]
end
-shared_mem[i] = tmp
+shared_mem[threadIdx().x] = tmp
sync_threads()
if (i <= 256)
shared_mem[i] += shared_mem[i + 256]
2 changes: 1 addition & 1 deletion ext/JACCONEAPI/JACCONEAPI.jl
@@ -125,7 +125,7 @@ function reduce_kernel_oneapi(N, red, ret)
tmp += @inbounds red[ii]
ii += 256
end
-else
+elseif (i <= N)
tmp = @inbounds red[i]
end
shared_mem[i] = tmp
3 changes: 3 additions & 0 deletions src/JACC.jl
@@ -8,6 +8,9 @@ include("helper.jl")
# overloaded array functions
include("array.jl")

include("JACCBLAS.jl")
using .BLAS

export Array, @atomic
export parallel_for

21 changes: 21 additions & 0 deletions src/JACCBLAS.jl
@@ -0,0 +1,21 @@
module BLAS

using JACC

function _axpy(i, alpha, x, y)
@inbounds x[i] += alpha * y[i]
end

function _dot(i, x, y)
return @inbounds x[i] * y[i]
end

function axpy(n::I, alpha, x, y) where {I<:Integer}
JACC.parallel_for(n, _axpy, alpha, x, y)
end

function dot(n::I, x, y) where {I<:Integer}
JACC.parallel_reduce(n, _dot, x, y)
end

end # module BLAS
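Each routine is an elementwise kernel plus a thin wrapper over JACC.parallel_for or JACC.parallel_reduce; note that this axpy updates x in place (x[i] += alpha * y[i]), swapping the roles of x and y relative to the usual BLAS convention. As a hypothetical sketch of how further routines could follow the same pattern (not part of this PR), a scal would be:

# Hypothetical follow-on routine, not part of this PR.
function _scal(i, alpha, x)
    @inbounds x[i] = alpha * x[i]   # scale x in place by alpha
end

function scal(n::I, alpha, x) where {I<:Integer}
    JACC.parallel_for(n, _scal, alpha, x)
end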
33 changes: 33 additions & 0 deletions test/tests_amdgpu.jl
@@ -96,3 +96,36 @@ end
JACC.parallel_for(N, minus_one, x)
@test zeros(N)≈Array(x) rtol=1e-5
end

@testset "JACC.BLAS" begin

function seq_axpy(N, alpha, x, y)
for i in 1:N
@inbounds x[i] += alpha * y[i]
end
end

function seq_dot(N, x, y)
r = 0.0
for i in 1:N
@inbounds r += x[i] * y[i]
end
return r
end

x = ones(1_000)
y = ones(1_000)
jx = JACC.ones(1_000)
jy = JACC.ones(1_000)
alpha = 2.0

seq_axpy(1_000, alpha, x, y)
ref_result = seq_dot(1_000, x, y)

JACC.BLAS.axpy(1_000, alpha, jx, jy)
jresult = JACC.BLAS.dot(1_000, jx, jy)
result = Array(jresult)

@test result[1]≈ref_result rtol=1e-8

end
33 changes: 33 additions & 0 deletions test/tests_cuda.jl
@@ -133,3 +133,36 @@ end
# C[i] = A[i] + B[i]
# end
# end

@testset "JACC.BLAS" begin

function seq_axpy(N, alpha, x, y)
for i in 1:N
@inbounds x[i] += alpha * y[i]
end
end

function seq_dot(N, x, y)
r = 0.0
for i in 1:N
@inbounds r += x[i] * y[i]
end
return r
end

x = ones(1_000)
y = ones(1_000)
jx = JACC.ones(1_000)
jy = JACC.ones(1_000)
alpha = 2.0

seq_axpy(1_000, alpha, x, y)
ref_result = seq_dot(1_000, x, y)

JACC.BLAS.axpy(1_000, alpha, jx, jy)
jresult = JACC.BLAS.dot(1_000, jx, jy)
result = Array(jresult)

@test result[1]≈ref_result rtol=1e-8

end
35 changes: 35 additions & 0 deletions test/tests_oneapi.jl
@@ -48,3 +48,38 @@ end

@test Array(x_device)≈x_expected rtol=1e-1
end

@testset "JACC.BLAS" begin

function seq_axpy(N, alpha, x, y)
for i in 1:N
@inbounds x[i] += alpha * y[i]
end
end

function seq_dot(N, x, y)
r = 0.0
for i in 1:N
@inbounds r += x[i] * y[i]
end
return r
end

SIZE = Int32(1_000)
x = ones(Float32, SIZE)
y = ones(Float32, SIZE)
jx = JACC.ones(Float32, SIZE)
jy = JACC.ones(Float32, SIZE)
alpha = Float32(2.0)

seq_axpy(SIZE, alpha, x, y)
ref_result = seq_dot(SIZE, x, y)

JACC.BLAS.axpy(SIZE, alpha, jx, jy)
jresult = JACC.BLAS.dot(SIZE, jx, jy)
result = Array(jresult)

@test result[1]≈ref_result rtol=1e-8

end

33 changes: 33 additions & 0 deletions test/tests_threads.jl
@@ -277,3 +277,36 @@ end

@test f2≈df2 rtol=1e-1
end

@testset "JACC.BLAS" begin

x = ones(1_000)
y = ones(1_000)
jx = JACC.ones(1_000)
jy = JACC.ones(1_000)
alpha = 2.0

function seq_axpy(N, alpha, x, y)
for i in 1:N
@inbounds x[i] += alpha * y[i]
end
end

function seq_dot(N, x, y)
r = 0.0
for i in 1:N
@inbounds r += x[i] * y[i]
end
return r
end

seq_axpy(1_000, alpha, x, y)
ref_result = seq_dot(1_000, x, y)

JACC.BLAS.axpy(1_000, alpha, jx, jy)
jresult = JACC.BLAS.dot(1_000, jx, jy)
result = jresult[1]

@test result≈ref_result rtol=1e-8

end