From 6a31bce481d1b35242052bacaa7eb15a569e670b Mon Sep 17 00:00:00 2001
From: pedrovalerolara
Date: Thu, 9 May 2024 13:20:57 -0400
Subject: [PATCH 1/2] Added JACC BLAS module. Only dot and axpy for the moment. Added tests for JACC.BLAS to all backend test suites. Fixed a small bug in the accelerator backends' one-dimensional parallel_reduce implementations.

---
 Project.toml                 |  1 +
 ext/JACCAMDGPU/JACCAMDGPU.jl |  2 +-
 ext/JACCCUDA/JACCCUDA.jl     |  8 ++++----
 ext/JACCONEAPI/JACCONEAPI.jl |  2 +-
 src/JACC.jl                  |  3 +++
 src/JACCBLAS.jl              | 21 +++++++++++++++++++++
 test/tests_amdgpu.jl         | 33 +++++++++++++++++++++++++++++++++
 test/tests_cuda.jl           | 33 +++++++++++++++++++++++++++++++++
 test/tests_oneapi.jl         | 35 +++++++++++++++++++++++++++++++++++
 test/tests_threads.jl        | 33 +++++++++++++++++++++++++++++++++
 10 files changed, 165 insertions(+), 6 deletions(-)
 create mode 100644 src/JACCBLAS.jl

diff --git a/Project.toml b/Project.toml
index 5e9b0ac..add4aaa 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,6 +4,7 @@ authors = ["pedrovalerolara ", "williamfgc
diff --git a/ext/JACCCUDA/JACCCUDA.jl b/ext/JACCCUDA/JACCCUDA.jl
index 95c12dc..7751992 100644
--- a/ext/JACCCUDA/JACCCUDA.jl
+++ b/ext/JACCCUDA/JACCCUDA.jl
@@ -128,7 +128,7 @@ end
 
 function reduce_kernel_cuda(N, red, ret)
     shared_mem = @cuDynamicSharedMem(Float64, 512)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    i = threadIdx().x
     ii = i
     tmp::Float64 = 0.0
     if N > 512
@@ -136,10 +136,10 @@ function reduce_kernel_cuda(N, red, ret)
             tmp += @inbounds red[ii]
             ii += 512
         end
-    else
-        tmp = @inbounds red[i]
+    elseif (i <= N)
+        tmp = @inbounds red[i]
     end
-    shared_mem[i] = tmp
+    shared_mem[threadIdx().x] = tmp
     sync_threads()
     if (i <= 256)
         shared_mem[i] += shared_mem[i + 256]
diff --git a/ext/JACCONEAPI/JACCONEAPI.jl b/ext/JACCONEAPI/JACCONEAPI.jl
index 3867023..98de3c2 100644
--- a/ext/JACCONEAPI/JACCONEAPI.jl
+++ b/ext/JACCONEAPI/JACCONEAPI.jl
@@ -125,7 +125,7 @@ function reduce_kernel_oneapi(N, red, ret)
             tmp += @inbounds red[ii]
             ii += 256
         end
-    else
+    elseif (i <= N)
         tmp = @inbounds red[i]
     end
     shared_mem[i] = tmp
diff --git a/src/JACC.jl b/src/JACC.jl
index 89b23bd..e88c06e 100644
--- a/src/JACC.jl
+++ b/src/JACC.jl
@@ -8,6 +8,9 @@ include("helper.jl")
 # overloaded array functions
 include("array.jl")
 
+include("JACCBLAS.jl")
+using .BLAS
+
 export Array, @atomic
 
 export parallel_for
diff --git a/src/JACCBLAS.jl b/src/JACCBLAS.jl
new file mode 100644
index 0000000..387b2ad
--- /dev/null
+++ b/src/JACCBLAS.jl
@@ -0,0 +1,21 @@
+module BLAS
+
+using JACC
+
+function _axpy(i, alpha, x, y)
+    @inbounds x[i] += alpha * y[i]
+end
+
+function _dot(i, x, y)
+    return @inbounds x[i] * y[i]
+end
+
+function axpy(n::I, alpha, x, y) where {I<:Integer}
+    JACC.parallel_for(n, _axpy, alpha, x, y)
+end
+
+function dot(n::I, x, y) where {I<:Integer}
+    JACC.parallel_reduce(n, _dot, x, y)
+end
+
+end # module BLAS
diff --git a/test/tests_amdgpu.jl b/test/tests_amdgpu.jl
index 7987228..d1a495b 100644
--- a/test/tests_amdgpu.jl
+++ b/test/tests_amdgpu.jl
@@ -96,3 +96,36 @@ end
     JACC.parallel_for(N, minus_one, x)
     @test zeros(N)≈Array(x) rtol=1e-5
 end
+
+@testset "JACC.BLAS" begin
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    x = ones(1_000)
+    y = ones(1_000)
+    jx = JACC.ones(1_000)
+    jy = JACC.ones(1_000)
+    alpha = 2.0
+
+    seq_axpy(1_000, alpha, x, y)
+    ref_result = seq_dot(1_000, x, y)
+
+    JACC.BLAS.axpy(1_000, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(1_000, jx, jy)
+    result = Array(jresult)
+
+    @test result[1]≈ref_result rtol=1e-8
+
+end
diff --git a/test/tests_cuda.jl b/test/tests_cuda.jl
index b07536c..a1686be 100644
--- a/test/tests_cuda.jl
+++ b/test/tests_cuda.jl
@@ -133,3 +133,36 @@ end
 #     C[i] = A[i] + B[i]
 #   end
 # end
+
+@testset "JACC.BLAS" begin
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    x = ones(1_000)
+    y = ones(1_000)
+    jx = JACC.ones(1_000)
+    jy = JACC.ones(1_000)
+    alpha = 2.0
+
+    seq_axpy(1_000, alpha, x, y)
+    ref_result = seq_dot(1_000, x, y)
+
+    JACC.BLAS.axpy(1_000, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(1_000, jx, jy)
+    result = Array(jresult)
+
+    @test result[1]≈ref_result rtol=1e-8
+
+end
diff --git a/test/tests_oneapi.jl b/test/tests_oneapi.jl
index 1ba6f10..bc4af4f 100644
--- a/test/tests_oneapi.jl
+++ b/test/tests_oneapi.jl
@@ -48,3 +48,38 @@ end
 
     @test Array(x_device)≈x_expected rtol=1e-1
 end
+
+@testset "JACC.BLAS" begin
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    SIZE = Int32(1_000)
+    x = ones(Float32, SIZE)
+    y = ones(Float32, SIZE)
+    jx = JACC.ones(Float32, SIZE)
+    jy = JACC.ones(Float32, SIZE)
+    alpha = Float32(2.0)
+
+    seq_axpy(SIZE, alpha, x, y)
+    ref_result = seq_dot(SIZE, x, y)
+
+    JACC.BLAS.axpy(SIZE, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(SIZE, jx, jy)
+    result = Array(jresult)
+
+    @test result[1]≈ref_result rtol=1e-8
+
+end
+
diff --git a/test/tests_threads.jl b/test/tests_threads.jl
index bc5b944..80953a0 100644
--- a/test/tests_threads.jl
+++ b/test/tests_threads.jl
@@ -277,3 +277,36 @@ end
 
     @test f2≈df2 rtol=1e-1
 end
+
+@testset "JACC.BLAS" begin
+
+    x = ones(1_000)
+    y = ones(1_000)
+    jx = JACC.ones(1_000)
+    jy = JACC.ones(1_000)
+    alpha = 2.0
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    seq_axpy(1_000, alpha, x, y)
+    ref_result = seq_dot(1_000, x, y)
+
+    JACC.BLAS.axpy(1_000, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(1_000, jx, jy)
+    result = jresult[1]
+
+    @test result≈ref_result rtol=1e-8
+
+end

From 02845729caee11fe4aec5682293292c6127b3f1a Mon Sep 17 00:00:00 2001
From: pedrovalerolara
Date: Thu, 9 May 2024 13:20:57 -0400
Subject: [PATCH 2/2] Fixed previous commit: removed the line added to Project.toml and fixed a bug in the AMDGPU backend's parallel_reduce implementation. Previous commit was: Added JACC BLAS module ...

---
 ext/JACCAMDGPU/JACCAMDGPU.jl | 12 ++++++------
 ext/JACCCUDA/JACCCUDA.jl     |  8 ++++----
 ext/JACCONEAPI/JACCONEAPI.jl |  2 +-
 src/JACC.jl                  |  3 +++
 src/JACCBLAS.jl              | 21 +++++++++++++++++++++
 test/tests_amdgpu.jl         | 33 +++++++++++++++++++++++++++++++++
 test/tests_cuda.jl           | 33 +++++++++++++++++++++++++++++++++
 test/tests_oneapi.jl         | 35 +++++++++++++++++++++++++++++++++++
 test/tests_threads.jl        | 33 +++++++++++++++++++++++++++++++++
 9 files changed, 169 insertions(+), 11 deletions(-)
 create mode 100644 src/JACCBLAS.jl

diff --git a/ext/JACCAMDGPU/JACCAMDGPU.jl b/ext/JACCAMDGPU/JACCAMDGPU.jl
index 0fe2f4a..abf7e5d 100644
--- a/ext/JACCAMDGPU/JACCAMDGPU.jl
+++ b/ext/JACCAMDGPU/JACCAMDGPU.jl
@@ -34,7 +34,7 @@ function JACC.parallel_reduce(
     @roc groupsize=threads gridsize=blocks _parallel_reduce_amdgpu(
         N, ret, f, x...)
     AMDGPU.synchronize()
-    @roc groupsize=threads gridsize=threads reduce_kernel_amdgpu(
+    @roc groupsize=threads gridsize=1 reduce_kernel_amdgpu(
         blocks, ret, rret)
     AMDGPU.synchronize()
     return rret
@@ -52,7 +52,7 @@ function JACC.parallel_reduce(
     @roc groupsize=(Mthreads, Nthreads) gridsize=(Mblocks, Nblocks) _parallel_reduce_amdgpu_MN(
         (M, N), ret, f, x...)
     AMDGPU.synchronize()
-    @roc groupsize=(Mthreads, Nthreads) gridsize=(Mthreads, Nthreads) reduce_kernel_amdgpu_MN(
+    @roc groupsize=(Mthreads, Nthreads) gridsize=(1, 1) reduce_kernel_amdgpu_MN(
         (Mblocks, Nblocks), ret, rret)
     AMDGPU.synchronize()
     return rret
@@ -125,7 +125,7 @@ end
 
 function reduce_kernel_amdgpu(N, red, ret)
     shared_mem = @ROCStaticLocalArray(Float64, 512)
-    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
+    i = workitemIdx().x
     ii = i
     tmp::Float64 = 0.0
     if N > 512
@@ -133,7 +133,7 @@
             tmp += @inbounds red[ii]
             ii += 512
         end
-    else
+    elseif (i <= N)
         tmp = @inbounds red[i]
     end
     shared_mem[i] = tmp
@@ -223,8 +223,8 @@ end
 
 function reduce_kernel_amdgpu_MN((M, N), red, ret)
     shared_mem = @ROCStaticLocalArray(Float64, 256)
-    i = (workgroupIdx().x - 1) * workgroupDim().x + workitemIdx().x
-    j = (workgroupIdx().y - 1) * workgroupDim().y + workitemIdx().y
+    i = workitemIdx().x
+    j = workitemIdx().y
     ii = i
     jj = j
 
diff --git a/ext/JACCCUDA/JACCCUDA.jl b/ext/JACCCUDA/JACCCUDA.jl
index 95c12dc..7751992 100644
--- a/ext/JACCCUDA/JACCCUDA.jl
+++ b/ext/JACCCUDA/JACCCUDA.jl
@@ -128,7 +128,7 @@ end
 
 function reduce_kernel_cuda(N, red, ret)
     shared_mem = @cuDynamicSharedMem(Float64, 512)
-    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    i = threadIdx().x
     ii = i
     tmp::Float64 = 0.0
     if N > 512
@@ -136,10 +136,10 @@ function reduce_kernel_cuda(N, red, ret)
             tmp += @inbounds red[ii]
             ii += 512
         end
-    else
-        tmp = @inbounds red[i]
+    elseif (i <= N)
+        tmp = @inbounds red[i]
     end
-    shared_mem[i] = tmp
+    shared_mem[threadIdx().x] = tmp
     sync_threads()
     if (i <= 256)
         shared_mem[i] += shared_mem[i + 256]
diff --git a/ext/JACCONEAPI/JACCONEAPI.jl b/ext/JACCONEAPI/JACCONEAPI.jl
index 3867023..98de3c2 100644
--- a/ext/JACCONEAPI/JACCONEAPI.jl
+++ b/ext/JACCONEAPI/JACCONEAPI.jl
@@ -125,7 +125,7 @@ function reduce_kernel_oneapi(N, red, ret)
             tmp += @inbounds red[ii]
             ii += 256
         end
-    else
+    elseif (i <= N)
         tmp = @inbounds red[i]
     end
     shared_mem[i] = tmp
diff --git a/src/JACC.jl b/src/JACC.jl
index 89b23bd..e88c06e 100644
--- a/src/JACC.jl
+++ b/src/JACC.jl
@@ -8,6 +8,9 @@ include("helper.jl")
 # overloaded array functions
 include("array.jl")
 
+include("JACCBLAS.jl")
+using .BLAS
+
 export Array, @atomic
 
 export parallel_for
diff --git a/src/JACCBLAS.jl b/src/JACCBLAS.jl
new file mode 100644
index 0000000..387b2ad
--- /dev/null
+++ b/src/JACCBLAS.jl
@@ -0,0 +1,21 @@
+module BLAS
+
+using JACC
+
+function _axpy(i, alpha, x, y)
+    @inbounds x[i] += alpha * y[i]
+end
+
+function _dot(i, x, y)
+    return @inbounds x[i] * y[i]
+end
+
+function axpy(n::I, alpha, x, y) where {I<:Integer}
+    JACC.parallel_for(n, _axpy, alpha, x, y)
+end
+
+function dot(n::I, x, y) where {I<:Integer}
+    JACC.parallel_reduce(n, _dot, x, y)
+end
+
+end # module BLAS
diff --git a/test/tests_amdgpu.jl b/test/tests_amdgpu.jl
index 7987228..d1a495b 100644
--- a/test/tests_amdgpu.jl
+++ b/test/tests_amdgpu.jl
@@ -96,3 +96,36 @@ end
     JACC.parallel_for(N, minus_one, x)
     @test zeros(N)≈Array(x) rtol=1e-5
 end
+
+@testset "JACC.BLAS" begin
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    x = ones(1_000)
+    y = ones(1_000)
+    jx = JACC.ones(1_000)
+    jy = JACC.ones(1_000)
+    alpha = 2.0
+
+    seq_axpy(1_000, alpha, x, y)
+    ref_result = seq_dot(1_000, x, y)
+
+    JACC.BLAS.axpy(1_000, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(1_000, jx, jy)
+    result = Array(jresult)
+
+    @test result[1]≈ref_result rtol=1e-8
+
+end
diff --git a/test/tests_cuda.jl b/test/tests_cuda.jl
index b07536c..a1686be 100644
--- a/test/tests_cuda.jl
+++ b/test/tests_cuda.jl
@@ -133,3 +133,36 @@ end
 #     C[i] = A[i] + B[i]
 #   end
 # end
+
+@testset "JACC.BLAS" begin
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    x = ones(1_000)
+    y = ones(1_000)
+    jx = JACC.ones(1_000)
+    jy = JACC.ones(1_000)
+    alpha = 2.0
+
+    seq_axpy(1_000, alpha, x, y)
+    ref_result = seq_dot(1_000, x, y)
+
+    JACC.BLAS.axpy(1_000, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(1_000, jx, jy)
+    result = Array(jresult)
+
+    @test result[1]≈ref_result rtol=1e-8
+
+end
diff --git a/test/tests_oneapi.jl b/test/tests_oneapi.jl
index 1ba6f10..bc4af4f 100644
--- a/test/tests_oneapi.jl
+++ b/test/tests_oneapi.jl
@@ -48,3 +48,38 @@ end
 
     @test Array(x_device)≈x_expected rtol=1e-1
 end
+
+@testset "JACC.BLAS" begin
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    SIZE = Int32(1_000)
+    x = ones(Float32, SIZE)
+    y = ones(Float32, SIZE)
+    jx = JACC.ones(Float32, SIZE)
+    jy = JACC.ones(Float32, SIZE)
+    alpha = Float32(2.0)
+
+    seq_axpy(SIZE, alpha, x, y)
+    ref_result = seq_dot(SIZE, x, y)
+
+    JACC.BLAS.axpy(SIZE, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(SIZE, jx, jy)
+    result = Array(jresult)
+
+    @test result[1]≈ref_result rtol=1e-8
+
+end
+
diff --git a/test/tests_threads.jl b/test/tests_threads.jl
index bc5b944..80953a0 100644
--- a/test/tests_threads.jl
+++ b/test/tests_threads.jl
@@ -277,3 +277,36 @@ end
 
     @test f2≈df2 rtol=1e-1
 end
+
+@testset "JACC.BLAS" begin
+
+    x = ones(1_000)
+    y = ones(1_000)
+    jx = JACC.ones(1_000)
+    jy = JACC.ones(1_000)
+    alpha = 2.0
+
+    function seq_axpy(N, alpha, x, y)
+        for i in 1:N
+            @inbounds x[i] += alpha * y[i]
+        end
+    end
+
+    function seq_dot(N, x, y)
+        r = 0.0
+        for i in 1:N
+            @inbounds r += x[i] * y[i]
+        end
+        return r
+    end
+
+    seq_axpy(1_000, alpha, x, y)
+    ref_result = seq_dot(1_000, x, y)
+
+    JACC.BLAS.axpy(1_000, alpha, jx, jy)
+    jresult = JACC.BLAS.dot(1_000, jx, jy)
+    result = jresult[1]
+
+    @test result≈ref_result rtol=1e-8
+
+end
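
Usage sketch for the new JACC.BLAS module, mirroring the tests above. This is a minimal example, assuming the default Threads backend, where JACC arrays are plain host arrays; with the CUDA, AMDGPU, or oneAPI extension loaded, jx, jy, and the dot result live on the device instead.

    using JACC

    n = 1_000
    alpha = 2.0
    jx = JACC.ones(n)    # length-n vector of ones on the active backend
    jy = JACC.ones(n)

    # axpy updates its third argument in place: jx[i] += alpha * jy[i]
    JACC.BLAS.axpy(n, alpha, jx, jy)

    # dot is built on parallel_reduce and yields a one-element array rather
    # than a scalar, so the value is read with an index after copying back
    jresult = JACC.BLAS.dot(n, jx, jy)
    result = Array(jresult)[1]    # each jx[i] is now 3.0, so result == 3000.0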