Skip to content

Commit

Permalink
Merge pull request #69 from williamfgc/amdgpu-fixes
Browse files Browse the repository at this point in the history
Fix AMDGPU
  • Loading branch information
williamfgc authored Apr 15, 2024
2 parents 33f9f7e + 4612271 commit a2a0804
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 46 deletions.
20 changes: 10 additions & 10 deletions ext/JACCAMDGPU/JACCAMDGPU.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ function JACC.parallel_for(N::I, f::F, x::Vararg{Union{<:Number,<:ROCArray}}) wh
numThreads = 512
threads = min(N, numThreads)
blocks = ceil(Int, N / threads)
@roc groupsize = threads gridsize = threads*blocks _parallel_for_amdgpu(f, x...)
# AMDGPU.synchronize()
@roc groupsize = threads gridsize = blocks _parallel_for_amdgpu(f, x...)
AMDGPU.synchronize()
end

function JACC.parallel_for((M, N)::Tuple{I,I}, f::F, x::Vararg{Union{<:Number,<:ROCArray}}) where {I<:Integer,F<:Function}
Expand All @@ -16,8 +16,8 @@ function JACC.parallel_for((M, N)::Tuple{I,I}, f::F, x::Vararg{Union{<:Number,<:
Nthreads = min(N, numThreads)
Mblocks = ceil(Int, M / Mthreads)
Nblocks = ceil(Int, N / Nthreads)
@roc groupsize = (Mthreads, Nthreads) gridsize = (Mblocks*Mthreads, Nblocks*Nthreads) _parallel_for_amdgpu_MN(f, x...)
# AMDGPU.synchronize()
@roc groupsize = (Mthreads, Nthreads) gridsize = (Mblocks, Nblocks) _parallel_for_amdgpu_MN(f, x...)
AMDGPU.synchronize()
end

function JACC.parallel_reduce(N::I, f::F, x::Vararg{Union{<:Number,<:ROCArray}}) where {I<:Integer,F<:Function}
Expand All @@ -26,10 +26,10 @@ function JACC.parallel_reduce(N::I, f::F, x::Vararg{Union{<:Number,<:ROCArray}})
blocks = ceil(Int, N / threads)
ret = AMDGPU.zeros(Float64, blocks)
rret = AMDGPU.zeros(Float64, 1)
@roc groupsize = threads gridsize = threads*blocks _parallel_reduce_amdgpu(N, ret, f, x...)
#AMDGPU.synchronize()
@roc groupsize = threads gridsize = blocks _parallel_reduce_amdgpu(N, ret, f, x...)
AMDGPU.synchronize()
@roc groupsize = threads gridsize = threads reduce_kernel_amdgpu(blocks, ret, rret)
#AMDGPU.synchronize()
AMDGPU.synchronize()
return rret

end
Expand All @@ -42,10 +42,10 @@ function JACC.parallel_reduce((M, N)::Tuple{I,I}, f::F, x::Vararg{Union{<:Number
Nblocks = ceil(Int, N / Nthreads)
ret = AMDGPU.zeros(Float64, (Mblocks, Nblocks))
rret = AMDGPU.zeros(Float64, 1)
@roc groupsize = (Mthreads, Nthreads) gridsize = (Mblocks*Mthreads, Nblocks*Nthreads) _parallel_reduce_amdgpu_MN((M, N), ret, f, x...)
#AMDGPU.synchronize()
@roc groupsize = (Mthreads, Nthreads) gridsize = (Mblocks, Nblocks) _parallel_reduce_amdgpu_MN((M, N), ret, f, x...)
AMDGPU.synchronize()
@roc groupsize = (Mthreads, Nthreads) gridsize = (Mthreads, Nthreads) reduce_kernel_amdgpu_MN((Mblocks, Nblocks), ret, rret)
#AMDGPU.synchronize()
AMDGPU.synchronize()
return rret
end

Expand Down
108 changes: 72 additions & 36 deletions test/tests_amdgpu.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,62 +25,98 @@ end

end

@testset "AXPY" begin

    # Device kernel: x[i] += alpha * y[i], applied per index by JACC.parallel_for.
    function axpy(i, alpha, x, y)
        @inbounds x[i] += alpha * y[i]
    end

    # Sequential host reference used to build the expected result.
    function seq_axpy(N, alpha, x, y)
        @inbounds for i in 1:N
            x[i] += alpha * y[i]
        end
    end

    N = 10
    # Generate random vectors x and y of length N for the interval [0, 100]
    x = round.(rand(Float32, N) * 100)
    y = round.(rand(Float32, N) * 100)
    alpha = 2.5

    x_device = JACC.Array(x)
    y_device = JACC.Array(y)
    JACC.parallel_for(N, axpy, alpha, x_device, y_device)

    # seq_axpy mutates its x argument in place, so x_expected deliberately
    # aliases x here.
    x_expected = x
    seq_axpy(N, alpha, x_expected, y)

    # The comparison operator was garbled in the source text; the approximate
    # equality `≈` (isapprox) with a relative tolerance is the intended check.
    @test Array(x_device) ≈ x_expected rtol = 1e-1
end

# Conjugate-gradient iteration on a small banded system, with the
# matrix-vector product, dot products and axpy updates all dispatched
# through JACC device kernels. The squared residual `cond` must fall
# below 1e-14 for the test to pass.
@testset "CG" begin

    # y = A*x for a banded matrix stored as three diagonals
    # (a3: sub-diagonal, a2: main diagonal, a1: super-diagonal).
    # NOTE(review): the interior branch applies `a1` to both x[i] and
    # x[i+1] (with a unary `+x[i]`); a conventional tridiagonal product
    # would read `a3[i]*x[i-1] + a2[i]*x[i] + a1[i]*x[i+1]` — confirm
    # the intent upstream before changing, as it alters the test matrix.
    function matvecmul(i, a1, a2, a3, x, y, SIZE)
        if i == 1
            y[i] = a2[i] * x[i] + a1[i] * x[i+1]
        elseif i == SIZE
            y[i] = a3[i] * x[i-1] + a2[i] * x[i]
        elseif i > 1 && i < SIZE
            y[i] = a3[i] * x[i-1] + a1[i] * +x[i] + a1[i] * +x[i+1]
        end
    end

    # Per-index product term for a parallel dot-product reduction.
    function dot(i, x, y)
        @inbounds return x[i] * y[i]
    end

    # x[i] += alpha * y[i]; alpha arrives as the 1x1 array returned by
    # parallel_reduce, hence the alpha[1, 1] indexing.
    function axpy(i, alpha, x, y)
        @inbounds x[i] += alpha[1, 1] * y[i]
    end

    SIZE = 10
    a0 = ones(SIZE)
    a1 = ones(SIZE)
    a2 = ones(SIZE)
    r = ones(SIZE)
    p = ones(SIZE)
    s = zeros(SIZE)
    x = zeros(SIZE)
    r_old = zeros(SIZE)
    r_aux = zeros(SIZE)
    a1 = a1 * 4       # main-diagonal band of the test matrix
    r = r * 0.5       # initial residual
    p = p * 0.5       # initial search direction
    # `cond` starts as a scalar Float64 (scalars are indexable as
    # cond[1, 1] in Julia); after the first iteration it holds the 1x1
    # reduction result returned by parallel_reduce.
    global cond = one(Float64)

    while cond[1, 1] >= 1e-14

        r_old = copy(r)

        # s = A * p
        JACC.parallel_for(SIZE, matvecmul, a0, a1, a2, p, s, SIZE)

        # alpha = (r . r) / (p . s)
        alpha0 = JACC.parallel_reduce(SIZE, dot, r, r)
        alpha1 = JACC.parallel_reduce(SIZE, dot, p, s)

        alpha = alpha0 / alpha1
        negative_alpha = alpha * (-1.0)

        # r -= alpha * s ;  x += alpha * p
        JACC.parallel_for(SIZE, axpy, negative_alpha, r, s)
        JACC.parallel_for(SIZE, axpy, alpha, x, p)

        # beta = (r_new . r_new) / (r_old . r_old)
        beta0 = JACC.parallel_reduce(SIZE, dot, r, r)
        beta1 = JACC.parallel_reduce(SIZE, dot, r_old, r_old)
        beta = beta0 / beta1

        # p = r + beta * p  (built in r_aux, then swapped into p)
        r_aux = copy(r)
        JACC.parallel_for(SIZE, axpy, beta, r_aux, p)
        ccond = JACC.parallel_reduce(SIZE, dot, r, r)
        global cond = ccond
        p = copy(r_aux)

        println(cond)

    end
    @test cond[1, 1] <= 1e-14
end

# @jacc
# for i in 1:N
# C[i] = A[i] + B[i]
# end
# end

0 comments on commit a2a0804

Please sign in to comment.