precommit fix
Abdelrahman912 committed Nov 6, 2024
1 parent 6300a4a commit b7301c2
Showing 22 changed files with 177 additions and 191 deletions.
51 changes: 22 additions & 29 deletions docs/src/literate-tutorials/gpu_qp_heat_equation.jl
@@ -3,27 +3,21 @@ using StaticArrays
using SparseArrays


left = Tensor{1, 2, Float32}((0, -0)) # define the left bottom corner of the grid.

right = Tensor{1, 2, Float32}((100.0, 100.0)) # define the right top corner of the grid.



left = Tensor{1,2,Float32}((0,-0)) # define the left bottom corner of the grid.

right = Tensor{1,2,Float32}((100.0,100.0)) # define the right top corner of the grid.


grid = generate_grid(Quadrilateral, (100, 100),left,right)
grid = generate_grid(Quadrilateral, (100, 100), left, right)


ip = Lagrange{RefQuadrilateral, 1}() # define the interpolation function (i.e. bilinear Lagrange)



qr = QuadratureRule{RefQuadrilateral}(Float32,2)
qr = QuadratureRule{RefQuadrilateral}(Float32, 2)



cellvalues = CellValues(Float32,qr, ip)
cellvalues = CellValues(Float32, qr, ip)


dh = DofHandler(grid)
@@ -43,7 +37,7 @@ function assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues)
dΩ = getdetJdV(cellvalues, q_point)
## Loop over test shape functions
for i in 1:n_basefuncs
δu = shape_value(cellvalues, q_point, i)
∇δu = shape_gradient(cellvalues, q_point, i)
## Add contribution to fe
fe[i] += δu * dΩ
@@ -59,7 +53,6 @@ function assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues)
end



function create_buffers(cellvalues, dh)
f = zeros(ndofs(dh))
K = allocate_matrix(dh)
@@ -68,21 +61,21 @@ function create_buffers(cellvalues, dh)
n_basefuncs = getnbasefunctions(cellvalues)
Ke = zeros(n_basefuncs, n_basefuncs)
fe = zeros(n_basefuncs)
return (;f, K, assembler, Ke, fe)
return (; f, K, assembler, Ke, fe)
end


# Standard global assembly

function assemble_global!(cellvalues, dh::DofHandler,qp_iter::Val{QPiter}) where {QPiter}
(;f, K, assembler, Ke, fe) = create_buffers(cellvalues,dh)
function assemble_global!(cellvalues, dh::DofHandler, qp_iter::Val{QPiter}) where {QPiter}
(; f, K, assembler, Ke, fe) = create_buffers(cellvalues, dh)
## Loop over all cells
for cell in CellIterator(dh)
fill!(Ke, 0)
fill!(fe, 0)
if QPiter
## reinit!(cellvalues, getcoordinates(cell))
assemble_element_qpiter!(Ke, fe, cellvalues,getcoordinates(cell))
assemble_element_qpiter!(Ke, fe, cellvalues, getcoordinates(cell))
else
## Reinitialize cellvalues for this cell
reinit!(cellvalues, cell)
@@ -96,18 +89,17 @@ function assemble_global!(cellvalues, dh::DofHandler,qp_iter::Val{QPiter}) where {QPiter}
end



## gpu version of element assembly


function assemble_element!(Ke,fe,cv,cell)
function assemble_element!(Ke, fe, cv, cell)
n_basefuncs = getnbasefunctions(cv)
for qv in Ferrite.QuadratureValuesIterator(cv,getcoordinates(cell))
for qv in Ferrite.QuadratureValuesIterator(cv, getcoordinates(cell))
## Get the quadrature weight
dΩ = getdetJdV(qv)
## Loop over test shape functions
for i in 1:n_basefuncs
δu = shape_value(qv, i)
∇δu = shape_gradient(qv, i)
## Add contribution to fe
fe[i] += δu * dΩ
@@ -116,18 +108,19 @@ function assemble_element!(Ke,fe,cv,cell)
for j in 1:n_basefuncs
∇u = shape_gradient(qv, j)
## Add contribution to Ke
Ke[i,j] += (∇δu ⋅ ∇u) * dΩ
Ke[i, j] += (∇δu ⋅ ∇u) * dΩ
end
end
end
return
end


# gpu version of global assembly
function assemble_gpu!(Kgpu,fgpu, cv, dh)
function assemble_gpu!(Kgpu, fgpu, cv, dh)
n_basefuncs = getnbasefunctions(cv)
assembler = start_assemble(Kgpu, fgpu;fillzero=false)
for cell in CellIterator(dh, convert(Int32,n_basefuncs))
assembler = start_assemble(Kgpu, fgpu; fillzero = false)
for cell in CellIterator(dh, convert(Int32, n_basefuncs))
Ke = cellke(cell)
fe = cellfe(cell)
assemble_element!(Ke, fe, cv, cell)
@@ -142,7 +135,7 @@ n_basefuncs = getnbasefunctions(cellvalues)
## Allocate CPU matrix
## K = allocate_matrix(SparseMatrixCSC{Float32, Int32},dh);

K = allocate_matrix(SparseMatrixCSC{Float64, Int64},dh);
K = allocate_matrix(SparseMatrixCSC{Float64, Int64}, dh);
f = zeros(ndofs(dh));

# Allocate GPU matrix
@@ -155,13 +148,13 @@ n_cells = dh |> get_grid |> getncells
# Kernel configuration
## commented to pass the test
##init_kernel(BackendCUDA,n_cells,n_basefuncs,assemble_gpu!, (Kgpu,fgpu, cellvalues, dh)) |> launch!
cpu_kernel = init_kernel(BackendCPU,n_cells,n_basefuncs,assemble_gpu!, (K,f, cellvalues, dh));
cpu_kernel = init_kernel(BackendCPU, n_cells, n_basefuncs, assemble_gpu!, (K, f, cellvalues, dh));
cpu_kernel()

stassy(cv,dh) = assemble_global!(cv,dh,Val(false))
stassy(cv, dh) = assemble_global!(cv, dh, Val(false))

norm(K)
## commented to pass the test
## norm(Kgpu)
Kstd , Fstd = stassy(cellvalues,dh);
Kstd, Fstd = stassy(cellvalues, dh);
norm(Kstd)
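
## A quick end-to-end check (a minimal sketch, not part of the commit; it
## assumes `norm` comes from LinearAlgebra, which the calls above already
## rely on): the kernel-based assembly and the standard assembly should
## agree up to floating-point roundoff. The loose tolerance allows for
## Float32 accumulation in the kernel path.
@assert isapprox(norm(K), norm(Kstd); rtol = 1.0e-4)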
16 changes: 7 additions & 9 deletions ext/GPU/CUDAKernelLauncher.jl
@@ -1,6 +1,4 @@


function Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti<: Integer}
function Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
if CUDA.functional()
return LazyKernel(n_cells, n_basefuncs, kernel, args, BackendCUDA)
else
@@ -17,22 +15,22 @@ Launch a CUDA kernel with the given configuration.
Arguments:
- `kernel_config`: The `CUDAKernelLauncher` object containing the higher-level fields for the kernel configuration.
"""
function Ferrite.launch!(kernel::LazyKernel{Ti,BackendCUDA}) where Ti
function Ferrite.launch!(kernel::LazyKernel{Ti, BackendCUDA}) where {Ti}
n_cells = kernel.n_cells
n_basefuncs = kernel.n_basefuncs
ker = kernel.kernel
args = kernel.args
kernel = @cuda launch=false ker(args...)
kernel = @cuda launch = false ker(args...)
config = launch_configuration(kernel.fun)
threads = convert(Ti,min(n_cells, config.threads,256))
shared_mem = _calculate_shared_memory(threads ,n_basefuncs)
threads = convert(Ti, min(n_cells, config.threads, 256))
shared_mem = _calculate_shared_memory(threads, n_basefuncs)
blocks = _calculate_nblocks(threads, n_cells)
kernel(args...; threads, blocks, shmem=shared_mem)
return kernel(args...; threads, blocks, shmem = shared_mem)
end
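
# As a worked example of the launch arithmetic above (illustrative numbers,
# not from the commit, and assuming `_calculate_nblocks` performs ceiling
# division over the cell count):
n_cells_ex = Int32(10_000)                             # hypothetical cell count
threads_ex = min(n_cells_ex, Int32(1024), Int32(256))  # -> 256: hard cap at 256 threads
blocks_ex = cld(n_cells_ex, threads_ex)                # -> 40 blocks cover all cells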


function _calculate_shared_memory(threads::Integer, n_basefuncs::Integer)
return sizeof(Float32) * (threads) * ( n_basefuncs) * n_basefuncs + sizeof(Float32) * (threads) * n_basefuncs
return sizeof(Float32) * (threads) * (n_basefuncs) * n_basefuncs + sizeof(Float32) * (threads) * n_basefuncs
end
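
# Plugging illustrative numbers into this sizing formula (an assumption for
# concreteness: 256 threads per block, 4 basis functions for bilinear
# quadrilaterals), each thread stores a 4×4 `Ke` block and a length-4 `fe` block:
threads_ex, n_basefuncs_ex = 256, 4
ke_bytes = sizeof(Float32) * threads_ex * n_basefuncs_ex * n_basefuncs_ex  # 16384
fe_bytes = sizeof(Float32) * threads_ex * n_basefuncs_ex                   # 4096
ke_bytes + fe_bytes  # 20480 bytes, well under the 48 KiB default per-block limit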


29 changes: 13 additions & 16 deletions ext/GPU/adapt.jl
@@ -1,54 +1,51 @@
# This file defines the adapt_structure function, which is used to adapt custom structures to be used on the GPU.
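
# A minimal usage sketch (hypothetical, not part of the diff): `Adapt.adapt`
# dispatches to the `adapt_structure` methods below when kernel arguments are
# converted for the device, so a `CellValues` built as in the tutorial becomes
# its static, GPU-friendly counterpart:
static_cv = Adapt.adapt(CuArray, cellvalues)  # -> Ferrite.StaticCellValues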

function Adapt.adapt_structure(to,cv::CellValues)
fv = Adapt.adapt(to,StaticInterpolationValues(cv.fun_values))
gm =Adapt.adapt(to,StaticInterpolationValues(cv.geo_mapping))
weights =Adapt.adapt(to, ntuple(i -> getweights(cv.qr)[i], getnquadpoints(cv)))
Ferrite.StaticCellValues(fv,gm, weights)
function Adapt.adapt_structure(to, cv::CellValues)
fv = Adapt.adapt(to, StaticInterpolationValues(cv.fun_values))
gm = Adapt.adapt(to, StaticInterpolationValues(cv.geo_mapping))
weights = Adapt.adapt(to, ntuple(i -> getweights(cv.qr)[i], getnquadpoints(cv)))
return Ferrite.StaticCellValues(fv, gm, weights)
end


function Adapt.adapt_structure(to, iter::Ferrite.QuadratureValuesIterator)
cv = Adapt.adapt_structure(to, iter.v)
cell_coords = Adapt.adapt_structure(to, iter.cell_coords)
Ferrite.QuadratureValuesIterator(cv,cell_coords)
return Ferrite.QuadratureValuesIterator(cv, cell_coords)
end

function Adapt.adapt_structure(to, qv::Ferrite.StaticQuadratureValues)
det = Adapt.adapt_structure(to, qv.detJdV)
N = Adapt.adapt_structure(to, qv.N)
dNdx = Adapt.adapt_structure(to, qv.dNdx)
M = Adapt.adapt_structure(to, qv.M)
Ferrite.StaticQuadratureValues(det,N,dNdx,M)
return Ferrite.StaticQuadratureValues(det, N, dNdx, M)
end
function Adapt.adapt_structure(to, qv::StaticQuadratureView)
mapping = Adapt.adapt_structure(to, qv.mapping)
cell_coords = Adapt.adapt_structure(to, qv.cell_coords |> cu)
q_point = Adapt.adapt_structure(to, qv.q_point)
cv = Adapt.adapt_structure(to, qv.cv)
StaticQuadratureView(mapping,cell_coords,q_point,cv)
return StaticQuadratureView(mapping, cell_coords, q_point, cv)
end

function Adapt.adapt_structure(to, grid::Grid)
# map Int64 to Int32 to reduce number of registers
cu_cells = grid.cells .|> (x -> Int32.(x.nodes)) .|> Quadrilateral |> cu
cells = Adapt.adapt_structure(to, cu_cells)
nodes = Adapt.adapt_structure(to, cu(grid.nodes))
GPUGrid(cells,nodes)
return GPUGrid(cells, nodes)
end


function Adapt.adapt_structure(to,iterator::CUDACellIterator)
function Adapt.adapt_structure(to, iterator::CUDACellIterator)
grid = Adapt.adapt_structure(to, iterator.grid)
dh = Adapt.adapt_structure(to, iterator.dh)
ncells = Adapt.adapt_structure(to, iterator.n_cells)
GPUCellIterator(dh,grid, ncells)
return GPUCellIterator(dh, grid, ncells)
end





function _get_ndofs_cell(dh::DofHandler)
ndofs_cell = [Int32(Ferrite.ndofs_per_cell(dh, i)) for i in 1:(dh |> Ferrite.get_grid |> Ferrite.getncells)]
return ndofs_cell
@@ -69,12 +66,12 @@ function Adapt.adapt_structure(to, dh::DofHandler)
offsets = Adapt.adapt_structure(to, dh.cell_dofs_offset .|> Int32 |> cu)
nodes = Adapt.adapt_structure(to, dh.grid.nodes |> cu)
ndofs_cell = Adapt.adapt_structure(to, _get_ndofs_cell(dh) |> cu)
GPUDofHandler(cell_dofs, GPUGrid(cells,nodes),offsets, ndofs_cell)
return GPUDofHandler(cell_dofs, GPUGrid(cells, nodes), offsets, ndofs_cell)
end


function Adapt.adapt_structure(to, assembler::GPUAssemblerSparsityPattern)
K = Adapt.adapt_structure(to, assembler.K)
f = Adapt.adapt_structure(to, assembler.f)
Ferrite.GPUAssemblerSparsityPattern(K, f)
return Ferrite.GPUAssemblerSparsityPattern(K, f)
end
14 changes: 7 additions & 7 deletions ext/GPU/cuda_iterator.jl
@@ -7,7 +7,7 @@ Create `CUDACellIterator` object for each thread with local id `thread_id` in order to iterate over cells
on the GPU; these elements are associated with the thread based on a stride = `blockDim().x * gridDim().x`.
The elements of the iterator are `GPUCellCache` objects.
"""
struct CUDACellIterator{DH<:Ferrite.AbstractGPUDofHandler,GRID<: Ferrite.AbstractGPUGrid,KDynamicSharedMem,FDynamicSharedMem} <: Ferrite.AbstractKernelCellIterator
struct CUDACellIterator{DH <: Ferrite.AbstractGPUDofHandler, GRID <: Ferrite.AbstractGPUGrid, KDynamicSharedMem, FDynamicSharedMem} <: Ferrite.AbstractKernelCellIterator
dh::DH # TODO: subdofhandlers are not supported yet.
grid::GRID
n_cells::Int32
@@ -33,10 +33,10 @@ function Ferrite.CellIterator(dh::Ferrite.AbstractGPUDofHandler, n_basefuncs::Int32)
grid = get_grid(dh)
n_cells = grid |> getncells |> Int32
bd = blockDim().x
ke_shared =@cuDynamicSharedMem(Float32, (bd, n_basefuncs, n_basefuncs))
ke_shared = @cuDynamicSharedMem(Float32, (bd, n_basefuncs, n_basefuncs))
fe_shared = @cuDynamicSharedMem(Float32, (bd, n_basefuncs), sizeof(Float32) * bd * n_basefuncs * n_basefuncs)
local_thread_id = threadIdx().x
CUDACellIterator(dh, grid, n_cells, ke_shared, fe_shared, local_thread_id)
return CUDACellIterator(dh, grid, n_cells, ke_shared, fe_shared, local_thread_id)
end
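
# The stride mentioned in the docstring at the top of this file can be
# sketched as follows (a simplified sketch of the elided iteration logic,
# assuming the standard grid-stride pattern, not the verbatim implementation).
# Each thread starts at its global id and hops by the total number of
# launched threads, so a fixed-size launch covers any number of cells:
global_id = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
stride = blockDim().x * gridDim().x
for e in global_id:stride:n_cells
    # build the GPUCellCache for cell `e` and hand it to the kernel body
end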

"""
@@ -85,7 +85,7 @@ Arguments:
- `ke`: View into shared memory for the cell's stiffness matrix.
- `fe`: View into shared memory for the cell's force vector.
"""
struct GPUCellCache{DOFS <: AbstractVector{Int32},NN,NODES <: SVector{NN,Int32},X, COORDS<: SVector{X},KDynamicSharedMem,FDynamicSharedMem} <: Ferrite.AbstractKernelCellCache
struct GPUCellCache{DOFS <: AbstractVector{Int32}, NN, NODES <: SVector{NN, Int32}, X, COORDS <: SVector{X}, KDynamicSharedMem, FDynamicSharedMem} <: Ferrite.AbstractKernelCellCache
coords::COORDS
dofs::DOFS
cellid::Int32
@@ -117,7 +117,7 @@ function _makecache(iterator::CUDACellIterator, e::Int32)
coords = SVector(x...)

# Return the GPUCellCache containing the cell's data.
return GPUCellCache(coords, dofs, cellid, nodes, (@view iterator.block_ke[iterator.thread_id, :, :]), (@view iterator.block_fe[iterator.thread_id, :, :]))
end

"""
@@ -185,7 +185,7 @@ Returns:
"""
@inline function Ferrite.cellke(cc::GPUCellCache)
ke = cc.ke
fill!(ke, 0.0f0)
return fill!(ke, 0.0f0)
end

"""
@@ -201,5 +201,5 @@ Returns:
"""
@inline function Ferrite.cellfe(cc::GPUCellCache)
fe = cc.fe
fill!(fe, 0.0f0)
return fill!(fe, 0.0f0)
end
18 changes: 10 additions & 8 deletions ext/GPU/gpu_assembler.jl
@@ -1,19 +1,19 @@
## GPU Assembler ###
### First abstract types and interfaces ###

abstract type AbstractGPUSparseAssembler{Tv,Ti} end
abstract type AbstractGPUSparseAssembler{Tv, Ti} end

function Ferrite.assemble!(A::AbstractGPUSparseAssembler, dofs::AbstractVector{Int32}, Ke::MATRIX, fe::VECTOR) where {MATRIX, VECTOR}
throw(NotImplementedError("A concrete implementation of assemble! is required"))
end


struct GPUAssemblerSparsityPattern{Tv,Ti,VEC_FLOAT<:AbstractVector{Tv},SPARSE_MAT<:AbstractSparseArray{Tv,Ti}} <: AbstractGPUSparseAssembler{Tv,Ti}
struct GPUAssemblerSparsityPattern{Tv, Ti, VEC_FLOAT <: AbstractVector{Tv}, SPARSE_MAT <: AbstractSparseArray{Tv, Ti}} <: AbstractGPUSparseAssembler{Tv, Ti}
K::SPARSE_MAT
f::VEC_FLOAT
end

function Ferrite.start_assemble(K::CUSPARSE.CuSparseDeviceMatrixCSC{Tv,Ti}, f::CuDeviceVector{Tv}) where {Tv,Ti}
function Ferrite.start_assemble(K::CUSPARSE.CuSparseDeviceMatrixCSC{Tv, Ti}, f::CuDeviceVector{Tv}) where {Tv, Ti}
return GPUAssemblerSparsityPattern(K, f)
end

@@ -26,26 +26,27 @@ Assembles the element stiffness matrix `Ke` and the element force vector `fe` into the global system.
"""
@propagate_inbounds function Ferrite.assemble!(A::GPUAssemblerSparsityPattern, dofs::AbstractVector{Int32}, Ke::MATRIX, fe::VECTOR) where {MATRIX, VECTOR}
# Note: MATRIX and VECTOR are cuda dynamic shared memory
_assemble!(A, dofs, Ke, fe)
return _assemble!(A, dofs, Ke, fe)
end


function _assemble!(A::GPUAssemblerSparsityPattern, dofs::AbstractVector{Int32}, Ke::MATRIX, fe::VECTOR) where {MATRIX, VECTOR}
# Brute force assembly
K = A.K
f = A.f
for i = 1:length(dofs)
for i in 1:length(dofs)
ig = dofs[i]
CUDA.@atomic f[ig] += fe[i]
for j = 1:length(dofs)
for j in 1:length(dofs)
jg = dofs[j]
# set the value of the global matrix
_add_to_index!(K, Ke[i,j], ig, jg)
_add_to_index!(K, Ke[i, j], ig, jg)
end
end
return
end

@inline function _add_to_index!(K::AbstractSparseArray{Tv,Ti}, v::Tv, i::Int32, j::Int32) where {Tv,Ti}
@inline function _add_to_index!(K::AbstractSparseArray{Tv, Ti}, v::Tv, i::Int32, j::Int32) where {Tv, Ti}
col_start = K.colPtr[j]
col_end = K.colPtr[j + Int32(1)] - Int32(1)

@@ -56,4 +57,5 @@ end
return
end
end
return
end
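
# The hunk above elides the body of the column search in `_add_to_index!`; a
# sketch of what it presumably does, based on the visible `colPtr` setup and
# the CSC layout (`rowVal` and `nzVal` are the standard fields of
# `CuSparseDeviceMatrixCSC`). Scan the stored rows of column j and atomically
# accumulate into the matching entry; values outside the sparsity pattern
# are skipped:
for k in col_start:col_end
    if K.rowVal[k] == i
        CUDA.@atomic K.nzVal[k] += v
        return
    end
end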