precommit fix
Abdelrahman912 committed Nov 6, 2024
1 parent 6300a4a commit b7301c2
Showing 22 changed files with 177 additions and 191 deletions.
51 changes: 22 additions & 29 deletions docs/src/literate-tutorials/gpu_qp_heat_equation.jl
@@ -3,27 +3,21 @@ using StaticArrays
using SparseArrays


left = Tensor{1, 2, Float32}((0, -0)) # define the left bottom corner of the grid.

right = Tensor{1, 2, Float32}((100.0, 100.0)) # define the right top corner of the grid.



left = Tensor{1,2,Float32}((0,-0)) # define the left bottom corner of the grid.

right = Tensor{1,2,Float32}((100.0,100.0)) # define the right top corner of the grid.


grid = generate_grid(Quadrilateral, (100, 100),left,right)
grid = generate_grid(Quadrilateral, (100, 100), left, right)


ip = Lagrange{RefQuadrilateral, 1}() # define the interpolation function (i.e. bilinear Lagrange)



qr = QuadratureRule{RefQuadrilateral}(Float32,2)
qr = QuadratureRule{RefQuadrilateral}(Float32, 2)



cellvalues = CellValues(Float32,qr, ip)
cellvalues = CellValues(Float32, qr, ip)


dh = DofHandler(grid)
@@ -43,7 +37,7 @@ function assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues)
dΩ = getdetJdV(cellvalues, q_point)
## Loop over test shape functions
for i in 1:n_basefuncs
δu = shape_value(cellvalues, q_point, i)
∇δu = shape_gradient(cellvalues, q_point, i)
## Add contribution to fe
fe[i] += δu * dΩ
@@ -59,7 +53,6 @@ function assemble_element_std!(Ke::Matrix, fe::Vector, cellvalues::CellValues)
end



function create_buffers(cellvalues, dh)
f = zeros(ndofs(dh))
K = allocate_matrix(dh)
@@ -68,21 +61,21 @@ function create_buffers(cellvalues, dh)
n_basefuncs = getnbasefunctions(cellvalues)
Ke = zeros(n_basefuncs, n_basefuncs)
fe = zeros(n_basefuncs)
return (;f, K, assembler, Ke, fe)
return (; f, K, assembler, Ke, fe)
end


# Standard global assembly

function assemble_global!(cellvalues, dh::DofHandler,qp_iter::Val{QPiter}) where {QPiter}
(;f, K, assembler, Ke, fe) = create_buffers(cellvalues,dh)
function assemble_global!(cellvalues, dh::DofHandler, qp_iter::Val{QPiter}) where {QPiter}
(; f, K, assembler, Ke, fe) = create_buffers(cellvalues, dh)
## Loop over all cells
for cell in CellIterator(dh)
fill!(Ke, 0)
fill!(fe, 0)
if QPiter
## reinit!(cellvalues, getcoordinates(cell))
assemble_element_qpiter!(Ke, fe, cellvalues,getcoordinates(cell))
assemble_element_qpiter!(Ke, fe, cellvalues, getcoordinates(cell))
else
## Reinitialize cellvalues for this cell
reinit!(cellvalues, cell)
@@ -96,18 +89,17 @@ function assemble_global!(cellvalues, dh::DofHandler,qp_iter::Val{QPiter}) where {QPiter}
end



## gpu version of element assembly


function assemble_element!(Ke,fe,cv,cell)
function assemble_element!(Ke, fe, cv, cell)
n_basefuncs = getnbasefunctions(cv)
for qv in Ferrite.QuadratureValuesIterator(cv,getcoordinates(cell))
for qv in Ferrite.QuadratureValuesIterator(cv, getcoordinates(cell))
## Get the quadrature weight
dΩ = getdetJdV(qv)
## Loop over test shape functions
for i in 1:n_basefuncs
δu = shape_value(qv, i)
∇δu = shape_gradient(qv, i)
## Add contribution to fe
fe[i] += δu * dΩ
@@ -116,18 +108,19 @@ function assemble_element!(Ke,fe,cv,cell)
for j in 1:n_basefuncs
∇u = shape_gradient(qv, j)
## Add contribution to Ke
Ke[i,j] += (∇δu ⋅ ∇u) * dΩ
Ke[i, j] += (∇δu ⋅ ∇u) * dΩ
end
end
end
return
end


# gpu version of global assembly
function assemble_gpu!(Kgpu,fgpu, cv, dh)
function assemble_gpu!(Kgpu, fgpu, cv, dh)
n_basefuncs = getnbasefunctions(cv)
assembler = start_assemble(Kgpu, fgpu;fillzero=false)
for cell in CellIterator(dh, convert(Int32,n_basefuncs))
assembler = start_assemble(Kgpu, fgpu; fillzero = false)
for cell in CellIterator(dh, convert(Int32, n_basefuncs))
Ke = cellke(cell)
fe = cellfe(cell)
assemble_element!(Ke, fe, cv, cell)
@@ -142,7 +135,7 @@ n_basefuncs = getnbasefunctions(cellvalues)
## Allocate CPU matrix
## K = allocate_matrix(SparseMatrixCSC{Float32, Int32},dh);

K = allocate_matrix(SparseMatrixCSC{Float64, Int64},dh);
K = allocate_matrix(SparseMatrixCSC{Float64, Int64}, dh);
f = zeros(ndofs(dh));

# Allocate GPU matrix
@@ -155,13 +148,13 @@ n_cells = dh |> get_grid |> getncells
# Kernel configuration
## commented to pass the test
##init_kernel(BackendCUDA,n_cells,n_basefuncs,assemble_gpu!, (Kgpu,fgpu, cellvalues, dh)) |> launch!
cpu_kernel = init_kernel(BackendCPU,n_cells,n_basefuncs,assemble_gpu!, (K,f, cellvalues, dh));
cpu_kernel = init_kernel(BackendCPU, n_cells, n_basefuncs, assemble_gpu!, (K, f, cellvalues, dh));
cpu_kernel()

stassy(cv,dh) = assemble_global!(cv,dh,Val(false))
stassy(cv, dh) = assemble_global!(cv, dh, Val(false))

norm(K)
## commented to pass the test
## norm(Kgpu)
Kstd , Fstd = stassy(cellvalues,dh);
Kstd, Fstd = stassy(cellvalues, dh);
norm(Kstd)
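
## A quick end-to-end check (a minimal sketch, not part of the commit; it
## assumes `norm` comes from LinearAlgebra, which the calls above already
## rely on): the kernel-based assembly and the standard assembly should
## agree up to floating-point roundoff. The loose tolerance allows for
## Float32 accumulation in the kernel path.
@assert isapprox(norm(K), norm(Kstd); rtol = 1.0e-4)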
16 changes: 7 additions & 9 deletions ext/GPU/CUDAKernelLauncher.jl
@@ -1,6 +1,4 @@


function Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti<: Integer}
function Ferrite.init_kernel(::Type{BackendCUDA}, n_cells::Ti, n_basefuncs::Ti, kernel::Function, args::Tuple) where {Ti <: Integer}
if CUDA.functional()
return LazyKernel(n_cells, n_basefuncs, kernel, args, BackendCUDA)
else
@@ -17,22 +15,22 @@ Launch a CUDA kernel with the given configuration.
Arguments:
- `kernel_config`: The `CUDAKernelLauncher` object containing the higher-level fields for the kernel configuration.
"""
function Ferrite.launch!(kernel::LazyKernel{Ti,BackendCUDA}) where Ti
function Ferrite.launch!(kernel::LazyKernel{Ti, BackendCUDA}) where {Ti}
n_cells = kernel.n_cells
n_basefuncs = kernel.n_basefuncs
ker = kernel.kernel
args = kernel.args
kernel = @cuda launch=false ker(args...)
kernel = @cuda launch = false ker(args...)
config = launch_configuration(kernel.fun)
threads = convert(Ti,min(n_cells, config.threads,256))
shared_mem = _calculate_shared_memory(threads ,n_basefuncs)
threads = convert(Ti, min(n_cells, config.threads, 256))
shared_mem = _calculate_shared_memory(threads, n_basefuncs)
blocks = _calculate_nblocks(threads, n_cells)
kernel(args...; threads, blocks, shmem=shared_mem)
return kernel(args...; threads, blocks, shmem = shared_mem)
end
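
# As a worked example of the launch arithmetic above (illustrative numbers,
# not from the commit, and assuming `_calculate_nblocks` performs ceiling
# division over the cell count):
n_cells_ex = Int32(10_000)                             # hypothetical cell count
threads_ex = min(n_cells_ex, Int32(1024), Int32(256))  # -> 256: hard cap at 256 threads
blocks_ex = cld(n_cells_ex, threads_ex)                # -> 40 blocks cover all cells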


function _calculate_shared_memory(threads::Integer, n_basefuncs::Integer)
return sizeof(Float32) * (threads) * ( n_basefuncs) * n_basefuncs + sizeof(Float32) * (threads) * n_basefuncs
return sizeof(Float32) * (threads) * (n_basefuncs) * n_basefuncs + sizeof(Float32) * (threads) * n_basefuncs
end
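
# Plugging illustrative numbers into this sizing formula (an assumption for
# concreteness: 256 threads per block, 4 basis functions for bilinear
# quadrilaterals), each thread stores a 4×4 `Ke` block and a length-4 `fe` block:
threads_ex, n_basefuncs_ex = 256, 4
ke_bytes = sizeof(Float32) * threads_ex * n_basefuncs_ex * n_basefuncs_ex  # 16384
fe_bytes = sizeof(Float32) * threads_ex * n_basefuncs_ex                   # 4096
ke_bytes + fe_bytes  # 20480 bytes, well under the 48 KiB default per-block limit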


29 changes: 13 additions & 16 deletions ext/GPU/adapt.jl
@@ -1,54 +1,51 @@
# This file defines the adapt_structure function, which is used to adapt custom structures to be used on the GPU.
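
# A minimal usage sketch (hypothetical, not part of the diff): `Adapt.adapt`
# dispatches to the `adapt_structure` methods below when kernel arguments are
# converted for the device, so a `CellValues` built as in the tutorial becomes
# its static, GPU-friendly counterpart:
static_cv = Adapt.adapt(CuArray, cellvalues)  # -> Ferrite.StaticCellValues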

function Adapt.adapt_structure(to,cv::CellValues)
fv = Adapt.adapt(to,StaticInterpolationValues(cv.fun_values))
gm =Adapt.adapt(to,StaticInterpolationValues(cv.geo_mapping))
weights =Adapt.adapt(to, ntuple(i -> getweights(cv.qr)[i], getnquadpoints(cv)))
Ferrite.StaticCellValues(fv,gm, weights)
function Adapt.adapt_structure(to, cv::CellValues)
fv = Adapt.adapt(to, StaticInterpolationValues(cv.fun_values))
gm = Adapt.adapt(to, StaticInterpolationValues(cv.geo_mapping))
weights = Adapt.adapt(to, ntuple(i -> getweights(cv.qr)[i], getnquadpoints(cv)))
return Ferrite.StaticCellValues(fv, gm, weights)
end


function Adapt.adapt_structure(to, iter::Ferrite.QuadratureValuesIterator)
cv = Adapt.adapt_structure(to, iter.v)
cell_coords = Adapt.adapt_structure(to, iter.cell_coords)
Ferrite.QuadratureValuesIterator(cv,cell_coords)
return Ferrite.QuadratureValuesIterator(cv, cell_coords)
end

function Adapt.adapt_structure(to, qv::Ferrite.StaticQuadratureValues)
det = Adapt.adapt_structure(to, qv.detJdV)
N = Adapt.adapt_structure(to, qv.N)
dNdx = Adapt.adapt_structure(to, qv.dNdx)
M = Adapt.adapt_structure(to, qv.M)
Ferrite.StaticQuadratureValues(det,N,dNdx,M)
return Ferrite.StaticQuadratureValues(det, N, dNdx, M)
end
function Adapt.adapt_structure(to, qv::StaticQuadratureView)
mapping = Adapt.adapt_structure(to, qv.mapping)
cell_coords = Adapt.adapt_structure(to, qv.cell_coords |> cu)
q_point = Adapt.adapt_structure(to, qv.q_point)
cv = Adapt.adapt_structure(to, qv.cv)
StaticQuadratureView(mapping,cell_coords,q_point,cv)
return StaticQuadratureView(mapping, cell_coords, q_point, cv)
end

function Adapt.adapt_structure(to, grid::Grid)
# map Int64 to Int32 to reduce number of registers
cu_cells = grid.cells .|> (x -> Int32.(x.nodes)) .|> Quadrilateral |> cu
cells = Adapt.adapt_structure(to, cu_cells)
nodes = Adapt.adapt_structure(to, cu(grid.nodes))
GPUGrid(cells,nodes)
return GPUGrid(cells, nodes)
end


function Adapt.adapt_structure(to,iterator::CUDACellIterator)
function Adapt.adapt_structure(to, iterator::CUDACellIterator)
grid = Adapt.adapt_structure(to, iterator.grid)
dh = Adapt.adapt_structure(to, iterator.dh)
ncells = Adapt.adapt_structure(to, iterator.n_cells)
GPUCellIterator(dh,grid, ncells)
return GPUCellIterator(dh, grid, ncells)
end





function _get_ndofs_cell(dh::DofHandler)
ndofs_cell = [Int32(Ferrite.ndofs_per_cell(dh, i)) for i in 1:(dh |> Ferrite.get_grid |> Ferrite.getncells)]
return ndofs_cell
@@ -69,12 +66,12 @@ function Adapt.adapt_structure(to, dh::DofHandler)
offsets = Adapt.adapt_structure(to, dh.cell_dofs_offset .|> Int32 |> cu)
nodes = Adapt.adapt_structure(to, dh.grid.nodes |> cu)
ndofs_cell = Adapt.adapt_structure(to, _get_ndofs_cell(dh) |> cu)
GPUDofHandler(cell_dofs, GPUGrid(cells,nodes),offsets, ndofs_cell)
return GPUDofHandler(cell_dofs, GPUGrid(cells, nodes), offsets, ndofs_cell)
end


function Adapt.adapt_structure(to, assembler::GPUAssemblerSparsityPattern)
K = Adapt.adapt_structure(to, assembler.K)
f = Adapt.adapt_structure(to, assembler.f)
Ferrite.GPUAssemblerSparsityPattern(K, f)
return Ferrite.GPUAssemblerSparsityPattern(K, f)
end
14 changes: 7 additions & 7 deletions ext/GPU/cuda_iterator.jl
@@ -7,7 +7,7 @@ Create `CUDACellIterator` object for each thread with local id `thread_id` in order to iterate over cells
on the GPU; these elements are associated with the thread based on a stride = `blockDim().x * gridDim().x`.
The elements of the iterator are `GPUCellCache` objects.
"""
struct CUDACellIterator{DH<:Ferrite.AbstractGPUDofHandler,GRID<: Ferrite.AbstractGPUGrid,KDynamicSharedMem,FDynamicSharedMem} <: Ferrite.AbstractKernelCellIterator
struct CUDACellIterator{DH <: Ferrite.AbstractGPUDofHandler, GRID <: Ferrite.AbstractGPUGrid, KDynamicSharedMem, FDynamicSharedMem} <: Ferrite.AbstractKernelCellIterator
dh::DH # TODO: subdofhandlers are not supported yet.
grid::GRID
n_cells::Int32
@@ -33,10 +33,10 @@ function Ferrite.CellIterator(dh::Ferrite.AbstractGPUDofHandler, n_basefuncs::Int32)
grid = get_grid(dh)
n_cells = grid |> getncells |> Int32
bd = blockDim().x
ke_shared =@cuDynamicSharedMem(Float32, (bd, n_basefuncs, n_basefuncs))
ke_shared = @cuDynamicSharedMem(Float32, (bd, n_basefuncs, n_basefuncs))
fe_shared = @cuDynamicSharedMem(Float32, (bd, n_basefuncs), sizeof(Float32) * bd * n_basefuncs * n_basefuncs)
local_thread_id = threadIdx().x
CUDACellIterator(dh, grid, n_cells, ke_shared, fe_shared, local_thread_id)
return CUDACellIterator(dh, grid, n_cells, ke_shared, fe_shared, local_thread_id)
end
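
# The stride mentioned in the docstring at the top of this file can be
# sketched as follows (a simplified sketch of the elided iteration logic,
# assuming the standard grid-stride pattern, not the verbatim implementation).
# Each thread starts at its global id and hops by the total number of
# launched threads, so a fixed-size launch covers any number of cells:
global_id = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
stride = blockDim().x * gridDim().x
for e in global_id:stride:n_cells
    # build the GPUCellCache for cell `e` and hand it to the kernel body
end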

"""
@@ -85,7 +85,7 @@ Arguments:
- `ke`: View into shared memory for the cell's stiffness matrix.
- `fe`: View into shared memory for the cell's force vector.
"""
struct GPUCellCache{DOFS <: AbstractVector{Int32},NN,NODES <: SVector{NN,Int32},X, COORDS<: SVector{X},KDynamicSharedMem,FDynamicSharedMem} <: Ferrite.AbstractKernelCellCache
struct GPUCellCache{DOFS <: AbstractVector{Int32}, NN, NODES <: SVector{NN, Int32}, X, COORDS <: SVector{X}, KDynamicSharedMem, FDynamicSharedMem} <: Ferrite.AbstractKernelCellCache
coords::COORDS
dofs::DOFS
cellid::Int32
@@ -117,7 +117,7 @@ function _makecache(iterator::CUDACellIterator, e::Int32)
coords = SVector(x...)

# Return the GPUCellCache containing the cell's data.
return GPUCellCache(coords, dofs, cellid, nodes, (@view iterator.block_ke[iterator.thread_id, :, :]), (@view iterator.block_fe[iterator.thread_id, :, :]))
end

"""
@@ -185,7 +185,7 @@ Returns:
"""
@inline function Ferrite.cellke(cc::GPUCellCache)
ke = cc.ke
fill!(ke, 0.0f0)
return fill!(ke, 0.0f0)
end

"""
@@ -201,5 +201,5 @@ Returns:
"""
@inline function Ferrite.cellfe(cc::GPUCellCache)
fe = cc.fe
fill!(fe, 0.0f0)
return fill!(fe, 0.0f0)
end
18 changes: 10 additions & 8 deletions ext/GPU/gpu_assembler.jl
@@ -1,19 +1,19 @@
## GPU Assembler ###
### First abstract types and interfaces ###

abstract type AbstractGPUSparseAssembler{Tv,Ti} end
abstract type AbstractGPUSparseAssembler{Tv, Ti} end

function Ferrite.assemble!(A::AbstractGPUSparseAssembler, dofs::AbstractVector{Int32}, Ke::MATRIX, fe::VECTOR) where {MATRIX, VECTOR}
throw(NotImplementedError("A concrete implementation of assemble! is required"))
end


struct GPUAssemblerSparsityPattern{Tv,Ti,VEC_FLOAT<:AbstractVector{Tv},SPARSE_MAT<:AbstractSparseArray{Tv,Ti}} <: AbstractGPUSparseAssembler{Tv,Ti}
struct GPUAssemblerSparsityPattern{Tv, Ti, VEC_FLOAT <: AbstractVector{Tv}, SPARSE_MAT <: AbstractSparseArray{Tv, Ti}} <: AbstractGPUSparseAssembler{Tv, Ti}
K::SPARSE_MAT
f::VEC_FLOAT
end

function Ferrite.start_assemble(K::CUSPARSE.CuSparseDeviceMatrixCSC{Tv,Ti}, f::CuDeviceVector{Tv}) where {Tv,Ti}
function Ferrite.start_assemble(K::CUSPARSE.CuSparseDeviceMatrixCSC{Tv, Ti}, f::CuDeviceVector{Tv}) where {Tv, Ti}
return GPUAssemblerSparsityPattern(K, f)
end

@@ -26,26 +26,27 @@ Assembles the element stiffness matrix `Ke` and the element force vector `fe` into the global system.
"""
@propagate_inbounds function Ferrite.assemble!(A::GPUAssemblerSparsityPattern, dofs::AbstractVector{Int32}, Ke::MATRIX, fe::VECTOR) where {MATRIX, VECTOR}
# Note: MATRIX and VECTOR are cuda dynamic shared memory
_assemble!(A, dofs, Ke, fe)
return _assemble!(A, dofs, Ke, fe)
end


function _assemble!(A::GPUAssemblerSparsityPattern, dofs::AbstractVector{Int32}, Ke::MATRIX, fe::VECTOR) where {MATRIX, VECTOR}
# Brute force assembly
K = A.K
f = A.f
for i = 1:length(dofs)
for i in 1:length(dofs)
ig = dofs[i]
CUDA.@atomic f[ig] += fe[i]
for j = 1:length(dofs)
for j in 1:length(dofs)
jg = dofs[j]
# set the value of the global matrix
_add_to_index!(K, Ke[i,j], ig, jg)
_add_to_index!(K, Ke[i, j], ig, jg)
end
end
return
end

@inline function _add_to_index!(K::AbstractSparseArray{Tv,Ti}, v::Tv, i::Int32, j::Int32) where {Tv,Ti}
@inline function _add_to_index!(K::AbstractSparseArray{Tv, Ti}, v::Tv, i::Int32, j::Int32) where {Tv, Ti}
col_start = K.colPtr[j]
col_end = K.colPtr[j + Int32(1)] - Int32(1)

@@ -56,4 +57,5 @@ end
return
end
end
return
end
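
# The hunk above elides the body of the column search in `_add_to_index!`; a
# sketch of what it presumably does, based on the visible `colPtr` setup and
# the CSC layout (`rowVal` and `nzVal` are the standard fields of
# `CuSparseDeviceMatrixCSC`). Scan the stored rows of column j and atomically
# accumulate into the matching entry; values outside the sparsity pattern
# are skipped:
for k in col_start:col_end
    if K.rowVal[k] == i
        CUDA.@atomic K.nzVal[k] += v
        return
    end
end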