From 65aabfde4ff8a435fc6534abe63df067f31ddd7c Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Wed, 22 Feb 2023 01:01:03 -0500 Subject: [PATCH 01/18] Adding copyto for non-contigous matrices and vectors --- src/array.jl | 97 +++++++++++++++++++++++++++++++++++++++++++++++++-- test/array.jl | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 192 insertions(+), 2 deletions(-) diff --git a/src/array.jl b/src/array.jl index 652792becf..6028f5c163 100644 --- a/src/array.jl +++ b/src/array.jl @@ -45,7 +45,7 @@ function explain_eltype(@nospecialize(T), depth=0; maxdepth=10) msg = " "^depth * "$T is a mutable type\n" elseif hasfieldcount(T) msg = " "^depth * "$T is a struct that's not allocated inline\n" - for U in fieldtypes(T) + for U in fieldtypes(dt) if !Base.allocatedinline(U) msg *= explain_nonisbits(U, depth+1) end @@ -300,7 +300,8 @@ end export DenseCuArray, DenseCuVector, DenseCuMatrix, DenseCuVecOrMat, StridedCuArray, StridedCuVector, StridedCuMatrix, StridedCuVecOrMat, - AnyCuArray, AnyCuVector, AnyCuMatrix, AnyCuVecOrMat + AnyCuArray, AnyCuVector, AnyCuMatrix, AnyCuVecOrMat, + StridedGeneralArray, StridedGeneralMatrix, StridedGeneralVector # dense arrays: stored contiguously in memory # @@ -324,6 +325,11 @@ const StridedCuVector{T} = StridedCuArray{T,1} const StridedCuMatrix{T} = StridedCuArray{T,2} const StridedCuVecOrMat{T} = Union{StridedCuVector{T}, StridedCuMatrix{T}} +#union of strided CuArrays and Arrays +const StridedGeneralArray{T,N} = Union{StridedCuArray{T,N}, StridedArray{T,N}} +const StridedGeneralVector{T} = StridedGeneralArray{T,1} +const StridedGeneralMatrix{T} = StridedGeneralArray{T,2} + Base.pointer(x::StridedCuArray{T}) where {T} = Base.unsafe_convert(CuPtr{T}, x) @inline function Base.pointer(x::StridedCuArray{T}, i::Integer) where T Base.unsafe_convert(CuPtr{T}, x) + Base._memory_offset(x, i) @@ -431,6 +437,93 @@ end Base.copyto!(dest::DenseCuArray{T}, src::DenseCuArray{T}) where {T} = copyto!(dest, 1, src, 1, length(src)) + +#TO DO: expand this for StridedMatrices of different shapes, currently the src needs to fit in the destination +#TO DO: add parameters doffs, soffs, n +function copyto_views!(dest::StridedGeneralMatrix{T},src::StridedGeneralMatrix{T}, + dest_location, src_location) where T #to do: locations need to be typed as Type{<:AbstractBuffer} from CUDA.jl/lib/cudadrv/memory.jl + src_step_x=step(src.indices[1]) + dest_step_x=step(dest.indices[1]) + src_step_height=step(src.indices[2]) + dest_step_height=step(dest.indices[2]) + src_parent_size=size(parent(src)) + dest_parent_size=size(parent(dest)) + + @boundscheck checkbounds(view(dest,1,:), 1:length(src.indices[2])) + @boundscheck checkbounds(view(dest,:,1), 1: length(src.indices[1])) + + #Non-contigous views can be accomodated by copy3d in certain cases + if isinteger(src_parent_size[1]*src_step_height/src_step_x) && isinteger(dest_parent_size[1]*dest_step_height/dest_step_x) + Mem.unsafe_copy3d!(pointer(dest), dest_location, pointer(src), src_location, + 1, size(src,1), size(src,2); + srcPos=(1,1,1), dstPos=(1,1,1), + srcPitch=src_step_x*sizeof(T),srcHeight=Int(src_parent_size[1]*src_step_height/src_step_x), + dstPitch=dest_step_x*sizeof(T), dstHeight=Int(dest_parent_size[1]*dest_step_height/dest_step_x)) + #In other cases, use parallel threads + else + CUDA.synchronize() + @sync for col in 1:length(src.indices[2]) + Threads.@spawn begin + println(" Thread index "*string(col)) + Mem.unsafe_copy3d!(pointer(view(dest,:,col)), dest_location, pointer(view(src,:,col)), src_location, + 1, 1, size(src,1); + srcPos=(1,1,1), dstPos=(1,1,1), + srcPitch=sizeof(T)*src_step_x,srcHeight=1, + dstPitch=sizeof(T)*dest_step_x, dstHeight=1) + CUDA.synchronize() + end + end + end + return dest +end + +Base.copyto!(dest::StridedCuMatrix{T}, src::StridedMatrix{T} ) where {T} = + copyto_views!(dest,src,Mem.Device,Mem.Host) + +Base.copyto!(dest::StridedCuMatrix{T}, src::StridedMatrix{T} ) where {T} = + copyto_views!(dest,src,Mem.Host,Mem.Device) + +Base.copyto!(dest::StridedCuMatrix{T}, src::StridedCuMatrix{T} ) where {T} = + copyto_views!(dest,src,Mem.Device,Mem.Device) + +function copyto_views!(dest::StridedGeneralVector{T},doffs::Integer,src::StridedGeneralVector{T}, soffs::Integer, + n::Integer,dest_location, src_location) where T #to do: locations need to be typed as Type{<:AbstractBuffer} from CUDA.jl/lib/cudadrv/memory.jl + n==0 && return dest + @boundscheck checkbounds(dest, doffs) + @boundscheck checkbounds(dest, doffs+n-1) + @boundscheck checkbounds(src, soffs) + @boundscheck checkbounds(src, soffs+n-1) + + src_step=step(src.indices) + dest_step=step(dest.indices) + + Mem.unsafe_copy3d!(pointer(dest), dest_location, pointer(src), src_location, + 1, n, 1; + srcPos=(1,soffs,1), dstPos=(1,doffs,1), + srcPitch=src_step*sizeof(T),srcHeight=1, + dstPitch=dest_step*sizeof(T), dstHeight=1) + return dest +end + +Base.copyto!(dest::StridedCuVector{T}, doffs::Integer, src::StridedVector{T}, soffs::Integer, n::Integer) where {T} = + copyto_views!(dest,doffs,src,soffs,n,Mem.Device,Mem.Host) + +Base.copyto!(dest::StridedVector{T}, doffs::Integer, src::StridedCuVector{T}, soffs::Integer, n::Integer) where {T} = + copyto_views!(dest,doffs,src,soffs,n,Mem.Host,Mem.Device) + +Base.copyto!(dest::StridedCuVector{T}, doffs::Integer, src::StridedCuVector{T}, soffs::Integer, n::Integer) where {T} = + copyto_views!(dest,doffs,src,soffs,n,Mem.Device,Mem.Device) + +Base.copyto!(dest::StridedCuArray{T}, src::StridedArray{T}) where {T} = + copyto!(dest, 1, src, 1, length(src)) + +Base.copyto!(dest::StridedArray{T}, src::StridedCuArray{T}) where {T} = +copyto!(dest, 1, src, 1, length(src)) + +Base.copyto!(dest::StridedCuArray{T}, src::StridedCuArray{T}) where {T} = +copyto!(dest, 1, src, 1, length(src)) + + # general case: use CUDA APIs # NOTE: we only switch contexts here to avoid illegal memory accesses. synchronization is diff --git a/test/array.jl b/test/array.jl index 8ef7cf779f..0438911258 100644 --- a/test/array.jl +++ b/test/array.jl @@ -315,6 +315,103 @@ end @test view(b, :, 1, :) isa StridedCuArray end +@testset "elty = $elty" for elty in [Float32, Float64, ComplexF32, ComplexF64] + @testset "copyto StridedCuArray" begin + n=17 + m=11 + k=23 + l=19 + + #From GPU to CPU + gpu_matrix = CUDA.rand(elty, m,n) + cpu_matrix = rand(elty,l,k) + gpu_view= view(gpu_matrix, 2:3:11, 3:2:11) + cpu_view= view(cpu_matrix,1:5:16, 4:4:20) + copyto!(cpu_view,gpu_view) + @test collect(gpu_view) == cpu_view + + gpu_matrix = CUDA.rand(elty, m,n) + cpu_matrix = rand(elty,l,k) + gpu_view= view(gpu_matrix, :, :) + cpu_view= view(cpu_matrix,1:m, 1:n) + copyto!(cpu_view,gpu_view) + @test collect(gpu_view) == cpu_view + + gpu_vec = CUDA.rand(elty, m) + cpu_vec = rand(elty,l) + gpu_view= view(gpu_vec, 2:3:11) + cpu_view= view(cpu_vec,1:5:16) + copyto!(cpu_view,gpu_view) + @test collect(gpu_view) == cpu_view + + gpu_vec = CUDA.rand(elty, m) + cpu_vec = rand(elty,l) + gpu_view= view(gpu_vec, :) + cpu_view= view(cpu_vec,1:m) + copyto!(cpu_view,gpu_view) + @test collect(gpu_view) == cpu_view + + #From CPU to GPU + gpu_matrix = CUDA.rand(elty, m,n) + cpu_matrix = rand(elty,l,k) + gpu_view= view(gpu_matrix, 2:3:11, 3:2:11) + cpu_view= view(cpu_matrix,1:5:16, 4:4:20) + copyto!(gpu_view,cpu_view) + @test collect(gpu_view) == cpu_view + + gpu_matrix = CUDA.rand(elty, m,n) + cpu_matrix = rand(elty,l,k) + gpu_view= view(gpu_matrix, :, :) + cpu_view= view(cpu_matrix,1:m, 1:n) + copyto!(gpu_view,cpu_view) + @test collect(gpu_view) == cpu_view + + gpu_vec = CUDA.rand(elty, m) + cpu_vec = rand(elty,l) + gpu_view= view(gpu_vec, 2:3:11) + cpu_view= view(cpu_vec,1:5:16) + copyto!(gpu_view,cpu_view) + @test collect(gpu_view) == cpu_view + + gpu_vec = CUDA.rand(elty, m) + cpu_vec = rand(elty,l) + gpu_view= view(gpu_vec, :) + cpu_view= view(cpu_vec,1:m) + copyto!(gpu_view,cpu_view) + @test collect(gpu_view) == cpu_view + + #From GPU to GPU + gpu_matrix = CUDA.rand(elty, m,n) + gpu_matrix2 = CUDA.rand(elty,l,k) + gpu_view= view(gpu_matrix, 2:3:11, 3:2:11) + gpu_view2= view(gpu_matrix2,1:5:16, 4:4:20) + copyto!(gpu_view,gpu_view2) + @test collect(gpu_view) == cpu_view + + gpu_matrix = CUDA.rand(elty, m,n) + gpu_matrix2 = CUDA.rand(elty,l,k) + gpu_view= view(gpu_matrix,:, :) + gpu_view2= view(gpu_matrix2,1:m, 1:n) + copyto!(gpu_view,gpu_view2) + @test collect(gpu_view) == cpu_view + + gpu_vec = CUDA.rand(elty, m) + gpu_vec2 = CUDA.rand(elty,l) + gpu_view= view(gpu_vec, 2:3:11) + gpu_view2= view(gpu_vec2,1:5:16) + copyto!(gpu_view,gpu_view2) + @test collect(gpu_view) == cpu_view + + gpu_vec = CUDA.rand(elty, m) + gpu_vec2 = CUDA.rand(elty,l) + gpu_view= view(gpu_vec, :) + gpu_view2= view(gpu_vec2,1:m) + copyto!(gpu_view,gpu_view2) + @test collect(gpu_view) == cpu_view + + end +end + @testset "accumulate" begin for n in (0, 1, 2, 3, 10, 10_000, 16384, 16384+1) # small, large, odd & even, pow2 and not @test testf(x->accumulate(+, x), rand(n)) From e1f6c1b06f422c1f685da1ea6fb36bfb1a3c9e58 Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Wed, 22 Feb 2023 01:36:04 -0500 Subject: [PATCH 02/18] Removing typo --- src/array.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array.jl b/src/array.jl index 6028f5c163..348892445d 100644 --- a/src/array.jl +++ b/src/array.jl @@ -45,7 +45,7 @@ function explain_eltype(@nospecialize(T), depth=0; maxdepth=10) msg = " "^depth * "$T is a mutable type\n" elseif hasfieldcount(T) msg = " "^depth * "$T is a struct that's not allocated inline\n" - for U in fieldtypes(dt) + for U in fieldtypes(T) if !Base.allocatedinline(U) msg *= explain_nonisbits(U, depth+1) end From d3e93a73ef5528ce49263f8adb7c303f2f10546e Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Wed, 22 Feb 2023 17:39:55 -0500 Subject: [PATCH 03/18] Resolving weird syntax that caused compilation error --- src/array.jl | 139 +++++++++++++++++++++------------------------------ 1 file changed, 57 insertions(+), 82 deletions(-) diff --git a/src/array.jl b/src/array.jl index 348892445d..84676f0877 100644 --- a/src/array.jl +++ b/src/array.jl @@ -300,8 +300,7 @@ end export DenseCuArray, DenseCuVector, DenseCuMatrix, DenseCuVecOrMat, StridedCuArray, StridedCuVector, StridedCuMatrix, StridedCuVecOrMat, - AnyCuArray, AnyCuVector, AnyCuMatrix, AnyCuVecOrMat, - StridedGeneralArray, StridedGeneralMatrix, StridedGeneralVector + AnyCuArray, AnyCuVector, AnyCuMatrix, AnyCuVecOrMat # dense arrays: stored contiguously in memory # @@ -325,11 +324,6 @@ const StridedCuVector{T} = StridedCuArray{T,1} const StridedCuMatrix{T} = StridedCuArray{T,2} const StridedCuVecOrMat{T} = Union{StridedCuVector{T}, StridedCuMatrix{T}} -#union of strided CuArrays and Arrays -const StridedGeneralArray{T,N} = Union{StridedCuArray{T,N}, StridedArray{T,N}} -const StridedGeneralVector{T} = StridedGeneralArray{T,1} -const StridedGeneralMatrix{T} = StridedGeneralArray{T,2} - Base.pointer(x::StridedCuArray{T}) where {T} = Base.unsafe_convert(CuPtr{T}, x) @inline function Base.pointer(x::StridedCuArray{T}, i::Integer) where T Base.unsafe_convert(CuPtr{T}, x) + Base._memory_offset(x, i) @@ -435,94 +429,75 @@ function Base.copyto!(dest::DenseCuArray{T}, doffs::Integer, src::DenseCuArray{T end Base.copyto!(dest::DenseCuArray{T}, src::DenseCuArray{T}) where {T} = - copyto!(dest, 1, src, 1, length(src)) - +copyto!(dest, 1, src, 1, length(src)) #TO DO: expand this for StridedMatrices of different shapes, currently the src needs to fit in the destination #TO DO: add parameters doffs, soffs, n -function copyto_views!(dest::StridedGeneralMatrix{T},src::StridedGeneralMatrix{T}, - dest_location, src_location) where T #to do: locations need to be typed as Type{<:AbstractBuffer} from CUDA.jl/lib/cudadrv/memory.jl - src_step_x=step(src.indices[1]) - dest_step_x=step(dest.indices[1]) - src_step_height=step(src.indices[2]) - dest_step_height=step(dest.indices[2]) - src_parent_size=size(parent(src)) - dest_parent_size=size(parent(dest)) - - @boundscheck checkbounds(view(dest,1,:), 1:length(src.indices[2])) - @boundscheck checkbounds(view(dest,:,1), 1: length(src.indices[1])) - - #Non-contigous views can be accomodated by copy3d in certain cases - if isinteger(src_parent_size[1]*src_step_height/src_step_x) && isinteger(dest_parent_size[1]*dest_step_height/dest_step_x) - Mem.unsafe_copy3d!(pointer(dest), dest_location, pointer(src), src_location, - 1, size(src,1), size(src,2); - srcPos=(1,1,1), dstPos=(1,1,1), - srcPitch=src_step_x*sizeof(T),srcHeight=Int(src_parent_size[1]*src_step_height/src_step_x), - dstPitch=dest_step_x*sizeof(T), dstHeight=Int(dest_parent_size[1]*dest_step_height/dest_step_x)) - #In other cases, use parallel threads - else - CUDA.synchronize() - @sync for col in 1:length(src.indices[2]) - Threads.@spawn begin - println(" Thread index "*string(col)) - Mem.unsafe_copy3d!(pointer(view(dest,:,col)), dest_location, pointer(view(src,:,col)), src_location, - 1, 1, size(src,1); - srcPos=(1,1,1), dstPos=(1,1,1), - srcPitch=sizeof(T)*src_step_x,srcHeight=1, - dstPitch=sizeof(T)*dest_step_x, dstHeight=1) + +for (destType,srcType, destLocation, SrcLocation) in ((StridedSubCuArray, SubArray, Mem.Device,Mem.Host) , + (SubArray, StridedSubCuArray, Mem.Host,Mem.Device), + (StridedSubCuArray, StridedSubCuArray, Mem.Device,Mem.Device) ) + @eval begin + function Base.copyto!(dest::$destType{T,2,args1},src::$srcType{T,2,args2}) where {T,args1,args2} + src_step_x=step(src.indices[1]) + dest_step_x=step(dest.indices[1]) + src_step_height=step(src.indices[2]) + dest_step_height=step(dest.indices[2]) + src_parent_size=size(parent(src)) + dest_parent_size=size(parent(dest)) + + @boundscheck checkbounds(view(dest,1,:), 1:length(src.indices[2])) + @boundscheck checkbounds(view(dest,:,1), 1: length(src.indices[1])) + + #Non-contigous views can be accomodated by copy3d in certain cases + if isinteger(src_parent_size[1]*src_step_height/src_step_x) && isinteger(dest_parent_size[1]*dest_step_height/dest_step_x) + Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation, + 1, size(src,1), size(src,2); + srcPos=(1,1,1), dstPos=(1,1,1), + srcPitch=src_step_x*sizeof(T),srcHeight=Int(src_parent_size[1]*src_step_height/src_step_x), + dstPitch=dest_step_x*sizeof(T), dstHeight=Int(dest_parent_size[1]*dest_step_height/dest_step_x)) + #In other cases, use parallel threads + else CUDA.synchronize() + #@sync + for col in 1:length(src.indices[2]) + #Threads.@spawn begin + Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)), srcLocation, + 1, 1, size(src,1); + srcPos=(1,1,1), dstPos=(1,1,1), + srcPitch=sizeof(T)*src_step_x,srcHeight=1, + dstPitch=sizeof(T)*dest_step_x, dstHeight=1) + CUDA.synchronize() + #end + end end + return dest end - end - return dest -end -Base.copyto!(dest::StridedCuMatrix{T}, src::StridedMatrix{T} ) where {T} = - copyto_views!(dest,src,Mem.Device,Mem.Host) - -Base.copyto!(dest::StridedCuMatrix{T}, src::StridedMatrix{T} ) where {T} = - copyto_views!(dest,src,Mem.Host,Mem.Device) + function copyto_views!(dest::$destType{T,1,args1},doffs::Integer,src::$srcType{T,1,args2}, soffs::Integer, + n::Integer) where {T,args1,args2} + n==0 && return dest + @boundscheck checkbounds(dest, doffs) + @boundscheck checkbounds(dest, doffs+n-1) + @boundscheck checkbounds(src, soffs) + @boundscheck checkbounds(src, soffs+n-1) + + Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation, + 1, n, 1; + srcPos=(1,soffs,1), dstPos=(1,doffs,1), + srcPitch=src_step*sizeof(T),srcHeight=1, + dstPitch=dest_step*sizeof(T), dstHeight=1) + return dest + end -Base.copyto!(dest::StridedCuMatrix{T}, src::StridedCuMatrix{T} ) where {T} = - copyto_views!(dest,src,Mem.Device,Mem.Device) -function copyto_views!(dest::StridedGeneralVector{T},doffs::Integer,src::StridedGeneralVector{T}, soffs::Integer, - n::Integer,dest_location, src_location) where T #to do: locations need to be typed as Type{<:AbstractBuffer} from CUDA.jl/lib/cudadrv/memory.jl - n==0 && return dest - @boundscheck checkbounds(dest, doffs) - @boundscheck checkbounds(dest, doffs+n-1) - @boundscheck checkbounds(src, soffs) - @boundscheck checkbounds(src, soffs+n-1) - src_step=step(src.indices) - dest_step=step(dest.indices) + Base.copyto!(dest::$destType{T}, src::$srcType{T}) where {T} = + copyto!(dest, 1, src, 1, length(src)) - Mem.unsafe_copy3d!(pointer(dest), dest_location, pointer(src), src_location, - 1, n, 1; - srcPos=(1,soffs,1), dstPos=(1,doffs,1), - srcPitch=src_step*sizeof(T),srcHeight=1, - dstPitch=dest_step*sizeof(T), dstHeight=1) - return dest + end end -Base.copyto!(dest::StridedCuVector{T}, doffs::Integer, src::StridedVector{T}, soffs::Integer, n::Integer) where {T} = - copyto_views!(dest,doffs,src,soffs,n,Mem.Device,Mem.Host) - -Base.copyto!(dest::StridedVector{T}, doffs::Integer, src::StridedCuVector{T}, soffs::Integer, n::Integer) where {T} = - copyto_views!(dest,doffs,src,soffs,n,Mem.Host,Mem.Device) - -Base.copyto!(dest::StridedCuVector{T}, doffs::Integer, src::StridedCuVector{T}, soffs::Integer, n::Integer) where {T} = - copyto_views!(dest,doffs,src,soffs,n,Mem.Device,Mem.Device) - -Base.copyto!(dest::StridedCuArray{T}, src::StridedArray{T}) where {T} = - copyto!(dest, 1, src, 1, length(src)) - -Base.copyto!(dest::StridedArray{T}, src::StridedCuArray{T}) where {T} = -copyto!(dest, 1, src, 1, length(src)) - -Base.copyto!(dest::StridedCuArray{T}, src::StridedCuArray{T}) where {T} = -copyto!(dest, 1, src, 1, length(src)) - # general case: use CUDA APIs From f2f4d1773d657e228eeb2e4e59a542ba78e1fc8c Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Fri, 24 Feb 2023 09:47:29 -0500 Subject: [PATCH 04/18] Resolving syntax issue and typo --- src/array.jl | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/array.jl b/src/array.jl index 84676f0877..9d9f86c558 100644 --- a/src/array.jl +++ b/src/array.jl @@ -434,9 +434,9 @@ copyto!(dest, 1, src, 1, length(src)) #TO DO: expand this for StridedMatrices of different shapes, currently the src needs to fit in the destination #TO DO: add parameters doffs, soffs, n -for (destType,srcType, destLocation, SrcLocation) in ((StridedSubCuArray, SubArray, Mem.Device,Mem.Host) , - (SubArray, StridedSubCuArray, Mem.Host,Mem.Device), - (StridedSubCuArray, StridedSubCuArray, Mem.Device,Mem.Device) ) +for (destType,srcType) in ((StridedSubCuArray, SubArray) , + (SubArray, StridedSubCuArray), + (StridedSubCuArray, StridedSubCuArray) ) @eval begin function Base.copyto!(dest::$destType{T,2,args1},src::$srcType{T,2,args2}) where {T,args1,args2} src_step_x=step(src.indices[1]) @@ -445,6 +445,8 @@ for (destType,srcType, destLocation, SrcLocation) in ((StridedSubCuArray, SubArr dest_step_height=step(dest.indices[2]) src_parent_size=size(parent(src)) dest_parent_size=size(parent(dest)) + destLocation= (dest isa StridedSubCuArray) ? Mem.Device : Mem.Host + srcLocation= (src isa StridedSubCuArray) ? Mem.Device : Mem.Host @boundscheck checkbounds(view(dest,1,:), 1:length(src.indices[2])) @boundscheck checkbounds(view(dest,:,1), 1: length(src.indices[1])) @@ -459,16 +461,16 @@ for (destType,srcType, destLocation, SrcLocation) in ((StridedSubCuArray, SubArr #In other cases, use parallel threads else CUDA.synchronize() - #@sync + @sync for col in 1:length(src.indices[2]) - #Threads.@spawn begin + Threads.@spawn begin Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)), srcLocation, 1, 1, size(src,1); srcPos=(1,1,1), dstPos=(1,1,1), srcPitch=sizeof(T)*src_step_x,srcHeight=1, dstPitch=sizeof(T)*dest_step_x, dstHeight=1) CUDA.synchronize() - #end + end end end return dest @@ -481,6 +483,8 @@ for (destType,srcType, destLocation, SrcLocation) in ((StridedSubCuArray, SubArr @boundscheck checkbounds(dest, doffs+n-1) @boundscheck checkbounds(src, soffs) @boundscheck checkbounds(src, soffs+n-1) + destLocation= (dest isa StridedSubCuArray) ? Mem.Device : Mem.Host + srcLocation= (src isa StridedSubCuArray) ? Mem.Device : Mem.Host Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation, 1, n, 1; From 1fa812e5d455b817f4b413ac98b5292805e4e15f Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Fri, 24 Feb 2023 10:11:46 -0500 Subject: [PATCH 05/18] Typo --- src/array.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/array.jl b/src/array.jl index 9d9f86c558..8bc0120e07 100644 --- a/src/array.jl +++ b/src/array.jl @@ -461,8 +461,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , #In other cases, use parallel threads else CUDA.synchronize() - @sync - for col in 1:length(src.indices[2]) + @sync for col in 1:length(src.indices[2]) Threads.@spawn begin Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)), srcLocation, 1, 1, size(src,1); From e73bdfd3a87ffb6c4e59215a1670091d14eeaa3a Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Fri, 24 Feb 2023 10:50:53 -0500 Subject: [PATCH 06/18] Testing build without sync --- src/array.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/array.jl b/src/array.jl index 8bc0120e07..38f5e01008 100644 --- a/src/array.jl +++ b/src/array.jl @@ -461,15 +461,16 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , #In other cases, use parallel threads else CUDA.synchronize() - @sync for col in 1:length(src.indices[2]) - Threads.@spawn begin + #@sync + for col in 1:length(src.indices[2]) + #Threads.@spawn begin Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)), srcLocation, 1, 1, size(src,1); srcPos=(1,1,1), dstPos=(1,1,1), srcPitch=sizeof(T)*src_step_x,srcHeight=1, dstPitch=sizeof(T)*dest_step_x, dstHeight=1) CUDA.synchronize() - end + #end end end return dest From 52f8e29e28ae7034c771296941f6ddedc35dbe5b Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Mon, 27 Feb 2023 21:15:09 -0500 Subject: [PATCH 07/18] Fixing function name to match copyto Base function --- src/array.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array.jl b/src/array.jl index 38f5e01008..c932bac6f8 100644 --- a/src/array.jl +++ b/src/array.jl @@ -476,7 +476,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , return dest end - function copyto_views!(dest::$destType{T,1,args1},doffs::Integer,src::$srcType{T,1,args2}, soffs::Integer, + function Base.copyto!(dest::$destType{T,1,args1},doffs::Integer,src::$srcType{T,1,args2}, soffs::Integer, n::Integer) where {T,args1,args2} n==0 && return dest @boundscheck checkbounds(dest, doffs) From c9030f4c255154bdd177faec3ff0f453224c25ad Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Mon, 27 Feb 2023 23:34:25 -0500 Subject: [PATCH 08/18] Adding support for mixed views non full arrays copyto --- src/array.jl | 59 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/src/array.jl b/src/array.jl index c932bac6f8..b892c538b0 100644 --- a/src/array.jl +++ b/src/array.jl @@ -434,22 +434,38 @@ copyto!(dest, 1, src, 1, length(src)) #TO DO: expand this for StridedMatrices of different shapes, currently the src needs to fit in the destination #TO DO: add parameters doffs, soffs, n -for (destType,srcType) in ((StridedSubCuArray, SubArray) , - (SubArray, StridedSubCuArray), - (StridedSubCuArray, StridedSubCuArray) ) +for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSubCuArray), + (StridedSubCuArray, StridedSubCuArray), + (StridedSubCuArray, Array) , (Array, StridedSubCuArray), + (CuArray, StridedSubCuArray) , ( StridedSubCuArray, CuArray), + (CuArray, SubArray) , (SubArray, CuArray) + ) @eval begin - function Base.copyto!(dest::$destType{T,2,args1},src::$srcType{T,2,args2}) where {T,args1,args2} - src_step_x=step(src.indices[1]) - dest_step_x=step(dest.indices[1]) - src_step_height=step(src.indices[2]) - dest_step_height=step(dest.indices[2]) - src_parent_size=size(parent(src)) - dest_parent_size=size(parent(dest)) - destLocation= (dest isa StridedSubCuArray) ? Mem.Device : Mem.Host - srcLocation= (src isa StridedSubCuArray) ? Mem.Device : Mem.Host - - @boundscheck checkbounds(view(dest,1,:), 1:length(src.indices[2])) - @boundscheck checkbounds(view(dest,:,1), 1: length(src.indices[1])) + function Base.copyto!(dest::$destType{T,2},src::$srcType{T,2}) where {T} + if (dest isa StridedSubCuArray) || (dest isa SubArray) + dest_step_x=step(dest.indices[1]) + dest_step_height=step(dest.indices[2]) + dest_parent_size=size(parent(dest)) + else + dest_step_x=1 + dest_step_height=1 + dest_parent_size=size(dest) + end + if (src isa StridedSubCuArray) || (src isa SubArray) + src_step_x=step(src.indices[1]) + src_step_height=step(src.indices[2]) + src_parent_size=size(parent(src)) + else + src_step_x=1 + src_step_height=1 + src_parent_size=size(src) + + end + destLocation= ((dest isa StridedSubCuArray) || (dest isa CuArray)) ? Mem.Device : Mem.Host + srcLocation= ((src isa StridedSubCuArray) || (src isa CuArray)) ? Mem.Device : Mem.Host + @boundscheck checkbounds(view(dest,1,:), 1:size(src,2)) + @boundscheck checkbounds(view(dest,:,1), 1:size(src,1)) + #Non-contigous views can be accomodated by copy3d in certain cases if isinteger(src_parent_size[1]*src_step_height/src_step_x) && isinteger(dest_parent_size[1]*dest_step_height/dest_step_x) @@ -476,18 +492,20 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , return dest end - function Base.copyto!(dest::$destType{T,1,args1},doffs::Integer,src::$srcType{T,1,args2}, soffs::Integer, - n::Integer) where {T,args1,args2} + function Base.copyto!(dest::$destType{T,1},doffs::Integer,src::$srcType{T,1}, soffs::Integer, + n::Integer) where {T} n==0 && return dest @boundscheck checkbounds(dest, doffs) @boundscheck checkbounds(dest, doffs+n-1) @boundscheck checkbounds(src, soffs) @boundscheck checkbounds(src, soffs+n-1) - destLocation= (dest isa StridedSubCuArray) ? Mem.Device : Mem.Host - srcLocation= (src isa StridedSubCuArray) ? Mem.Device : Mem.Host + src_step= ((src isa StridedSubCuArray) || (src isa SubArray)) ? step(src.indices[1]) : 1 + dest_step= ((dest isa StridedSubCuArray) || (dest isa SubArray) ) ? step(dest.indices[1]) : 1 + destLocation= ((dest isa StridedSubCuArray) || (dest isa CuArray)) ? Mem.Device : Mem.Host + srcLocation= ((src isa StridedSubCuArray) || (src isa CuArray)) ? Mem.Device : Mem.Host Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation, - 1, n, 1; + 1, 1, n; srcPos=(1,soffs,1), dstPos=(1,doffs,1), srcPitch=src_step*sizeof(T),srcHeight=1, dstPitch=dest_step*sizeof(T), dstHeight=1) @@ -502,7 +520,6 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , end end - # general case: use CUDA APIs # NOTE: we only switch contexts here to avoid illegal memory accesses. synchronization is From 1376be68c35925608601b9296a4a97c73771fb88 Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Mon, 27 Feb 2023 23:34:46 -0500 Subject: [PATCH 09/18] Typos in tests --- test/array.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/array.jl b/test/array.jl index 0438911258..95ebb78fa5 100644 --- a/test/array.jl +++ b/test/array.jl @@ -386,28 +386,28 @@ end gpu_view= view(gpu_matrix, 2:3:11, 3:2:11) gpu_view2= view(gpu_matrix2,1:5:16, 4:4:20) copyto!(gpu_view,gpu_view2) - @test collect(gpu_view) == cpu_view + @test collect(gpu_view) == gpu_view2 gpu_matrix = CUDA.rand(elty, m,n) gpu_matrix2 = CUDA.rand(elty,l,k) gpu_view= view(gpu_matrix,:, :) gpu_view2= view(gpu_matrix2,1:m, 1:n) copyto!(gpu_view,gpu_view2) - @test collect(gpu_view) == cpu_view + @test collect(gpu_view) == gpu_view2 gpu_vec = CUDA.rand(elty, m) gpu_vec2 = CUDA.rand(elty,l) gpu_view= view(gpu_vec, 2:3:11) gpu_view2= view(gpu_vec2,1:5:16) copyto!(gpu_view,gpu_view2) - @test collect(gpu_view) == cpu_view + @test collect(gpu_view) == gpu_view2 gpu_vec = CUDA.rand(elty, m) gpu_vec2 = CUDA.rand(elty,l) gpu_view= view(gpu_vec, :) gpu_view2= view(gpu_vec2,1:m) copyto!(gpu_view,gpu_view2) - @test collect(gpu_view) == cpu_view + @test collect(gpu_view) == gpu_view2 end end From 84d9b4ba22dfb89ca93af1271190c4beb8eaafca Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Tue, 28 Feb 2023 02:41:17 -0500 Subject: [PATCH 10/18] Adding support for 1D and 2D views of multi-dimensional arrays --- src/array.jl | 70 +++++++++++++++++++++++++++++++++++---------------- test/array.jl | 61 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 98 insertions(+), 33 deletions(-) diff --git a/src/array.jl b/src/array.jl index b892c538b0..47c5e8a927 100644 --- a/src/array.jl +++ b/src/array.jl @@ -442,49 +442,60 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub ) @eval begin function Base.copyto!(dest::$destType{T,2},src::$srcType{T,2}) where {T} - if (dest isa StridedSubCuArray) || (dest isa SubArray) - dest_step_x=step(dest.indices[1]) - dest_step_height=step(dest.indices[2]) + if (dest isa StridedSubCuArray) || (dest isa SubArray) + dest_index1=findfirst(length.(dest.indices).>1) + dest_index2=findnext(length.(dest.indices).>1, dest_index1+1) + dest_step_x=step(dest.indices[dest_index1]) + dest_step_height=step(dest.indices[dest_index2]) dest_parent_size=size(parent(dest)) + dest_pitch1= (dest_index1==1) ? 1 : prod(dest_parent_size[1:(dest_index1-1)]) + dest_pitch2= prod(dest_parent_size[dest_index1:(dest_index2-1)]) else + dest_index1=1 + dest_index2=2 dest_step_x=1 dest_step_height=1 dest_parent_size=size(dest) end if (src isa StridedSubCuArray) || (src isa SubArray) - src_step_x=step(src.indices[1]) - src_step_height=step(src.indices[2]) + src_index1=findfirst(length.(src.indices).>1) + src_index2=findnext(length.(src.indices).>1, src_index1+1) + src_step_x=step(src.indices[src_index1]) + src_step_height=step(src.indices[src_index2]) src_parent_size=size(parent(src)) + src_pitch1= (src_index1==1) ? 1 : prod(src_parent_size[1:(src_index1-1)]) + src_pitch2= prod(src_parent_size[src_index1:(src_index2-1)]) else + src_index1=1 + src_index2=2 src_step_x=1 src_step_height=1 src_parent_size=size(src) - end destLocation= ((dest isa StridedSubCuArray) || (dest isa CuArray)) ? Mem.Device : Mem.Host srcLocation= ((src isa StridedSubCuArray) || (src isa CuArray)) ? Mem.Device : Mem.Host - @boundscheck checkbounds(view(dest,1,:), 1:size(src,2)) - @boundscheck checkbounds(view(dest,:,1), 1:size(src,1)) + @boundscheck checkbounds(1:size(dest, dest_index1), 1:size(src,src_index1)) + @boundscheck checkbounds(1:size(dest, dest_index2), 1:size(src,src_index2)) #Non-contigous views can be accomodated by copy3d in certain cases - if isinteger(src_parent_size[1]*src_step_height/src_step_x) && isinteger(dest_parent_size[1]*dest_step_height/dest_step_x) + if isinteger(src_pitch2*src_step_height/src_step_x/src_pitch1) && isinteger(dest_pitch2*dest_step_height/dest_step_x/dest_pitch1) Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation, - 1, size(src,1), size(src,2); + 1, size(src,src_index1), size(src,src_index2); srcPos=(1,1,1), dstPos=(1,1,1), - srcPitch=src_step_x*sizeof(T),srcHeight=Int(src_parent_size[1]*src_step_height/src_step_x), - dstPitch=dest_step_x*sizeof(T), dstHeight=Int(dest_parent_size[1]*dest_step_height/dest_step_x)) + srcPitch=src_step_x*sizeof(T)*src_pitch1,srcHeight=Int(src_pitch2*src_step_height/src_step_x/src_pitch1), + dstPitch=dest_step_x*sizeof(T)*dest_pitch1, dstHeight=Int(dest_pitch2*dest_step_height/dest_step_x/dest_pitch1)) #In other cases, use parallel threads else CUDA.synchronize() #@sync - for col in 1:length(src.indices[2]) + for col in 1:length(src.indices[src_index2]) #Threads.@spawn begin Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)), srcLocation, - 1, 1, size(src,1); + 1, 1, size(src,src_index1); srcPos=(1,1,1), dstPos=(1,1,1), - srcPitch=sizeof(T)*src_step_x,srcHeight=1, - dstPitch=sizeof(T)*dest_step_x, dstHeight=1) + srcPitch=sizeof(T)*src_step_x*src_pitch1,srcHeight=1, + dstPitch=sizeof(T)*dest_step_x*dest_pitch1, dstHeight=1) CUDA.synchronize() #end end @@ -499,16 +510,33 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub @boundscheck checkbounds(dest, doffs+n-1) @boundscheck checkbounds(src, soffs) @boundscheck checkbounds(src, soffs+n-1) - src_step= ((src isa StridedSubCuArray) || (src isa SubArray)) ? step(src.indices[1]) : 1 - dest_step= ((dest isa StridedSubCuArray) || (dest isa SubArray) ) ? step(dest.indices[1]) : 1 + if (dest isa StridedSubCuArray) || (dest isa SubArray) + dest_index=findfirst(length.(dest.indices).>1) + dest_step=step(dest.indices[dest_index]) + dest_pitch=(dest_index==1) ? 1 : prod(size(parent(dest))[1:(dest_index-1)]) + else + dest_index=1 + dest_step=1 + dest_pitch=1 + end + + if (src isa StridedSubCuArray) || (src isa SubArray) + src_index=findfirst(length.(src.indices).>1) + src_step=step(src.indices[src_index]) + src_pitch= (src_index==1) ? 1 : prod(size(parent(src))[1:(src_index-1)]) + else + src_index=1 + src_step=1 + src_pitch=1 + end destLocation= ((dest isa StridedSubCuArray) || (dest isa CuArray)) ? Mem.Device : Mem.Host srcLocation= ((src isa StridedSubCuArray) || (src isa CuArray)) ? Mem.Device : Mem.Host Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation, 1, 1, n; - srcPos=(1,soffs,1), dstPos=(1,doffs,1), - srcPitch=src_step*sizeof(T),srcHeight=1, - dstPitch=dest_step*sizeof(T), dstHeight=1) + srcPos=(1,1,soffs), dstPos=(1,1,doffs), + srcPitch=src_step*sizeof(T)*src_pitch,srcHeight=1, + dstPitch=dest_step*sizeof(T)*dest_pitch, dstHeight=1) return dest end diff --git a/test/array.jl b/test/array.jl index 95ebb78fa5..c059e52b5b 100644 --- a/test/array.jl +++ b/test/array.jl @@ -321,12 +321,16 @@ end m=11 k=23 l=19 + range1=2:3:11 + range2=3:2:11 + range3=1:5:16 + range4=4:4:20 #From GPU to CPU gpu_matrix = CUDA.rand(elty, m,n) cpu_matrix = rand(elty,l,k) - gpu_view= view(gpu_matrix, 2:3:11, 3:2:11) - cpu_view= view(cpu_matrix,1:5:16, 4:4:20) + gpu_view= view(gpu_matrix,range1 , range2) + cpu_view= view(cpu_matrix, range3, range4) copyto!(cpu_view,gpu_view) @test collect(gpu_view) == cpu_view @@ -339,8 +343,8 @@ end gpu_vec = CUDA.rand(elty, m) cpu_vec = rand(elty,l) - gpu_view= view(gpu_vec, 2:3:11) - cpu_view= view(cpu_vec,1:5:16) + gpu_view= view(gpu_vec, range1) + cpu_view= view(cpu_vec,range3) copyto!(cpu_view,gpu_view) @test collect(gpu_view) == cpu_view @@ -354,8 +358,8 @@ end #From CPU to GPU gpu_matrix = CUDA.rand(elty, m,n) cpu_matrix = rand(elty,l,k) - gpu_view= view(gpu_matrix, 2:3:11, 3:2:11) - cpu_view= view(cpu_matrix,1:5:16, 4:4:20) + gpu_view= view(gpu_matrix,range1 , range2) + cpu_view= view(cpu_matrix, range3, range4) copyto!(gpu_view,cpu_view) @test collect(gpu_view) == cpu_view @@ -368,8 +372,8 @@ end gpu_vec = CUDA.rand(elty, m) cpu_vec = rand(elty,l) - gpu_view= view(gpu_vec, 2:3:11) - cpu_view= view(cpu_vec,1:5:16) + gpu_view= view(gpu_vec, range1) + cpu_view= view(cpu_vec,range3) copyto!(gpu_view,cpu_view) @test collect(gpu_view) == cpu_view @@ -383,8 +387,8 @@ end #From GPU to GPU gpu_matrix = CUDA.rand(elty, m,n) gpu_matrix2 = CUDA.rand(elty,l,k) - gpu_view= view(gpu_matrix, 2:3:11, 3:2:11) - gpu_view2= view(gpu_matrix2,1:5:16, 4:4:20) + gpu_view= view(gpu_matrix,range1 , range2) + gpu_view2= view(gpu_matrix2,range3, range4) copyto!(gpu_view,gpu_view2) @test collect(gpu_view) == gpu_view2 @@ -397,8 +401,8 @@ end gpu_vec = CUDA.rand(elty, m) gpu_vec2 = CUDA.rand(elty,l) - gpu_view= view(gpu_vec, 2:3:11) - gpu_view2= view(gpu_vec2,1:5:16) + gpu_view= view(gpu_vec, range1) + gpu_view2= view(gpu_vec2, range3) copyto!(gpu_view,gpu_view2) @test collect(gpu_view) == gpu_view2 @@ -409,6 +413,39 @@ end copyto!(gpu_view,gpu_view2) @test collect(gpu_view) == gpu_view2 + #testing higher dimensional views + + for gpu_indices in ( (range1, range2, 3, 7) , (range1, 3, range2, 7), + (range1, 3, 7, range2), (3, range1, range2, 7), + (3, range1, 7, range2), (3,7, range1, range2) ) + for cpu_indices in ( (range3, range4, 11, 5) , (range3, 11, range4, 5), + (range3, 11, 5, range4), (11, range3, range4, 5), + (11, range3, 5, range4), (11,5, range3, range4) ) + gpu_matrix = CUDA.rand(elty, m*3,n*3, k*3,l*3) + cpu_matrix = rand(elty,m*2,n*2, k*2, l*2) + gpu_view= view(gpu_matrix, gpu_indices...) + cpu_view= view(cpu_matrix, cpu_indices...) + copyto!(gpu_view,cpu_view) + @test collect(gpu_view) == cpu_view + + end + end + + for gpu_indices in ( (range1, 13, 3, 7) , (3, range1, 7, 13), + (3,7, range1, 13), (3,7, 13, range1)) + for cpu_indices in ( (range3, 11, 2, 5) , (3, range3, 2, 11), + (2,5, range3, 11), (2,5, 11, range3)) + gpu_matrix = CUDA.rand(elty, m*3,n*3, k*3,l*3) + cpu_matrix = rand(elty,m*2,n*2, k*2, l*2) + gpu_view= view(gpu_matrix, gpu_indices...) + cpu_view= view(cpu_matrix, cpu_indices...) + copyto!(gpu_view,cpu_view) + @test collect(gpu_view) == cpu_view + + end + end + + end end From 85b3db3479c736d91f0b50395ec5498378ff8597 Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Tue, 28 Feb 2023 11:14:26 -0500 Subject: [PATCH 11/18] typos --- src/array.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/array.jl b/src/array.jl index 47c5e8a927..246c246150 100644 --- a/src/array.jl +++ b/src/array.jl @@ -448,8 +448,6 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub dest_step_x=step(dest.indices[dest_index1]) dest_step_height=step(dest.indices[dest_index2]) dest_parent_size=size(parent(dest)) - dest_pitch1= (dest_index1==1) ? 1 : prod(dest_parent_size[1:(dest_index1-1)]) - dest_pitch2= prod(dest_parent_size[dest_index1:(dest_index2-1)]) else dest_index1=1 dest_index2=2 @@ -463,8 +461,6 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub src_step_x=step(src.indices[src_index1]) src_step_height=step(src.indices[src_index2]) src_parent_size=size(parent(src)) - src_pitch1= (src_index1==1) ? 1 : prod(src_parent_size[1:(src_index1-1)]) - src_pitch2= prod(src_parent_size[src_index1:(src_index2-1)]) else src_index1=1 src_index2=2 @@ -472,6 +468,10 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub src_step_height=1 src_parent_size=size(src) end + dest_pitch1= (dest_index1==1) ? 1 : prod(dest_parent_size[1:(dest_index1-1)]) + dest_pitch2= prod(dest_parent_size[dest_index1:(dest_index2-1)]) + src_pitch1= (src_index1==1) ? 1 : prod(src_parent_size[1:(src_index1-1)]) + src_pitch2= prod(src_parent_size[src_index1:(src_index2-1)]) destLocation= ((dest isa StridedSubCuArray) || (dest isa CuArray)) ? Mem.Device : Mem.Host srcLocation= ((src isa StridedSubCuArray) || (src isa CuArray)) ? Mem.Device : Mem.Host @boundscheck checkbounds(1:size(dest, dest_index1), 1:size(src,src_index1)) From 06508213b6338d868b49e60c7242f42e062d0a81 Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Wed, 1 Mar 2023 01:04:06 -0500 Subject: [PATCH 12/18] Changing copyto!(B,A), A>B, from 2Dcopy to the vectorcopy Base behavior --- src/array.jl | 58 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/src/array.jl b/src/array.jl index 246c246150..a314953dc4 100644 --- a/src/array.jl +++ b/src/array.jl @@ -441,7 +441,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub (CuArray, SubArray) , (SubArray, CuArray) ) @eval begin - function Base.copyto!(dest::$destType{T,2},src::$srcType{T,2}) where {T} + function Base.copyto!(dest::$destType{T,2},src::$srcType{T,2}, Copy2D::Bool=false) where {T} if (dest isa StridedSubCuArray) || (dest isa SubArray) dest_index1=findfirst(length.(dest.indices).>1) dest_index2=findnext(length.(dest.indices).>1, dest_index1+1) @@ -468,38 +468,68 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub src_step_height=1 src_parent_size=size(src) end + dest_pitch1= (dest_index1==1) ? 1 : prod(dest_parent_size[1:(dest_index1-1)]) dest_pitch2= prod(dest_parent_size[dest_index1:(dest_index2-1)]) src_pitch1= (src_index1==1) ? 1 : prod(src_parent_size[1:(src_index1-1)]) src_pitch2= prod(src_parent_size[src_index1:(src_index2-1)]) destLocation= ((dest isa StridedSubCuArray) || (dest isa CuArray)) ? Mem.Device : Mem.Host srcLocation= ((src isa StridedSubCuArray) || (src isa CuArray)) ? Mem.Device : Mem.Host - @boundscheck checkbounds(1:size(dest, dest_index1), 1:size(src,src_index1)) - @boundscheck checkbounds(1:size(dest, dest_index2), 1:size(src,src_index2)) + @boundscheck checkbounds(1:size(dest, 1), 1:size(src,1)) + @boundscheck checkbounds(1:size(dest, 2), 1:size(src,2)) - + if (size(dest,1)==size(src,1) || (Copy2D)) #Non-contigous views can be accomodated by copy3d in certain cases - if isinteger(src_pitch2*src_step_height/src_step_x/src_pitch1) && isinteger(dest_pitch2*dest_step_height/dest_step_x/dest_pitch1) - Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation, - 1, size(src,src_index1), size(src,src_index2); + if isinteger(src_pitch2*src_step_height/src_step_x/src_pitch1) && isinteger(dest_pitch2*dest_step_height/dest_step_x/dest_pitch1) + Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation, + 1, size(src,1), size(src,2); + srcPos=(1,1,1), dstPos=(1,1,1), + srcPitch=src_step_x*sizeof(T)*src_pitch1,srcHeight=Int(src_pitch2*src_step_height/src_step_x/src_pitch1), + dstPitch=dest_step_x*sizeof(T)*dest_pitch1, dstHeight=Int(dest_pitch2*dest_step_height/dest_step_x/dest_pitch1)) + #In other cases, use parallel threads + else + CUDA.synchronize() + #@sync + for col in 1:length(src.indices[src_index2]) + #Threads.@spawn begin + Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)), srcLocation, + 1, 1, size(src,1); srcPos=(1,1,1), dstPos=(1,1,1), - srcPitch=src_step_x*sizeof(T)*src_pitch1,srcHeight=Int(src_pitch2*src_step_height/src_step_x/src_pitch1), - dstPitch=dest_step_x*sizeof(T)*dest_pitch1, dstHeight=Int(dest_pitch2*dest_step_height/dest_step_x/dest_pitch1)) - #In other cases, use parallel threads - else + srcPitch=sizeof(T)*src_step_x*src_pitch1,srcHeight=1, + dstPitch=sizeof(T)*dest_step_x*dest_pitch1, dstHeight=1) + CUDA.synchronize() + #end + end + end + else #Ensure same behavior as Base copying from smaller to bigger matrix if copy2D is false + start_indices=(1:size(src,1):size(src,1)*(size(src,2)+1)) + dest_col=div.(start_indices.-1,size(dest,1)).+1 + start_indices=mod.(start_indices,size(dest,1)) + replace!(start_indices,0=>size(dest,1)) + split_col=start_indices[1:end-1].>start_indices[2:end] + CUDA.synchronize() #@sync for col in 1:length(src.indices[src_index2]) #Threads.@spawn begin - Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)), srcLocation, - 1, 1, size(src,src_index1); - srcPos=(1,1,1), dstPos=(1,1,1), + n= split_col[col] ? (size(dest,1)-start_indices[col]+1) : size(src,1) + Mem.unsafe_copy3d!(pointer(view(dest,:,dest_col[col])),destLocation, pointer(view(src,:,col)), srcLocation, + 1, 1, n; + srcPos=(1,1,1), dstPos=(1,1,start_indices[col]), srcPitch=sizeof(T)*src_step_x*src_pitch1,srcHeight=1, dstPitch=sizeof(T)*dest_step_x*dest_pitch1, dstHeight=1) + if split_col[col] + Mem.unsafe_copy3d!(pointer(view(dest,:,dest_col[col]+1)),destLocation, pointer(view(src,:,col)), srcLocation, + 1, 1, size(src,1)-n; + srcPos=(1,1,n+1), dstPos=(1,1,1), + srcPitch=sizeof(T)*src_step_x*src_pitch1,srcHeight=1, + dstPitch=sizeof(T)*dest_step_x*dest_pitch1, dstHeight=1) + end CUDA.synchronize() #end end end + return dest end From fdcd875e6404f7d2937bf24af80de4f81aea184a Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Wed, 1 Mar 2023 02:01:49 -0500 Subject: [PATCH 13/18] Fixing scalar indexing in test comparisons --- test/array.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/array.jl b/test/array.jl index c059e52b5b..0c1529e489 100644 --- a/test/array.jl +++ b/test/array.jl @@ -390,28 +390,28 @@ end gpu_view= view(gpu_matrix,range1 , range2) gpu_view2= view(gpu_matrix2,range3, range4) copyto!(gpu_view,gpu_view2) - @test collect(gpu_view) == gpu_view2 + @test collect(gpu_view) == collect(gpu_view2) gpu_matrix = CUDA.rand(elty, m,n) gpu_matrix2 = CUDA.rand(elty,l,k) gpu_view= view(gpu_matrix,:, :) gpu_view2= view(gpu_matrix2,1:m, 1:n) copyto!(gpu_view,gpu_view2) - @test collect(gpu_view) == gpu_view2 + @test collect(gpu_view) == collect(gpu_view2) gpu_vec = CUDA.rand(elty, m) gpu_vec2 = CUDA.rand(elty,l) gpu_view= view(gpu_vec, range1) gpu_view2= view(gpu_vec2, range3) copyto!(gpu_view,gpu_view2) - @test collect(gpu_view) == gpu_view2 + @test collect(gpu_view) == collect(gpu_view2) gpu_vec = CUDA.rand(elty, m) gpu_vec2 = CUDA.rand(elty,l) gpu_view= view(gpu_vec, :) gpu_view2= view(gpu_vec2,1:m) copyto!(gpu_view,gpu_view2) - @test collect(gpu_view) == gpu_view2 + @test collect(gpu_view) == collect(gpu_view2) #testing higher dimensional views From 024673f63ac9b076ca3f885cc2631335818d96db Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Wed, 1 Mar 2023 10:56:30 -0500 Subject: [PATCH 14/18] Adding support for views of length 1 --- src/array.jl | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/array.jl b/src/array.jl index a314953dc4..df80937800 100644 --- a/src/array.jl +++ b/src/array.jl @@ -438,13 +438,12 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub (StridedSubCuArray, StridedSubCuArray), (StridedSubCuArray, Array) , (Array, StridedSubCuArray), (CuArray, StridedSubCuArray) , ( StridedSubCuArray, CuArray), - (CuArray, SubArray) , (SubArray, CuArray) - ) + (CuArray, SubArray) , (SubArray, CuArray) ) @eval begin function Base.copyto!(dest::$destType{T,2},src::$srcType{T,2}, Copy2D::Bool=false) where {T} if (dest isa StridedSubCuArray) || (dest isa SubArray) - dest_index1=findfirst(length.(dest.indices).>1) - dest_index2=findnext(length.(dest.indices).>1, dest_index1+1) + dest_index1=findfirst((typeof.(dest.indices) .<: Int).==0) + dest_index2=findnext((typeof.(dest.indices) .<: Int).==0, dest_index1+1) dest_step_x=step(dest.indices[dest_index1]) dest_step_height=step(dest.indices[dest_index2]) dest_parent_size=size(parent(dest)) @@ -456,8 +455,8 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub dest_parent_size=size(dest) end if (src isa StridedSubCuArray) || (src isa SubArray) - src_index1=findfirst(length.(src.indices).>1) - src_index2=findnext(length.(src.indices).>1, src_index1+1) + src_index1=findfirst((typeof.(src.indices) .<: Int).==0) + src_index2=findnext((typeof.(src.indices) .<: Int).==0, src_index1+1) src_step_x=step(src.indices[src_index1]) src_step_height=step(src.indices[src_index2]) src_parent_size=size(parent(src)) @@ -541,7 +540,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub @boundscheck checkbounds(src, soffs) @boundscheck checkbounds(src, soffs+n-1) if (dest isa StridedSubCuArray) || (dest isa SubArray) - dest_index=findfirst(length.(dest.indices).>1) + dest_index=findfirst((typeof.(dest.indices) .<: Int).==0) dest_step=step(dest.indices[dest_index]) dest_pitch=(dest_index==1) ? 1 : prod(size(parent(dest))[1:(dest_index-1)]) else @@ -551,7 +550,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub end if (src isa StridedSubCuArray) || (src isa SubArray) - src_index=findfirst(length.(src.indices).>1) + src_index=findfirst((typeof.(dest.indices) .<: Int).==0) src_step=step(src.indices[src_index]) src_pitch= (src_index==1) ? 1 : prod(size(parent(src))[1:(src_index-1)]) else From 7029f2ef1335a06cfa70ee952a2474c1dfa24460 Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Wed, 1 Mar 2023 10:57:32 -0500 Subject: [PATCH 15/18] typo --- src/array.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array.jl b/src/array.jl index df80937800..f1e8764c05 100644 --- a/src/array.jl +++ b/src/array.jl @@ -550,7 +550,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub end if (src isa StridedSubCuArray) || (src isa SubArray) - src_index=findfirst((typeof.(dest.indices) .<: Int).==0) + src_index=findfirst((typeof.(src.indices) .<: Int).==0) src_step=step(src.indices[src_index]) src_pitch= (src_index==1) ? 1 : prod(size(parent(src))[1:(src_index-1)]) else From 4022408c34f6a600cc5a54c88072e46e6802c19d Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Wed, 1 Mar 2023 13:43:41 -0500 Subject: [PATCH 16/18] Adding parallelization of copyto --- src/array.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/array.jl b/src/array.jl index f1e8764c05..a1c81ced8d 100644 --- a/src/array.jl +++ b/src/array.jl @@ -488,16 +488,16 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub #In other cases, use parallel threads else CUDA.synchronize() - #@sync + @sync for col in 1:length(src.indices[src_index2]) - #Threads.@spawn begin + Threads.@spawn begin Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)), srcLocation, 1, 1, size(src,1); srcPos=(1,1,1), dstPos=(1,1,1), srcPitch=sizeof(T)*src_step_x*src_pitch1,srcHeight=1, dstPitch=sizeof(T)*dest_step_x*dest_pitch1, dstHeight=1) CUDA.synchronize() - #end + end end end else #Ensure same behavior as Base copying from smaller to bigger matrix if copy2D is false @@ -508,9 +508,9 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub split_col=start_indices[1:end-1].>start_indices[2:end] CUDA.synchronize() - #@sync + @sync for col in 1:length(src.indices[src_index2]) - #Threads.@spawn begin + Threads.@spawn begin n= split_col[col] ? (size(dest,1)-start_indices[col]+1) : size(src,1) Mem.unsafe_copy3d!(pointer(view(dest,:,dest_col[col])),destLocation, pointer(view(src,:,col)), srcLocation, 1, 1, n; @@ -525,7 +525,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub dstPitch=sizeof(T)*dest_step_x*dest_pitch1, dstHeight=1) end CUDA.synchronize() - #end + end end end From b545545b810dacccd34b44bd0895b5d0386c2afe Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Wed, 1 Mar 2023 13:48:50 -0500 Subject: [PATCH 17/18] Removing spaces --- src/array.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/array.jl b/src/array.jl index a1c81ced8d..993d9f7f94 100644 --- a/src/array.jl +++ b/src/array.jl @@ -488,8 +488,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub #In other cases, use parallel threads else CUDA.synchronize() - @sync - for col in 1:length(src.indices[src_index2]) + @sync for col in 1:length(src.indices[src_index2]) Threads.@spawn begin Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)), srcLocation, 1, 1, size(src,1); @@ -508,8 +507,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub split_col=start_indices[1:end-1].>start_indices[2:end] CUDA.synchronize() - @sync - for col in 1:length(src.indices[src_index2]) + @sync for col in 1:length(src.indices[src_index2]) Threads.@spawn begin n= split_col[col] ? (size(dest,1)-start_indices[col]+1) : size(src,1) Mem.unsafe_copy3d!(pointer(view(dest,:,dest_col[col])),destLocation, pointer(view(src,:,col)), srcLocation, From 64d3eced63d770a1262d8febca0b0b65241d6385 Mon Sep 17 00:00:00 2001 From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com> Date: Wed, 1 Mar 2023 15:06:57 -0500 Subject: [PATCH 18/18] Resolving compilation issues --- src/array.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/array.jl b/src/array.jl index 993d9f7f94..191e581ae3 100644 --- a/src/array.jl +++ b/src/array.jl @@ -488,7 +488,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub #In other cases, use parallel threads else CUDA.synchronize() - @sync for col in 1:length(src.indices[src_index2]) + Base.@sync for col in 1:length(src.indices[src_index2]) Threads.@spawn begin Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)), srcLocation, 1, 1, size(src,1); @@ -507,7 +507,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub split_col=start_indices[1:end-1].>start_indices[2:end] CUDA.synchronize() - @sync for col in 1:length(src.indices[src_index2]) + Base.@sync for col in 1:length(src.indices[src_index2]) Threads.@spawn begin n= split_col[col] ? (size(dest,1)-start_indices[col]+1) : size(src,1) Mem.unsafe_copy3d!(pointer(view(dest,:,dest_col[col])),destLocation, pointer(view(src,:,col)), srcLocation,