From 65aabfde4ff8a435fc6534abe63df067f31ddd7c Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Wed, 22 Feb 2023 01:01:03 -0500
Subject: [PATCH 01/18] Adding copyto for non-contiguous matrices and vectors

---
 src/array.jl  | 97 +++++++++++++++++++++++++++++++++++++++++++++++++--
 test/array.jl | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 192 insertions(+), 2 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index 652792becf..6028f5c163 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -45,7 +45,7 @@ function explain_eltype(@nospecialize(T), depth=0; maxdepth=10)
       msg = "  "^depth * "$T is a mutable type\n"
     elseif hasfieldcount(T)
       msg = "  "^depth * "$T is a struct that's not allocated inline\n"
-      for U in fieldtypes(T)
+      for U in fieldtypes(dt)
           if !Base.allocatedinline(U)
               msg *= explain_nonisbits(U, depth+1)
           end
@@ -300,7 +300,8 @@ end
 
 export DenseCuArray, DenseCuVector, DenseCuMatrix, DenseCuVecOrMat,
        StridedCuArray, StridedCuVector, StridedCuMatrix, StridedCuVecOrMat,
-       AnyCuArray, AnyCuVector, AnyCuMatrix, AnyCuVecOrMat
+       AnyCuArray, AnyCuVector, AnyCuMatrix, AnyCuVecOrMat,
+       StridedGeneralArray, StridedGeneralMatrix, StridedGeneralVector
 
 # dense arrays: stored contiguously in memory
 #
@@ -324,6 +325,11 @@ const StridedCuVector{T} = StridedCuArray{T,1}
 const StridedCuMatrix{T} = StridedCuArray{T,2}
 const StridedCuVecOrMat{T} = Union{StridedCuVector{T}, StridedCuMatrix{T}}
 
+#union of strided CuArrays and Arrays
+const StridedGeneralArray{T,N} = Union{StridedCuArray{T,N}, StridedArray{T,N}}
+const StridedGeneralVector{T} = StridedGeneralArray{T,1}
+const StridedGeneralMatrix{T} = StridedGeneralArray{T,2}
+
 Base.pointer(x::StridedCuArray{T}) where {T} = Base.unsafe_convert(CuPtr{T}, x)
 @inline function Base.pointer(x::StridedCuArray{T}, i::Integer) where T
     Base.unsafe_convert(CuPtr{T}, x) + Base._memory_offset(x, i)
@@ -431,6 +437,93 @@ end
 Base.copyto!(dest::DenseCuArray{T}, src::DenseCuArray{T}) where {T} =
     copyto!(dest, 1, src, 1, length(src))
 
+
+#TO DO: expand this for StridedMatrices of different shapes, currently the src needs to fit in the destination
+#TO DO: add parameters doffs, soffs, n
+function copyto_views!(dest::StridedGeneralMatrix{T},src::StridedGeneralMatrix{T}, 
+                                dest_location, src_location) where T #to do: locations need to be typed as Type{<:AbstractBuffer} from CUDA.jl/lib/cudadrv/memory.jl
+  src_step_x=step(src.indices[1])
+  dest_step_x=step(dest.indices[1])
+  src_step_height=step(src.indices[2])
+  dest_step_height=step(dest.indices[2])
+  src_parent_size=size(parent(src))
+  dest_parent_size=size(parent(dest))
+
+  @boundscheck checkbounds(view(dest,1,:), 1:length(src.indices[2]))
+  @boundscheck checkbounds(view(dest,:,1), 1: length(src.indices[1]))
+
+  #Non-contigous views can be accomodated by copy3d in certain cases
+  if isinteger(src_parent_size[1]*src_step_height/src_step_x) && isinteger(dest_parent_size[1]*dest_step_height/dest_step_x) 
+    Mem.unsafe_copy3d!(pointer(dest), dest_location, pointer(src), src_location,
+                              1, size(src,1), size(src,2);
+                              srcPos=(1,1,1), dstPos=(1,1,1),
+                              srcPitch=src_step_x*sizeof(T),srcHeight=Int(src_parent_size[1]*src_step_height/src_step_x),
+                              dstPitch=dest_step_x*sizeof(T), dstHeight=Int(dest_parent_size[1]*dest_step_height/dest_step_x))
+  #In other cases, use parallel threads
+  else
+    CUDA.synchronize()
+    @sync for col in 1:length(src.indices[2])
+      Threads.@spawn begin
+        println(" Thread index "*string(col))
+        Mem.unsafe_copy3d!(pointer(view(dest,:,col)), dest_location, pointer(view(src,:,col)), src_location,
+                            1, 1, size(src,1);
+                            srcPos=(1,1,1), dstPos=(1,1,1),
+                            srcPitch=sizeof(T)*src_step_x,srcHeight=1,
+                            dstPitch=sizeof(T)*dest_step_x, dstHeight=1)
+        CUDA.synchronize()
+      end
+    end
+  end
+  return dest
+end
+
+Base.copyto!(dest::StridedCuMatrix{T}, src::StridedMatrix{T} ) where {T} = 
+  copyto_views!(dest,src,Mem.Device,Mem.Host)
+
+Base.copyto!(dest::StridedCuMatrix{T}, src::StridedMatrix{T} ) where {T} = 
+  copyto_views!(dest,src,Mem.Host,Mem.Device)
+
+Base.copyto!(dest::StridedCuMatrix{T}, src::StridedCuMatrix{T} ) where {T} = 
+  copyto_views!(dest,src,Mem.Device,Mem.Device)
+
+function copyto_views!(dest::StridedGeneralVector{T},doffs::Integer,src::StridedGeneralVector{T},  soffs::Integer,
+                              n::Integer,dest_location, src_location) where T #to do: locations need to be typed as Type{<:AbstractBuffer} from CUDA.jl/lib/cudadrv/memory.jl
+  n==0 && return dest
+  @boundscheck checkbounds(dest, doffs)
+  @boundscheck checkbounds(dest, doffs+n-1)
+  @boundscheck checkbounds(src, soffs)
+  @boundscheck checkbounds(src, soffs+n-1)
+
+  src_step=step(src.indices)
+  dest_step=step(dest.indices)
+
+  Mem.unsafe_copy3d!(pointer(dest), dest_location, pointer(src), src_location,
+                            1, n, 1;
+                            srcPos=(1,soffs,1), dstPos=(1,doffs,1),
+                            srcPitch=src_step*sizeof(T),srcHeight=1,
+                            dstPitch=dest_step*sizeof(T), dstHeight=1)
+  return dest
+end
+
+Base.copyto!(dest::StridedCuVector{T}, doffs::Integer, src::StridedVector{T}, soffs::Integer, n::Integer) where {T} = 
+  copyto_views!(dest,doffs,src,soffs,n,Mem.Device,Mem.Host)
+
+Base.copyto!(dest::StridedVector{T}, doffs::Integer, src::StridedCuVector{T}, soffs::Integer, n::Integer) where {T} = 
+  copyto_views!(dest,doffs,src,soffs,n,Mem.Host,Mem.Device)
+
+Base.copyto!(dest::StridedCuVector{T}, doffs::Integer, src::StridedCuVector{T}, soffs::Integer, n::Integer) where {T} = 
+  copyto_views!(dest,doffs,src,soffs,n,Mem.Device,Mem.Device)
+
+Base.copyto!(dest::StridedCuArray{T}, src::StridedArray{T}) where {T} =
+  copyto!(dest, 1, src, 1, length(src))
+
+Base.copyto!(dest::StridedArray{T}, src::StridedCuArray{T}) where {T} =
+copyto!(dest, 1, src, 1, length(src))
+
+Base.copyto!(dest::StridedCuArray{T}, src::StridedCuArray{T}) where {T} =
+copyto!(dest, 1, src, 1, length(src))
+
+
 # general case: use CUDA APIs
 
 # NOTE: we only switch contexts here to avoid illegal memory accesses. synchronization is
diff --git a/test/array.jl b/test/array.jl
index 8ef7cf779f..0438911258 100644
--- a/test/array.jl
+++ b/test/array.jl
@@ -315,6 +315,103 @@ end
   @test view(b, :, 1, :) isa StridedCuArray
 end
 
+@testset "elty = $elty" for elty in [Float32, Float64, ComplexF32, ComplexF64]
+  @testset "copyto StridedCuArray" begin
+    n=17
+    m=11
+    k=23
+    l=19
+
+    #From GPU to CPU
+    gpu_matrix = CUDA.rand(elty, m,n)
+    cpu_matrix = rand(elty,l,k)
+    gpu_view= view(gpu_matrix, 2:3:11, 3:2:11)
+    cpu_view= view(cpu_matrix,1:5:16, 4:4:20)
+    copyto!(cpu_view,gpu_view)
+    @test collect(gpu_view) == cpu_view
+
+    gpu_matrix = CUDA.rand(elty, m,n)
+    cpu_matrix = rand(elty,l,k)
+    gpu_view= view(gpu_matrix, :, :)
+    cpu_view= view(cpu_matrix,1:m, 1:n)
+    copyto!(cpu_view,gpu_view)
+    @test collect(gpu_view) == cpu_view
+
+    gpu_vec = CUDA.rand(elty, m)
+    cpu_vec = rand(elty,l)
+    gpu_view= view(gpu_vec, 2:3:11)
+    cpu_view= view(cpu_vec,1:5:16)
+    copyto!(cpu_view,gpu_view)
+    @test collect(gpu_view) == cpu_view
+
+    gpu_vec = CUDA.rand(elty, m)
+    cpu_vec = rand(elty,l)
+    gpu_view= view(gpu_vec, :)
+    cpu_view= view(cpu_vec,1:m)
+    copyto!(cpu_view,gpu_view)
+    @test collect(gpu_view) == cpu_view
+
+    #From CPU to GPU
+    gpu_matrix = CUDA.rand(elty, m,n)
+    cpu_matrix = rand(elty,l,k)
+    gpu_view= view(gpu_matrix, 2:3:11, 3:2:11)
+    cpu_view= view(cpu_matrix,1:5:16, 4:4:20)
+    copyto!(gpu_view,cpu_view)
+    @test collect(gpu_view) == cpu_view
+
+    gpu_matrix = CUDA.rand(elty, m,n)
+    cpu_matrix = rand(elty,l,k)
+    gpu_view= view(gpu_matrix, :, :)
+    cpu_view= view(cpu_matrix,1:m, 1:n)
+    copyto!(gpu_view,cpu_view)
+    @test collect(gpu_view) == cpu_view
+
+    gpu_vec = CUDA.rand(elty, m)
+    cpu_vec = rand(elty,l)
+    gpu_view= view(gpu_vec, 2:3:11)
+    cpu_view= view(cpu_vec,1:5:16)
+    copyto!(gpu_view,cpu_view)
+    @test collect(gpu_view) == cpu_view
+
+    gpu_vec = CUDA.rand(elty, m)
+    cpu_vec = rand(elty,l)
+    gpu_view= view(gpu_vec, :)
+    cpu_view= view(cpu_vec,1:m)
+    copyto!(gpu_view,cpu_view)
+    @test collect(gpu_view) == cpu_view
+
+    #From GPU to GPU
+    gpu_matrix = CUDA.rand(elty, m,n)
+    gpu_matrix2 = CUDA.rand(elty,l,k)
+    gpu_view= view(gpu_matrix, 2:3:11, 3:2:11)
+    gpu_view2= view(gpu_matrix2,1:5:16, 4:4:20)
+    copyto!(gpu_view,gpu_view2)
+    @test collect(gpu_view) == cpu_view
+
+    gpu_matrix = CUDA.rand(elty, m,n)
+    gpu_matrix2 = CUDA.rand(elty,l,k)
+    gpu_view= view(gpu_matrix,:, :)
+    gpu_view2= view(gpu_matrix2,1:m, 1:n)
+    copyto!(gpu_view,gpu_view2)
+    @test collect(gpu_view) == cpu_view
+
+    gpu_vec = CUDA.rand(elty, m)
+    gpu_vec2 = CUDA.rand(elty,l)
+    gpu_view= view(gpu_vec, 2:3:11)
+    gpu_view2= view(gpu_vec2,1:5:16)
+    copyto!(gpu_view,gpu_view2)
+    @test collect(gpu_view) == cpu_view
+
+    gpu_vec = CUDA.rand(elty, m)
+    gpu_vec2 = CUDA.rand(elty,l)
+    gpu_view= view(gpu_vec, :)
+    gpu_view2= view(gpu_vec2,1:m)
+    copyto!(gpu_view,gpu_view2)
+    @test collect(gpu_view) == cpu_view
+
+  end
+end
+
 @testset "accumulate" begin
   for n in (0, 1, 2, 3, 10, 10_000, 16384, 16384+1) # small, large, odd & even, pow2 and not
     @test testf(x->accumulate(+, x), rand(n))

From e1f6c1b06f422c1f685da1ea6fb36bfb1a3c9e58 Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Wed, 22 Feb 2023 01:36:04 -0500
Subject: [PATCH 02/18] Removing typo

---
 src/array.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/array.jl b/src/array.jl
index 6028f5c163..348892445d 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -45,7 +45,7 @@ function explain_eltype(@nospecialize(T), depth=0; maxdepth=10)
       msg = "  "^depth * "$T is a mutable type\n"
     elseif hasfieldcount(T)
       msg = "  "^depth * "$T is a struct that's not allocated inline\n"
-      for U in fieldtypes(dt)
+      for U in fieldtypes(T)
           if !Base.allocatedinline(U)
               msg *= explain_nonisbits(U, depth+1)
           end

From d3e93a73ef5528ce49263f8adb7c303f2f10546e Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Wed, 22 Feb 2023 17:39:55 -0500
Subject: [PATCH 03/18] Resolving weird syntax that caused compilation error

---
 src/array.jl | 139 +++++++++++++++++++++------------------------------
 1 file changed, 57 insertions(+), 82 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index 348892445d..84676f0877 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -300,8 +300,7 @@ end
 
 export DenseCuArray, DenseCuVector, DenseCuMatrix, DenseCuVecOrMat,
        StridedCuArray, StridedCuVector, StridedCuMatrix, StridedCuVecOrMat,
-       AnyCuArray, AnyCuVector, AnyCuMatrix, AnyCuVecOrMat,
-       StridedGeneralArray, StridedGeneralMatrix, StridedGeneralVector
+       AnyCuArray, AnyCuVector, AnyCuMatrix, AnyCuVecOrMat
 
 # dense arrays: stored contiguously in memory
 #
@@ -325,11 +324,6 @@ const StridedCuVector{T} = StridedCuArray{T,1}
 const StridedCuMatrix{T} = StridedCuArray{T,2}
 const StridedCuVecOrMat{T} = Union{StridedCuVector{T}, StridedCuMatrix{T}}
 
-#union of strided CuArrays and Arrays
-const StridedGeneralArray{T,N} = Union{StridedCuArray{T,N}, StridedArray{T,N}}
-const StridedGeneralVector{T} = StridedGeneralArray{T,1}
-const StridedGeneralMatrix{T} = StridedGeneralArray{T,2}
-
 Base.pointer(x::StridedCuArray{T}) where {T} = Base.unsafe_convert(CuPtr{T}, x)
 @inline function Base.pointer(x::StridedCuArray{T}, i::Integer) where T
     Base.unsafe_convert(CuPtr{T}, x) + Base._memory_offset(x, i)
@@ -435,94 +429,75 @@ function Base.copyto!(dest::DenseCuArray{T}, doffs::Integer, src::DenseCuArray{T
 end
 
 Base.copyto!(dest::DenseCuArray{T}, src::DenseCuArray{T}) where {T} =
-    copyto!(dest, 1, src, 1, length(src))
-
+copyto!(dest, 1, src, 1, length(src))
 
 #TO DO: expand this for StridedMatrices of different shapes, currently the src needs to fit in the destination
 #TO DO: add parameters doffs, soffs, n
-function copyto_views!(dest::StridedGeneralMatrix{T},src::StridedGeneralMatrix{T}, 
-                                dest_location, src_location) where T #to do: locations need to be typed as Type{<:AbstractBuffer} from CUDA.jl/lib/cudadrv/memory.jl
-  src_step_x=step(src.indices[1])
-  dest_step_x=step(dest.indices[1])
-  src_step_height=step(src.indices[2])
-  dest_step_height=step(dest.indices[2])
-  src_parent_size=size(parent(src))
-  dest_parent_size=size(parent(dest))
-
-  @boundscheck checkbounds(view(dest,1,:), 1:length(src.indices[2]))
-  @boundscheck checkbounds(view(dest,:,1), 1: length(src.indices[1]))
-
-  #Non-contigous views can be accomodated by copy3d in certain cases
-  if isinteger(src_parent_size[1]*src_step_height/src_step_x) && isinteger(dest_parent_size[1]*dest_step_height/dest_step_x) 
-    Mem.unsafe_copy3d!(pointer(dest), dest_location, pointer(src), src_location,
-                              1, size(src,1), size(src,2);
-                              srcPos=(1,1,1), dstPos=(1,1,1),
-                              srcPitch=src_step_x*sizeof(T),srcHeight=Int(src_parent_size[1]*src_step_height/src_step_x),
-                              dstPitch=dest_step_x*sizeof(T), dstHeight=Int(dest_parent_size[1]*dest_step_height/dest_step_x))
-  #In other cases, use parallel threads
-  else
-    CUDA.synchronize()
-    @sync for col in 1:length(src.indices[2])
-      Threads.@spawn begin
-        println(" Thread index "*string(col))
-        Mem.unsafe_copy3d!(pointer(view(dest,:,col)), dest_location, pointer(view(src,:,col)), src_location,
-                            1, 1, size(src,1);
-                            srcPos=(1,1,1), dstPos=(1,1,1),
-                            srcPitch=sizeof(T)*src_step_x,srcHeight=1,
-                            dstPitch=sizeof(T)*dest_step_x, dstHeight=1)
+
+for (destType,srcType, destLocation, SrcLocation) in ((StridedSubCuArray, SubArray,  Mem.Device,Mem.Host) ,
+                                                    (SubArray, StridedSubCuArray,  Mem.Host,Mem.Device), 
+                                                    (StridedSubCuArray, StridedSubCuArray,  Mem.Device,Mem.Device) )
+  @eval begin
+    function Base.copyto!(dest::$destType{T,2,args1},src::$srcType{T,2,args2}) where {T,args1,args2} 
+      src_step_x=step(src.indices[1])
+      dest_step_x=step(dest.indices[1])
+      src_step_height=step(src.indices[2])
+      dest_step_height=step(dest.indices[2])
+      src_parent_size=size(parent(src))
+      dest_parent_size=size(parent(dest))
+
+      @boundscheck checkbounds(view(dest,1,:), 1:length(src.indices[2]))
+      @boundscheck checkbounds(view(dest,:,1), 1: length(src.indices[1]))
+
+      #Non-contigous views can be accomodated by copy3d in certain cases
+      if isinteger(src_parent_size[1]*src_step_height/src_step_x) && isinteger(dest_parent_size[1]*dest_step_height/dest_step_x) 
+        Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation,
+                                  1, size(src,1), size(src,2);
+                                  srcPos=(1,1,1), dstPos=(1,1,1),
+                                  srcPitch=src_step_x*sizeof(T),srcHeight=Int(src_parent_size[1]*src_step_height/src_step_x),
+                                  dstPitch=dest_step_x*sizeof(T), dstHeight=Int(dest_parent_size[1]*dest_step_height/dest_step_x))
+      #In other cases, use parallel threads
+      else
         CUDA.synchronize()
+        #@sync 
+        for col in 1:length(src.indices[2])
+          #Threads.@spawn begin
+            Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)),  srcLocation,
+                                1, 1, size(src,1);
+                                srcPos=(1,1,1), dstPos=(1,1,1),
+                                srcPitch=sizeof(T)*src_step_x,srcHeight=1,
+                                dstPitch=sizeof(T)*dest_step_x, dstHeight=1)
+            CUDA.synchronize()
+          #end
+        end
       end
+      return dest
     end
-  end
-  return dest
-end
 
-Base.copyto!(dest::StridedCuMatrix{T}, src::StridedMatrix{T} ) where {T} = 
-  copyto_views!(dest,src,Mem.Device,Mem.Host)
-
-Base.copyto!(dest::StridedCuMatrix{T}, src::StridedMatrix{T} ) where {T} = 
-  copyto_views!(dest,src,Mem.Host,Mem.Device)
+    function copyto_views!(dest::$destType{T,1,args1},doffs::Integer,src::$srcType{T,1,args2},  soffs::Integer,
+                                  n::Integer) where {T,args1,args2} 
+      n==0 && return dest
+      @boundscheck checkbounds(dest, doffs)
+      @boundscheck checkbounds(dest, doffs+n-1)
+      @boundscheck checkbounds(src, soffs)
+      @boundscheck checkbounds(src, soffs+n-1)
+
+      Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation,
+                                1, n, 1;
+                                srcPos=(1,soffs,1), dstPos=(1,doffs,1),
+                                srcPitch=src_step*sizeof(T),srcHeight=1,
+                                dstPitch=dest_step*sizeof(T), dstHeight=1)
+      return dest
+    end
 
-Base.copyto!(dest::StridedCuMatrix{T}, src::StridedCuMatrix{T} ) where {T} = 
-  copyto_views!(dest,src,Mem.Device,Mem.Device)
 
-function copyto_views!(dest::StridedGeneralVector{T},doffs::Integer,src::StridedGeneralVector{T},  soffs::Integer,
-                              n::Integer,dest_location, src_location) where T #to do: locations need to be typed as Type{<:AbstractBuffer} from CUDA.jl/lib/cudadrv/memory.jl
-  n==0 && return dest
-  @boundscheck checkbounds(dest, doffs)
-  @boundscheck checkbounds(dest, doffs+n-1)
-  @boundscheck checkbounds(src, soffs)
-  @boundscheck checkbounds(src, soffs+n-1)
 
-  src_step=step(src.indices)
-  dest_step=step(dest.indices)
+    Base.copyto!(dest::$destType{T}, src::$srcType{T}) where {T} =
+      copyto!(dest, 1, src, 1, length(src))
 
-  Mem.unsafe_copy3d!(pointer(dest), dest_location, pointer(src), src_location,
-                            1, n, 1;
-                            srcPos=(1,soffs,1), dstPos=(1,doffs,1),
-                            srcPitch=src_step*sizeof(T),srcHeight=1,
-                            dstPitch=dest_step*sizeof(T), dstHeight=1)
-  return dest
+  end
 end
 
-Base.copyto!(dest::StridedCuVector{T}, doffs::Integer, src::StridedVector{T}, soffs::Integer, n::Integer) where {T} = 
-  copyto_views!(dest,doffs,src,soffs,n,Mem.Device,Mem.Host)
-
-Base.copyto!(dest::StridedVector{T}, doffs::Integer, src::StridedCuVector{T}, soffs::Integer, n::Integer) where {T} = 
-  copyto_views!(dest,doffs,src,soffs,n,Mem.Host,Mem.Device)
-
-Base.copyto!(dest::StridedCuVector{T}, doffs::Integer, src::StridedCuVector{T}, soffs::Integer, n::Integer) where {T} = 
-  copyto_views!(dest,doffs,src,soffs,n,Mem.Device,Mem.Device)
-
-Base.copyto!(dest::StridedCuArray{T}, src::StridedArray{T}) where {T} =
-  copyto!(dest, 1, src, 1, length(src))
-
-Base.copyto!(dest::StridedArray{T}, src::StridedCuArray{T}) where {T} =
-copyto!(dest, 1, src, 1, length(src))
-
-Base.copyto!(dest::StridedCuArray{T}, src::StridedCuArray{T}) where {T} =
-copyto!(dest, 1, src, 1, length(src))
-
 
 # general case: use CUDA APIs
 

From f2f4d1773d657e228eeb2e4e59a542ba78e1fc8c Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Fri, 24 Feb 2023 09:47:29 -0500
Subject: [PATCH 04/18] Resolving syntax issue and typo

---
 src/array.jl | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index 84676f0877..9d9f86c558 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -434,9 +434,9 @@ copyto!(dest, 1, src, 1, length(src))
 #TO DO: expand this for StridedMatrices of different shapes, currently the src needs to fit in the destination
 #TO DO: add parameters doffs, soffs, n
 
-for (destType,srcType, destLocation, SrcLocation) in ((StridedSubCuArray, SubArray,  Mem.Device,Mem.Host) ,
-                                                    (SubArray, StridedSubCuArray,  Mem.Host,Mem.Device), 
-                                                    (StridedSubCuArray, StridedSubCuArray,  Mem.Device,Mem.Device) )
+for (destType,srcType) in ((StridedSubCuArray, SubArray) ,
+                                                    (SubArray, StridedSubCuArray), 
+                                                    (StridedSubCuArray, StridedSubCuArray) )
   @eval begin
     function Base.copyto!(dest::$destType{T,2,args1},src::$srcType{T,2,args2}) where {T,args1,args2} 
       src_step_x=step(src.indices[1])
@@ -445,6 +445,8 @@ for (destType,srcType, destLocation, SrcLocation) in ((StridedSubCuArray, SubArr
       dest_step_height=step(dest.indices[2])
       src_parent_size=size(parent(src))
       dest_parent_size=size(parent(dest))
+      destLocation= (dest isa StridedSubCuArray) ? Mem.Device : Mem.Host
+      srcLocation= (src isa StridedSubCuArray) ? Mem.Device : Mem.Host
 
       @boundscheck checkbounds(view(dest,1,:), 1:length(src.indices[2]))
       @boundscheck checkbounds(view(dest,:,1), 1: length(src.indices[1]))
@@ -459,16 +461,16 @@ for (destType,srcType, destLocation, SrcLocation) in ((StridedSubCuArray, SubArr
       #In other cases, use parallel threads
       else
         CUDA.synchronize()
-        #@sync 
+        @sync 
         for col in 1:length(src.indices[2])
-          #Threads.@spawn begin
+          Threads.@spawn begin
             Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)),  srcLocation,
                                 1, 1, size(src,1);
                                 srcPos=(1,1,1), dstPos=(1,1,1),
                                 srcPitch=sizeof(T)*src_step_x,srcHeight=1,
                                 dstPitch=sizeof(T)*dest_step_x, dstHeight=1)
             CUDA.synchronize()
-          #end
+          end
         end
       end
       return dest
@@ -481,6 +483,8 @@ for (destType,srcType, destLocation, SrcLocation) in ((StridedSubCuArray, SubArr
       @boundscheck checkbounds(dest, doffs+n-1)
       @boundscheck checkbounds(src, soffs)
       @boundscheck checkbounds(src, soffs+n-1)
+      destLocation= (dest isa StridedSubCuArray) ? Mem.Device : Mem.Host
+      srcLocation= (src isa StridedSubCuArray) ? Mem.Device : Mem.Host
 
       Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation,
                                 1, n, 1;

From 1fa812e5d455b817f4b413ac98b5292805e4e15f Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Fri, 24 Feb 2023 10:11:46 -0500
Subject: [PATCH 05/18] Typo

---
 src/array.jl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index 9d9f86c558..8bc0120e07 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -461,8 +461,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) ,
       #In other cases, use parallel threads
       else
         CUDA.synchronize()
-        @sync 
-        for col in 1:length(src.indices[2])
+        @sync for col in 1:length(src.indices[2])
           Threads.@spawn begin
             Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)),  srcLocation,
                                 1, 1, size(src,1);

From e73bdfd3a87ffb6c4e59215a1670091d14eeaa3a Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Fri, 24 Feb 2023 10:50:53 -0500
Subject: [PATCH 06/18] Testing build without sync

---
 src/array.jl | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index 8bc0120e07..38f5e01008 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -461,15 +461,16 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) ,
       #In other cases, use parallel threads
       else
         CUDA.synchronize()
-        @sync for col in 1:length(src.indices[2])
-          Threads.@spawn begin
+        #@sync 
+        for col in 1:length(src.indices[2])
+          #Threads.@spawn begin
             Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)),  srcLocation,
                                 1, 1, size(src,1);
                                 srcPos=(1,1,1), dstPos=(1,1,1),
                                 srcPitch=sizeof(T)*src_step_x,srcHeight=1,
                                 dstPitch=sizeof(T)*dest_step_x, dstHeight=1)
             CUDA.synchronize()
-          end
+          #end
         end
       end
       return dest

From 52f8e29e28ae7034c771296941f6ddedc35dbe5b Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Mon, 27 Feb 2023 21:15:09 -0500
Subject: [PATCH 07/18] Fixing function name to match copyto Base function

---
 src/array.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/array.jl b/src/array.jl
index 38f5e01008..c932bac6f8 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -476,7 +476,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) ,
       return dest
     end
 
-    function copyto_views!(dest::$destType{T,1,args1},doffs::Integer,src::$srcType{T,1,args2},  soffs::Integer,
+    function Base.copyto!(dest::$destType{T,1,args1},doffs::Integer,src::$srcType{T,1,args2},  soffs::Integer,
                                   n::Integer) where {T,args1,args2} 
       n==0 && return dest
       @boundscheck checkbounds(dest, doffs)

From c9030f4c255154bdd177faec3ff0f453224c25ad Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Mon, 27 Feb 2023 23:34:25 -0500
Subject: [PATCH 08/18] Adding copyto support for mixing views with full arrays

---
 src/array.jl | 59 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 38 insertions(+), 21 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index c932bac6f8..b892c538b0 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -434,22 +434,38 @@ copyto!(dest, 1, src, 1, length(src))
 #TO DO: expand this for StridedMatrices of different shapes, currently the src needs to fit in the destination
 #TO DO: add parameters doffs, soffs, n
 
-for (destType,srcType) in ((StridedSubCuArray, SubArray) ,
-                                                    (SubArray, StridedSubCuArray), 
-                                                    (StridedSubCuArray, StridedSubCuArray) )
+for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSubCuArray), 
+                            (StridedSubCuArray, StridedSubCuArray),
+                            (StridedSubCuArray, Array) ,  (Array, StridedSubCuArray), 
+                            (CuArray, StridedSubCuArray) , ( StridedSubCuArray, CuArray),
+                            (CuArray, SubArray) , (SubArray, CuArray) 
+                          )
   @eval begin
-    function Base.copyto!(dest::$destType{T,2,args1},src::$srcType{T,2,args2}) where {T,args1,args2} 
-      src_step_x=step(src.indices[1])
-      dest_step_x=step(dest.indices[1])
-      src_step_height=step(src.indices[2])
-      dest_step_height=step(dest.indices[2])
-      src_parent_size=size(parent(src))
-      dest_parent_size=size(parent(dest))
-      destLocation= (dest isa StridedSubCuArray) ? Mem.Device : Mem.Host
-      srcLocation= (src isa StridedSubCuArray) ? Mem.Device : Mem.Host
-
-      @boundscheck checkbounds(view(dest,1,:), 1:length(src.indices[2]))
-      @boundscheck checkbounds(view(dest,:,1), 1: length(src.indices[1]))
+    function Base.copyto!(dest::$destType{T,2},src::$srcType{T,2}) where {T} 
+      if (dest isa StridedSubCuArray) || (dest isa SubArray) 
+        dest_step_x=step(dest.indices[1])
+        dest_step_height=step(dest.indices[2])
+        dest_parent_size=size(parent(dest))
+      else
+        dest_step_x=1
+        dest_step_height=1
+        dest_parent_size=size(dest)
+      end
+      if (src isa StridedSubCuArray) || (src isa SubArray)
+        src_step_x=step(src.indices[1])
+        src_step_height=step(src.indices[2])
+        src_parent_size=size(parent(src)) 
+      else
+        src_step_x=1
+        src_step_height=1
+        src_parent_size=size(src) 
+        
+      end
+      destLocation= ((dest isa StridedSubCuArray) || (dest isa CuArray)) ? Mem.Device : Mem.Host
+      srcLocation= ((src isa StridedSubCuArray) || (src isa CuArray)) ? Mem.Device : Mem.Host
+      @boundscheck checkbounds(view(dest,1,:), 1:size(src,2))
+      @boundscheck checkbounds(view(dest,:,1), 1:size(src,1))
+      
 
       #Non-contigous views can be accomodated by copy3d in certain cases
       if isinteger(src_parent_size[1]*src_step_height/src_step_x) && isinteger(dest_parent_size[1]*dest_step_height/dest_step_x) 
@@ -476,18 +492,20 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) ,
       return dest
     end
 
-    function Base.copyto!(dest::$destType{T,1,args1},doffs::Integer,src::$srcType{T,1,args2},  soffs::Integer,
-                                  n::Integer) where {T,args1,args2} 
+    function Base.copyto!(dest::$destType{T,1},doffs::Integer,src::$srcType{T,1},  soffs::Integer,
+                                  n::Integer) where {T} 
       n==0 && return dest
       @boundscheck checkbounds(dest, doffs)
       @boundscheck checkbounds(dest, doffs+n-1)
       @boundscheck checkbounds(src, soffs)
       @boundscheck checkbounds(src, soffs+n-1)
-      destLocation= (dest isa StridedSubCuArray) ? Mem.Device : Mem.Host
-      srcLocation= (src isa StridedSubCuArray) ? Mem.Device : Mem.Host
+      src_step= ((src isa StridedSubCuArray) || (src isa SubArray)) ? step(src.indices[1]) : 1
+      dest_step= ((dest isa StridedSubCuArray) || (dest isa SubArray) ) ? step(dest.indices[1]) : 1
+      destLocation= ((dest isa StridedSubCuArray) || (dest isa CuArray)) ? Mem.Device : Mem.Host
+      srcLocation= ((src isa StridedSubCuArray) || (src isa CuArray)) ? Mem.Device : Mem.Host
 
       Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation,
-                                1, n, 1;
+                                1, 1, n;
                                 srcPos=(1,soffs,1), dstPos=(1,doffs,1),
                                 srcPitch=src_step*sizeof(T),srcHeight=1,
                                 dstPitch=dest_step*sizeof(T), dstHeight=1)
@@ -502,7 +520,6 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) ,
   end
 end
 
-
 # general case: use CUDA APIs
 
 # NOTE: we only switch contexts here to avoid illegal memory accesses. synchronization is

From 1376be68c35925608601b9296a4a97c73771fb88 Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Mon, 27 Feb 2023 23:34:46 -0500
Subject: [PATCH 09/18] Typos in tests

---
 test/array.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/array.jl b/test/array.jl
index 0438911258..95ebb78fa5 100644
--- a/test/array.jl
+++ b/test/array.jl
@@ -386,28 +386,28 @@ end
     gpu_view= view(gpu_matrix, 2:3:11, 3:2:11)
     gpu_view2= view(gpu_matrix2,1:5:16, 4:4:20)
     copyto!(gpu_view,gpu_view2)
-    @test collect(gpu_view) == cpu_view
+    @test collect(gpu_view) == gpu_view2
 
     gpu_matrix = CUDA.rand(elty, m,n)
     gpu_matrix2 = CUDA.rand(elty,l,k)
     gpu_view= view(gpu_matrix,:, :)
     gpu_view2= view(gpu_matrix2,1:m, 1:n)
     copyto!(gpu_view,gpu_view2)
-    @test collect(gpu_view) == cpu_view
+    @test collect(gpu_view) == gpu_view2
 
     gpu_vec = CUDA.rand(elty, m)
     gpu_vec2 = CUDA.rand(elty,l)
     gpu_view= view(gpu_vec, 2:3:11)
     gpu_view2= view(gpu_vec2,1:5:16)
     copyto!(gpu_view,gpu_view2)
-    @test collect(gpu_view) == cpu_view
+    @test collect(gpu_view) == gpu_view2
 
     gpu_vec = CUDA.rand(elty, m)
     gpu_vec2 = CUDA.rand(elty,l)
     gpu_view= view(gpu_vec, :)
     gpu_view2= view(gpu_vec2,1:m)
     copyto!(gpu_view,gpu_view2)
-    @test collect(gpu_view) == cpu_view
+    @test collect(gpu_view) == gpu_view2
 
   end
 end

From 84d9b4ba22dfb89ca93af1271190c4beb8eaafca Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Tue, 28 Feb 2023 02:41:17 -0500
Subject: [PATCH 10/18] Adding support for 1D and 2D views of multi-dimensional
 arrays

---
 src/array.jl  | 70 +++++++++++++++++++++++++++++++++++----------------
 test/array.jl | 61 +++++++++++++++++++++++++++++++++++---------
 2 files changed, 98 insertions(+), 33 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index b892c538b0..47c5e8a927 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -442,49 +442,60 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
                           )
   @eval begin
     function Base.copyto!(dest::$destType{T,2},src::$srcType{T,2}) where {T} 
-      if (dest isa StridedSubCuArray) || (dest isa SubArray) 
-        dest_step_x=step(dest.indices[1])
-        dest_step_height=step(dest.indices[2])
+      if (dest isa StridedSubCuArray) || (dest isa SubArray)
+        dest_index1=findfirst(length.(dest.indices).>1)
+        dest_index2=findnext(length.(dest.indices).>1, dest_index1+1)
+        dest_step_x=step(dest.indices[dest_index1])
+        dest_step_height=step(dest.indices[dest_index2])
         dest_parent_size=size(parent(dest))
+        dest_pitch1= (dest_index1==1) ? 1 :  prod(dest_parent_size[1:(dest_index1-1)])
+        dest_pitch2=  prod(dest_parent_size[dest_index1:(dest_index2-1)])
       else
+        dest_index1=1
+        dest_index2=2
         dest_step_x=1
         dest_step_height=1
         dest_parent_size=size(dest)
       end
       if (src isa StridedSubCuArray) || (src isa SubArray)
-        src_step_x=step(src.indices[1])
-        src_step_height=step(src.indices[2])
+        src_index1=findfirst(length.(src.indices).>1)
+        src_index2=findnext(length.(src.indices).>1, src_index1+1)
+        src_step_x=step(src.indices[src_index1])
+        src_step_height=step(src.indices[src_index2])
         src_parent_size=size(parent(src)) 
+        src_pitch1= (src_index1==1) ? 1 :  prod(src_parent_size[1:(src_index1-1)])
+        src_pitch2= prod(src_parent_size[src_index1:(src_index2-1)])
       else
+        src_index1=1
+        src_index2=2
         src_step_x=1
         src_step_height=1
         src_parent_size=size(src) 
-        
       end
       destLocation= ((dest isa StridedSubCuArray) || (dest isa CuArray)) ? Mem.Device : Mem.Host
       srcLocation= ((src isa StridedSubCuArray) || (src isa CuArray)) ? Mem.Device : Mem.Host
-      @boundscheck checkbounds(view(dest,1,:), 1:size(src,2))
-      @boundscheck checkbounds(view(dest,:,1), 1:size(src,1))
+      @boundscheck checkbounds(1:size(dest, dest_index1), 1:size(src,src_index1))
+      @boundscheck checkbounds(1:size(dest, dest_index2), 1:size(src,src_index2))
       
 
       #Non-contigous views can be accomodated by copy3d in certain cases
-      if isinteger(src_parent_size[1]*src_step_height/src_step_x) && isinteger(dest_parent_size[1]*dest_step_height/dest_step_x) 
+      if isinteger(src_pitch2*src_step_height/src_step_x/src_pitch1) && isinteger(dest_pitch2*dest_step_height/dest_step_x/dest_pitch1) 
         Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation,
-                                  1, size(src,1), size(src,2);
+                                  1, size(src,src_index1), size(src,src_index2);
                                   srcPos=(1,1,1), dstPos=(1,1,1),
-                                  srcPitch=src_step_x*sizeof(T),srcHeight=Int(src_parent_size[1]*src_step_height/src_step_x),
-                                  dstPitch=dest_step_x*sizeof(T), dstHeight=Int(dest_parent_size[1]*dest_step_height/dest_step_x))
+                                  srcPitch=src_step_x*sizeof(T)*src_pitch1,srcHeight=Int(src_pitch2*src_step_height/src_step_x/src_pitch1),
+                                  dstPitch=dest_step_x*sizeof(T)*dest_pitch1, dstHeight=Int(dest_pitch2*dest_step_height/dest_step_x/dest_pitch1))
       #In other cases, use parallel threads
       else
         CUDA.synchronize()
         #@sync 
-        for col in 1:length(src.indices[2])
+        for col in 1:length(src.indices[src_index2])
           #Threads.@spawn begin
             Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)),  srcLocation,
-                                1, 1, size(src,1);
+                                1, 1, size(src,src_index1);
                                 srcPos=(1,1,1), dstPos=(1,1,1),
-                                srcPitch=sizeof(T)*src_step_x,srcHeight=1,
-                                dstPitch=sizeof(T)*dest_step_x, dstHeight=1)
+                                srcPitch=sizeof(T)*src_step_x*src_pitch1,srcHeight=1,
+                                dstPitch=sizeof(T)*dest_step_x*dest_pitch1, dstHeight=1)
             CUDA.synchronize()
           #end
         end
@@ -499,16 +510,33 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
       @boundscheck checkbounds(dest, doffs+n-1)
       @boundscheck checkbounds(src, soffs)
       @boundscheck checkbounds(src, soffs+n-1)
-      src_step= ((src isa StridedSubCuArray) || (src isa SubArray)) ? step(src.indices[1]) : 1
-      dest_step= ((dest isa StridedSubCuArray) || (dest isa SubArray) ) ? step(dest.indices[1]) : 1
+      if (dest isa StridedSubCuArray) || (dest isa SubArray)
+        dest_index=findfirst(length.(dest.indices).>1)
+        dest_step=step(dest.indices[dest_index])
+        dest_pitch=(dest_index==1) ? 1 : prod(size(parent(dest))[1:(dest_index-1)])
+      else
+        dest_index=1
+        dest_step=1
+        dest_pitch=1
+      end
+
+      if (src isa StridedSubCuArray) || (src isa SubArray)
+        src_index=findfirst(length.(src.indices).>1)
+        src_step=step(src.indices[src_index])
+        src_pitch= (src_index==1) ? 1 : prod(size(parent(src))[1:(src_index-1)])
+      else
+        src_index=1
+        src_step=1
+        src_pitch=1
+      end
       destLocation= ((dest isa StridedSubCuArray) || (dest isa CuArray)) ? Mem.Device : Mem.Host
       srcLocation= ((src isa StridedSubCuArray) || (src isa CuArray)) ? Mem.Device : Mem.Host
 
       Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation,
                                 1, 1, n;
-                                srcPos=(1,soffs,1), dstPos=(1,doffs,1),
-                                srcPitch=src_step*sizeof(T),srcHeight=1,
-                                dstPitch=dest_step*sizeof(T), dstHeight=1)
+                                srcPos=(1,1,soffs), dstPos=(1,1,doffs),
+                                srcPitch=src_step*sizeof(T)*src_pitch,srcHeight=1,
+                                dstPitch=dest_step*sizeof(T)*dest_pitch, dstHeight=1)
       return dest
     end
 
diff --git a/test/array.jl b/test/array.jl
index 95ebb78fa5..c059e52b5b 100644
--- a/test/array.jl
+++ b/test/array.jl
@@ -321,12 +321,16 @@ end
     m=11
     k=23
     l=19
+    range1=2:3:11
+    range2=3:2:11
+    range3=1:5:16
+    range4=4:4:20
 
     #From GPU to CPU
     gpu_matrix = CUDA.rand(elty, m,n)
     cpu_matrix = rand(elty,l,k)
-    gpu_view= view(gpu_matrix, 2:3:11, 3:2:11)
-    cpu_view= view(cpu_matrix,1:5:16, 4:4:20)
+    gpu_view= view(gpu_matrix,range1 , range2)
+    cpu_view= view(cpu_matrix, range3, range4)
     copyto!(cpu_view,gpu_view)
     @test collect(gpu_view) == cpu_view
 
@@ -339,8 +343,8 @@ end
 
     gpu_vec = CUDA.rand(elty, m)
     cpu_vec = rand(elty,l)
-    gpu_view= view(gpu_vec, 2:3:11)
-    cpu_view= view(cpu_vec,1:5:16)
+    gpu_view= view(gpu_vec, range1)
+    cpu_view= view(cpu_vec,range3)
     copyto!(cpu_view,gpu_view)
     @test collect(gpu_view) == cpu_view
 
@@ -354,8 +358,8 @@ end
     #From CPU to GPU
     gpu_matrix = CUDA.rand(elty, m,n)
     cpu_matrix = rand(elty,l,k)
-    gpu_view= view(gpu_matrix, 2:3:11, 3:2:11)
-    cpu_view= view(cpu_matrix,1:5:16, 4:4:20)
+    gpu_view= view(gpu_matrix,range1 , range2)
+    cpu_view= view(cpu_matrix, range3, range4)
     copyto!(gpu_view,cpu_view)
     @test collect(gpu_view) == cpu_view
 
@@ -368,8 +372,8 @@ end
 
     gpu_vec = CUDA.rand(elty, m)
     cpu_vec = rand(elty,l)
-    gpu_view= view(gpu_vec, 2:3:11)
-    cpu_view= view(cpu_vec,1:5:16)
+    gpu_view= view(gpu_vec, range1)
+    cpu_view= view(cpu_vec,range3)
     copyto!(gpu_view,cpu_view)
     @test collect(gpu_view) == cpu_view
 
@@ -383,8 +387,8 @@ end
     #From GPU to GPU
     gpu_matrix = CUDA.rand(elty, m,n)
     gpu_matrix2 = CUDA.rand(elty,l,k)
-    gpu_view= view(gpu_matrix, 2:3:11, 3:2:11)
-    gpu_view2= view(gpu_matrix2,1:5:16, 4:4:20)
+    gpu_view= view(gpu_matrix,range1 , range2)
+    gpu_view2= view(gpu_matrix2,range3, range4)
     copyto!(gpu_view,gpu_view2)
     @test collect(gpu_view) == gpu_view2
 
@@ -397,8 +401,8 @@ end
 
     gpu_vec = CUDA.rand(elty, m)
     gpu_vec2 = CUDA.rand(elty,l)
-    gpu_view= view(gpu_vec, 2:3:11)
-    gpu_view2= view(gpu_vec2,1:5:16)
+    gpu_view= view(gpu_vec, range1)
+    gpu_view2= view(gpu_vec2, range3)
     copyto!(gpu_view,gpu_view2)
     @test collect(gpu_view) == gpu_view2
 
@@ -409,6 +413,39 @@ end
     copyto!(gpu_view,gpu_view2)
     @test collect(gpu_view) == gpu_view2
 
+    #testing higher dimensional views
+
+    for gpu_indices in ( (range1, range2, 3, 7) , (range1, 3, range2, 7), 
+                        (range1, 3, 7, range2), (3, range1, range2, 7),  
+                        (3, range1, 7, range2), (3,7, range1, range2) )
+      for cpu_indices in ( (range3, range4, 11, 5) , (range3, 11, range4, 5), 
+                          (range3, 11, 5, range4), (11, range3, range4, 5),  
+                         (11, range3, 5, range4), (11,5, range3, range4)   )
+        gpu_matrix = CUDA.rand(elty, m*3,n*3, k*3,l*3)
+        cpu_matrix = rand(elty,m*2,n*2, k*2, l*2)
+        gpu_view= view(gpu_matrix, gpu_indices...)
+        cpu_view= view(cpu_matrix, cpu_indices...) 
+        copyto!(gpu_view,cpu_view)
+        @test collect(gpu_view) == cpu_view
+
+      end
+    end
+    
+    for gpu_indices in ( (range1, 13, 3, 7) , (3, range1, 7, 13), 
+                        (3,7, range1, 13),  (3,7, 13, range1))
+      for cpu_indices in ( (range3, 11, 2, 5) , (3, range3, 2, 11), 
+                          (2,5, range3, 11),  (2,5, 11, range3))
+        gpu_matrix = CUDA.rand(elty, m*3,n*3, k*3,l*3)
+        cpu_matrix = rand(elty,m*2,n*2, k*2, l*2)
+        gpu_view= view(gpu_matrix, gpu_indices...)
+        cpu_view= view(cpu_matrix, cpu_indices...) 
+        copyto!(gpu_view,cpu_view)
+        @test collect(gpu_view) == cpu_view
+
+      end
+    end
+
+
   end
 end
 

From 85b3db3479c736d91f0b50395ec5498378ff8597 Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Tue, 28 Feb 2023 11:14:26 -0500
Subject: [PATCH 11/18] Computing pitches outside the view-only branches

---
 src/array.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index 47c5e8a927..246c246150 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -448,8 +448,6 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
         dest_step_x=step(dest.indices[dest_index1])
         dest_step_height=step(dest.indices[dest_index2])
         dest_parent_size=size(parent(dest))
-        dest_pitch1= (dest_index1==1) ? 1 :  prod(dest_parent_size[1:(dest_index1-1)])
-        dest_pitch2=  prod(dest_parent_size[dest_index1:(dest_index2-1)])
       else
         dest_index1=1
         dest_index2=2
@@ -463,8 +461,6 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
         src_step_x=step(src.indices[src_index1])
         src_step_height=step(src.indices[src_index2])
         src_parent_size=size(parent(src)) 
-        src_pitch1= (src_index1==1) ? 1 :  prod(src_parent_size[1:(src_index1-1)])
-        src_pitch2= prod(src_parent_size[src_index1:(src_index2-1)])
       else
         src_index1=1
         src_index2=2
@@ -472,6 +468,10 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
         src_step_height=1
         src_parent_size=size(src) 
       end
+      dest_pitch1= (dest_index1==1) ? 1 :  prod(dest_parent_size[1:(dest_index1-1)])
+      dest_pitch2=  prod(dest_parent_size[dest_index1:(dest_index2-1)])
+      src_pitch1= (src_index1==1) ? 1 :  prod(src_parent_size[1:(src_index1-1)])
+      src_pitch2= prod(src_parent_size[src_index1:(src_index2-1)])
       destLocation= ((dest isa StridedSubCuArray) || (dest isa CuArray)) ? Mem.Device : Mem.Host
       srcLocation= ((src isa StridedSubCuArray) || (src isa CuArray)) ? Mem.Device : Mem.Host
       @boundscheck checkbounds(1:size(dest, dest_index1), 1:size(src,src_index1))

From 06508213b6338d868b49e60c7242f42e062d0a81 Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Wed, 1 Mar 2023 01:04:06 -0500
Subject: [PATCH 12/18] Making copyto!(B,A) follow Base's linear copy when the
 first dimensions of A and B differ, unless Copy2D is set

---
 src/array.jl | 58 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 44 insertions(+), 14 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index 246c246150..a314953dc4 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -441,7 +441,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
                             (CuArray, SubArray) , (SubArray, CuArray) 
                           )
   @eval begin
-    function Base.copyto!(dest::$destType{T,2},src::$srcType{T,2}) where {T} 
+    function Base.copyto!(dest::$destType{T,2},src::$srcType{T,2}, Copy2D::Bool=false) where {T} 
       if (dest isa StridedSubCuArray) || (dest isa SubArray)
         dest_index1=findfirst(length.(dest.indices).>1)
         dest_index2=findnext(length.(dest.indices).>1, dest_index1+1)
@@ -468,38 +468,68 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
         src_step_height=1
         src_parent_size=size(src) 
       end
+
       dest_pitch1= (dest_index1==1) ? 1 :  prod(dest_parent_size[1:(dest_index1-1)])
       dest_pitch2=  prod(dest_parent_size[dest_index1:(dest_index2-1)])
       src_pitch1= (src_index1==1) ? 1 :  prod(src_parent_size[1:(src_index1-1)])
       src_pitch2= prod(src_parent_size[src_index1:(src_index2-1)])
       destLocation= ((dest isa StridedSubCuArray) || (dest isa CuArray)) ? Mem.Device : Mem.Host
       srcLocation= ((src isa StridedSubCuArray) || (src isa CuArray)) ? Mem.Device : Mem.Host
-      @boundscheck checkbounds(1:size(dest, dest_index1), 1:size(src,src_index1))
-      @boundscheck checkbounds(1:size(dest, dest_index2), 1:size(src,src_index2))
+      @boundscheck checkbounds(1:size(dest, 1), 1:size(src,1))
+      @boundscheck checkbounds(1:size(dest, 2), 1:size(src,2))
       
-
+      if (size(dest,1)==size(src,1) || (Copy2D))
       #Non-contigous views can be accomodated by copy3d in certain cases
-      if isinteger(src_pitch2*src_step_height/src_step_x/src_pitch1) && isinteger(dest_pitch2*dest_step_height/dest_step_x/dest_pitch1) 
-        Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation,
-                                  1, size(src,src_index1), size(src,src_index2);
+        if isinteger(src_pitch2*src_step_height/src_step_x/src_pitch1) && isinteger(dest_pitch2*dest_step_height/dest_step_x/dest_pitch1) 
+          Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation,
+                                    1, size(src,1), size(src,2);
+                                    srcPos=(1,1,1), dstPos=(1,1,1),
+                                    srcPitch=src_step_x*sizeof(T)*src_pitch1,srcHeight=Int(src_pitch2*src_step_height/src_step_x/src_pitch1),
+                                    dstPitch=dest_step_x*sizeof(T)*dest_pitch1, dstHeight=Int(dest_pitch2*dest_step_height/dest_step_x/dest_pitch1))
+        #In other cases, use parallel threads
+        else
+          CUDA.synchronize()
+          #@sync 
+          for col in 1:length(src.indices[src_index2])
+            #Threads.@spawn begin
+              Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)),  srcLocation,
+                                  1, 1, size(src,1);
                                   srcPos=(1,1,1), dstPos=(1,1,1),
-                                  srcPitch=src_step_x*sizeof(T)*src_pitch1,srcHeight=Int(src_pitch2*src_step_height/src_step_x/src_pitch1),
-                                  dstPitch=dest_step_x*sizeof(T)*dest_pitch1, dstHeight=Int(dest_pitch2*dest_step_height/dest_step_x/dest_pitch1))
-      #In other cases, use parallel threads
-      else
+                                  srcPitch=sizeof(T)*src_step_x*src_pitch1,srcHeight=1,
+                                  dstPitch=sizeof(T)*dest_step_x*dest_pitch1, dstHeight=1)
+              CUDA.synchronize()
+            #end
+          end
+        end
+      else  #Ensure same behavior as Base copying from smaller to bigger matrix if copy2D is false
+        start_indices=(1:size(src,1):size(src,1)*(size(src,2)+1))
+        dest_col=div.(start_indices.-1,size(dest,1)).+1
+        start_indices=mod.(start_indices,size(dest,1))
+        replace!(start_indices,0=>size(dest,1))
+        split_col=start_indices[1:end-1].>start_indices[2:end]
+
         CUDA.synchronize()
         #@sync 
         for col in 1:length(src.indices[src_index2])
           #Threads.@spawn begin
-            Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)),  srcLocation,
-                                1, 1, size(src,src_index1);
-                                srcPos=(1,1,1), dstPos=(1,1,1),
+            n= split_col[col] ? (size(dest,1)-start_indices[col]+1) : size(src,1)
+            Mem.unsafe_copy3d!(pointer(view(dest,:,dest_col[col])),destLocation, pointer(view(src,:,col)),  srcLocation,
+                                1, 1, n;
+                                srcPos=(1,1,1), dstPos=(1,1,start_indices[col]),
                                 srcPitch=sizeof(T)*src_step_x*src_pitch1,srcHeight=1,
                                 dstPitch=sizeof(T)*dest_step_x*dest_pitch1, dstHeight=1)
+            if split_col[col]
+              Mem.unsafe_copy3d!(pointer(view(dest,:,dest_col[col]+1)),destLocation, pointer(view(src,:,col)),  srcLocation,
+                                1, 1, size(src,1)-n;
+                                srcPos=(1,1,n+1), dstPos=(1,1,1),
+                                srcPitch=sizeof(T)*src_step_x*src_pitch1,srcHeight=1,
+                                dstPitch=sizeof(T)*dest_step_x*dest_pitch1, dstHeight=1)
+            end
             CUDA.synchronize()
           #end
         end
       end
+
       return dest
     end
 

From fdcd875e6404f7d2937bf24af80de4f81aea184a Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Wed, 1 Mar 2023 02:01:49 -0500
Subject: [PATCH 13/18] Fixing scalar indexing in test comparisons

---
 test/array.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/array.jl b/test/array.jl
index c059e52b5b..0c1529e489 100644
--- a/test/array.jl
+++ b/test/array.jl
@@ -390,28 +390,28 @@ end
     gpu_view= view(gpu_matrix,range1 , range2)
     gpu_view2= view(gpu_matrix2,range3, range4)
     copyto!(gpu_view,gpu_view2)
-    @test collect(gpu_view) == gpu_view2
+    @test collect(gpu_view) == collect(gpu_view2)
 
     gpu_matrix = CUDA.rand(elty, m,n)
     gpu_matrix2 = CUDA.rand(elty,l,k)
     gpu_view= view(gpu_matrix,:, :)
     gpu_view2= view(gpu_matrix2,1:m, 1:n)
     copyto!(gpu_view,gpu_view2)
-    @test collect(gpu_view) == gpu_view2
+    @test collect(gpu_view) == collect(gpu_view2)
 
     gpu_vec = CUDA.rand(elty, m)
     gpu_vec2 = CUDA.rand(elty,l)
     gpu_view= view(gpu_vec, range1)
     gpu_view2= view(gpu_vec2, range3)
     copyto!(gpu_view,gpu_view2)
-    @test collect(gpu_view) == gpu_view2
+    @test collect(gpu_view) == collect(gpu_view2)
 
     gpu_vec = CUDA.rand(elty, m)
     gpu_vec2 = CUDA.rand(elty,l)
     gpu_view= view(gpu_vec, :)
     gpu_view2= view(gpu_vec2,1:m)
     copyto!(gpu_view,gpu_view2)
-    @test collect(gpu_view) == gpu_view2
+    @test collect(gpu_view) == collect(gpu_view2)
 
     #testing higher dimensional views
 

From 024673f63ac9b076ca3f885cc2631335818d96db Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Wed, 1 Mar 2023 10:56:30 -0500
Subject: [PATCH 14/18] Adding support for views of length 1

---
 src/array.jl | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index a314953dc4..df80937800 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -438,13 +438,12 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
                             (StridedSubCuArray, StridedSubCuArray),
                             (StridedSubCuArray, Array) ,  (Array, StridedSubCuArray), 
                             (CuArray, StridedSubCuArray) , ( StridedSubCuArray, CuArray),
-                            (CuArray, SubArray) , (SubArray, CuArray) 
-                          )
+                            (CuArray, SubArray) , (SubArray, CuArray) )
   @eval begin
     function Base.copyto!(dest::$destType{T,2},src::$srcType{T,2}, Copy2D::Bool=false) where {T} 
       if (dest isa StridedSubCuArray) || (dest isa SubArray)
-        dest_index1=findfirst(length.(dest.indices).>1)
-        dest_index2=findnext(length.(dest.indices).>1, dest_index1+1)
+        dest_index1=findfirst((typeof.(dest.indices) .<: Int).==0)
+        dest_index2=findnext((typeof.(dest.indices) .<: Int).==0, dest_index1+1)
         dest_step_x=step(dest.indices[dest_index1])
         dest_step_height=step(dest.indices[dest_index2])
         dest_parent_size=size(parent(dest))
@@ -456,8 +455,8 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
         dest_parent_size=size(dest)
       end
       if (src isa StridedSubCuArray) || (src isa SubArray)
-        src_index1=findfirst(length.(src.indices).>1)
-        src_index2=findnext(length.(src.indices).>1, src_index1+1)
+        src_index1=findfirst((typeof.(src.indices) .<: Int).==0)
+        src_index2=findnext((typeof.(src.indices) .<: Int).==0, src_index1+1)
         src_step_x=step(src.indices[src_index1])
         src_step_height=step(src.indices[src_index2])
         src_parent_size=size(parent(src)) 
@@ -541,7 +540,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
       @boundscheck checkbounds(src, soffs)
       @boundscheck checkbounds(src, soffs+n-1)
       if (dest isa StridedSubCuArray) || (dest isa SubArray)
-        dest_index=findfirst(length.(dest.indices).>1)
+        dest_index=findfirst((typeof.(dest.indices) .<: Int).==0)
         dest_step=step(dest.indices[dest_index])
         dest_pitch=(dest_index==1) ? 1 : prod(size(parent(dest))[1:(dest_index-1)])
       else
@@ -551,7 +550,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
       end
 
       if (src isa StridedSubCuArray) || (src isa SubArray)
-        src_index=findfirst(length.(src.indices).>1)
+        src_index=findfirst((typeof.(dest.indices) .<: Int).==0)
         src_step=step(src.indices[src_index])
         src_pitch= (src_index==1) ? 1 : prod(size(parent(src))[1:(src_index-1)])
       else

From 7029f2ef1335a06cfa70ee952a2474c1dfa24460 Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Wed, 1 Mar 2023 10:57:32 -0500
Subject: [PATCH 15/18] Fixing src_index lookup to use src.indices

---
 src/array.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/array.jl b/src/array.jl
index df80937800..f1e8764c05 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -550,7 +550,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
       end
 
       if (src isa StridedSubCuArray) || (src isa SubArray)
-        src_index=findfirst((typeof.(dest.indices) .<: Int).==0)
+        src_index=findfirst((typeof.(src.indices) .<: Int).==0)
         src_step=step(src.indices[src_index])
         src_pitch= (src_index==1) ? 1 : prod(size(parent(src))[1:(src_index-1)])
       else

From 4022408c34f6a600cc5a54c88072e46e6802c19d Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Wed, 1 Mar 2023 13:43:41 -0500
Subject: [PATCH 16/18] Adding parallelization of copyto

---
 src/array.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index f1e8764c05..a1c81ced8d 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -488,16 +488,16 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
         #In other cases, use parallel threads
         else
           CUDA.synchronize()
-          #@sync 
+          @sync 
           for col in 1:length(src.indices[src_index2])
-            #Threads.@spawn begin
+            Threads.@spawn begin
               Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)),  srcLocation,
                                   1, 1, size(src,1);
                                   srcPos=(1,1,1), dstPos=(1,1,1),
                                   srcPitch=sizeof(T)*src_step_x*src_pitch1,srcHeight=1,
                                   dstPitch=sizeof(T)*dest_step_x*dest_pitch1, dstHeight=1)
               CUDA.synchronize()
-            #end
+            end
           end
         end
       else  #Ensure same behavior as Base copying from smaller to bigger matrix if copy2D is false
@@ -508,9 +508,9 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
         split_col=start_indices[1:end-1].>start_indices[2:end]
 
         CUDA.synchronize()
-        #@sync 
+        @sync 
         for col in 1:length(src.indices[src_index2])
-          #Threads.@spawn begin
+          Threads.@spawn begin
             n= split_col[col] ? (size(dest,1)-start_indices[col]+1) : size(src,1)
             Mem.unsafe_copy3d!(pointer(view(dest,:,dest_col[col])),destLocation, pointer(view(src,:,col)),  srcLocation,
                                 1, 1, n;
@@ -525,7 +525,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
                                 dstPitch=sizeof(T)*dest_step_x*dest_pitch1, dstHeight=1)
             end
             CUDA.synchronize()
-          #end
+          end
         end
       end
 

From b545545b810dacccd34b44bd0895b5d0386c2afe Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Wed, 1 Mar 2023 13:48:50 -0500
Subject: [PATCH 17/18] Joining @sync and its for loop on one line

---
 src/array.jl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index a1c81ced8d..993d9f7f94 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -488,8 +488,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
         #In other cases, use parallel threads
         else
           CUDA.synchronize()
-          @sync 
-          for col in 1:length(src.indices[src_index2])
+          @sync for col in 1:length(src.indices[src_index2])
             Threads.@spawn begin
               Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)),  srcLocation,
                                   1, 1, size(src,1);
@@ -508,8 +507,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
         split_col=start_indices[1:end-1].>start_indices[2:end]
 
         CUDA.synchronize()
-        @sync 
-        for col in 1:length(src.indices[src_index2])
+        @sync for col in 1:length(src.indices[src_index2])
           Threads.@spawn begin
             n= split_col[col] ? (size(dest,1)-start_indices[col]+1) : size(src,1)
             Mem.unsafe_copy3d!(pointer(view(dest,:,dest_col[col])),destLocation, pointer(view(src,:,col)),  srcLocation,

From 64d3eced63d770a1262d8febca0b0b65241d6385 Mon Sep 17 00:00:00 2001
From: Evelyne <110474206+evelyne-ringoot@users.noreply.github.com>
Date: Wed, 1 Mar 2023 15:06:57 -0500
Subject: [PATCH 18/18] Resolving compilation issues

---
 src/array.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/array.jl b/src/array.jl
index 993d9f7f94..191e581ae3 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -488,7 +488,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
         #In other cases, use parallel threads
         else
           CUDA.synchronize()
-          @sync for col in 1:length(src.indices[src_index2])
+          Base.@sync for col in 1:length(src.indices[src_index2])
             Threads.@spawn begin
               Mem.unsafe_copy3d!(pointer(view(dest,:,col)),destLocation, pointer(view(src,:,col)),  srcLocation,
                                   1, 1, size(src,1);
@@ -507,7 +507,7 @@ for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSub
         split_col=start_indices[1:end-1].>start_indices[2:end]
 
         CUDA.synchronize()
-        @sync for col in 1:length(src.indices[src_index2])
+        Base.@sync for col in 1:length(src.indices[src_index2])
           Threads.@spawn begin
             n= split_col[col] ? (size(dest,1)-start_indices[col]+1) : size(src,1)
             Mem.unsafe_copy3d!(pointer(view(dest,:,dest_col[col])),destLocation, pointer(view(src,:,col)),  srcLocation,