Add an index typevar to CuDeviceArray.
maleadt committed Sep 11, 2023
1 parent c97bc77 commit 9b1d55d
Showing 6 changed files with 40 additions and 31 deletions.
53 changes: 30 additions & 23 deletions src/device/array.jl
@@ -6,31 +6,38 @@ export CuDeviceArray, CuDeviceVector, CuDeviceMatrix, ldg
 ## construction
 
 """
-    CuDeviceArray{T,N,A}(ptr, dims, [maxsize])
+    CuDeviceArray{T,N,A,I}(ptr, dims, [maxsize])
 
 Construct an `N`-dimensional dense CUDA device array with element type `T` wrapping a
-pointer, where `N` is determined from the length of `dims` and `T` is determined from the
-type of `ptr`. `dims` may be a single scalar, or a tuple of integers corresponding to the
-lengths in each dimension). If the rank `N` is supplied explicitly as in `Array{T,N}(dims)`,
-then it must match the length of `dims`. The same applies to the element type `T`, which
-should match the type of the pointer `ptr`.
+pointer `ptr` in address space `A`. `dims` should be a tuple of `N` integers corresponding
+to the lengths in each dimension. `maxsize` is the maximum number of bytes that can be
+stored in the array, and is determined automatically if not specified. `I` is the integer
+type used to store the size of the array, and is determined automatically if not specified.
 """
 CuDeviceArray
 
 # NOTE: we can't support the typical `tuple or series of integer` style construction,
 # because we're currently requiring a trailing pointer argument.
 
-struct CuDeviceArray{T,N,A} <: DenseArray{T,N}
+struct CuDeviceArray{T,N,A,I} <: DenseArray{T,N}
     ptr::LLVMPtr{T,A}
-    maxsize::Int
-
-    dims::Dims{N}
-    len::Int
+    maxsize::I
+
+    dims::NTuple{N,I}
+    len::I
+
+    # determine an index type based on the size of the array.
+    # this is type unstable, so only use this constructor from the host side.
+    function CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
+                                  maxsize::Integer=prod(dims)*sizeof(T)) where {T,A,N}
+        if maxsize <= typemax(Int32)
+            CuDeviceArray{T,N,A,Int32}(ptr, dims, maxsize)
+        else
+            CuDeviceArray{T,N,A,Int64}(ptr, dims, maxsize)
+        end
+    end
 
-    # inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
-    CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
-                         maxsize::Int=prod(dims)*sizeof(T)) where {T,A,N} =
-        new(ptr, maxsize, dims, prod(dims))
+    # fully typed, for use in device code
+    CuDeviceArray{T,N,A,I}(ptr::LLVMPtr{T,A}, dims::Tuple,
+                           maxsize::Integer=prod(dims)*sizeof(T)) where {T,A,N,I} =
+        new{T,N,A,I}(ptr, convert(I, maxsize), map(I, dims), convert(I, prod(dims)))
 end
 
 const CuDeviceVector = CuDeviceArray{T,1,A} where {T,A}
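
The new host-side constructor picks the narrowest index type that can address the buffer, so device code can index small arrays with 32-bit integers, which is generally cheaper on GPUs. A standalone sketch of that selection branch, using a hypothetical helper name purely for illustration:

    # Hypothetical helper mirroring the branch in the host-side constructor:
    # buffers that fit in typemax(Int32) bytes get 32-bit indices.
    index_type(maxsize::Integer) = maxsize <= typemax(Int32) ? Int32 : Int64

    index_type(1024)        # Int32: a small buffer
    index_type(4 * 2^30)    # Int64: a 4 GiB buffer exceeds typemax(Int32)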
@@ -224,18 +231,18 @@ Base.show(io::IO, mime::MIME"text/plain", a::CuDeviceArray) = show(io, a)
     end
 end
 
-function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A}) where {T,S,N,A}
+function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A,I}) where {T,S,N,A,I}
     err = GPUArrays._reinterpret_exception(T, a)
     err === nothing || throw(err)
 
     if sizeof(T) == sizeof(S) # fast case
-        return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
+        return CuDeviceArray{T,N,A,I}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
     end
 
     isize = size(a)
     size1 = div(isize[1]*sizeof(S), sizeof(T))
     osize = tuple(size1, Base.tail(isize)...)
-    return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
+    return CuDeviceArray{T,N,A,I}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
 end
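
The slow path above rescales only the leading dimension by the ratio of the element sizes. The same arithmetic in isolation, as plain Julia with no GPU required:

    # Reinterpreting a (16, 4) Float32 array as UInt8: the first dimension
    # grows by div(sizeof(Float32), sizeof(UInt8)) = 4; trailing dimensions stay.
    S, T = Float32, UInt8
    isize = (16, 4)
    size1 = div(isize[1] * sizeof(S), sizeof(T))
    osize = tuple(size1, Base.tail(isize)...)   # (64, 4)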


@@ -252,7 +259,7 @@ function Base.reshape(a::CuDeviceArray{T,M,A}, dims::NTuple{N,Int}) where {T,N,M,A}
 end
 
 # create a derived device array (reinterpreted or reshaped) that's still a CuDeviceArray
-@inline function _derived_array(a::CuDeviceArray{<:Any,<:Any,A}, ::Type{T},
-                                osize::Dims{N}) where {T, N, A}
-    return CuDeviceArray{T,N,A}(a.ptr, osize, a.maxsize)
+@inline function _derived_array(a::CuDeviceArray{<:Any,<:Any,A,I}, ::Type{T},
+                                osize::Dims{N}) where {T, N, A, I}
+    return CuDeviceArray{T,N,A,I}(a.ptr, osize, a.maxsize)
 end
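
Both `reinterpret` and `reshape` funnel through `_derived_array`, so derived views keep the parent's index type `I`. A quick host-side way to observe this (a sketch; `cudaconvert` is normally invoked by `@cuda` itself):

    using CUDA

    a  = CUDA.zeros(Float32, 16, 4)
    da = cudaconvert(a)          # device-side wrapper; small buffer, so I == Int32
    db = reshape(da, (4, 16))    # goes through _derived_array
    typeof(db)                   # still carries Int32 as its index type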
6 changes: 4 additions & 2 deletions src/device/intrinsics/memory_shared.jl
@@ -16,7 +16,8 @@ generator function will be called dynamically.
     # NOTE: this relies on const-prop to forward the literal length to the generator.
     # maybe we should include the size in the type, like StaticArrays does?
     ptr = emit_shmem(T, Val(len))
-    CuDeviceArray{T,N,AS.Shared}(ptr, dims)
+    # XXX: 4GB ought to be enough shared memory for anybody
+    CuDeviceArray{T,N,AS.Shared,Int32}(ptr, dims)
 end
 CuStaticSharedArray(::Type{T}, len::Integer) where {T} = CuStaticSharedArray(T, (len,))
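
Hard-coding `Int32` is safe here because shared memory per block is at most a few hundred kilobytes on current GPUs, far below `typemax(Int32)` bytes. A minimal usage sketch (the kernel is illustrative, not part of this commit, and assumes a working CUDA.jl setup):

    using CUDA

    function reverse_block!(out)
        buf = CuStaticSharedArray(Float32, 32)   # Int32-indexed device array
        i = threadIdx().x
        @inbounds buf[i] = Float32(i)
        sync_threads()
        @inbounds out[i] = buf[33 - i]
        return
    end

    out = CUDA.zeros(Float32, 32)
    @cuda threads=32 reverse_block!(out)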

@@ -53,7 +54,8 @@ shared memory; in the case of a homogeneous multi-part buffer it is preferred to
         end
     end
     ptr = emit_shmem(T) + offset
-    CuDeviceArray{T,N,AS.Shared}(ptr, dims)
+    # XXX: 4GB ought to be enough shared memory for anybody
+    CuDeviceArray{T,N,AS.Shared,Int32}(ptr, dims)
 end
 Base.@propagate_inbounds CuDynamicSharedArray(::Type{T}, len::Integer, offset) where {T} =
     CuDynamicSharedArray(T, (len,), offset)
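
Unlike the static variant, `CuDynamicSharedArray` draws on the dynamically sized shared-memory segment, whose byte size is supplied at launch time through the `shmem` keyword. A usage sketch (illustrative kernel, not from this commit):

    using CUDA

    function scale!(out, a)
        buf = CuDynamicSharedArray(Float32, length(out))
        i = threadIdx().x
        @inbounds buf[i] = a * out[i]
        sync_threads()
        @inbounds out[i] = buf[i]
        return
    end

    out = CUDA.rand(Float32, 32)
    @cuda threads=32 shmem=32*sizeof(Float32) scale!(out, 2f0)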
6 changes: 3 additions & 3 deletions src/device/random.jl
@@ -22,7 +22,7 @@ import RandomNumbers
         }
         attributes #0 = { alwaysinline }
         """, "entry"), LLVMPtr{UInt32, AS.Shared}, Tuple{})
-    CuDeviceArray{UInt32,1,AS.Shared}(ptr, (32,))
+    CuDeviceArray{UInt32,1,AS.Shared,Int32}(ptr, (32,))
 end
 
 # array with per-warp counters, incremented when generating numbers
@@ -36,7 +36,7 @@ end
         }
         attributes #0 = { alwaysinline }
         """, "entry"), LLVMPtr{UInt32, AS.Shared}, Tuple{})
-    CuDeviceArray{UInt32,1,AS.Shared}(ptr, (32,))
+    CuDeviceArray{UInt32,1,AS.Shared,Int32}(ptr, (32,))
 end
 
 # initialization function, called automatically at the start of each kernel because
@@ -192,7 +192,7 @@ end
 for var in [:ki, :wi, :fi, :ke, :we, :fe]
     val = getfield(Random, var)
     gpu_var = Symbol("gpu_$var")
-    arr_typ = :(CuDeviceArray{$(eltype(val)),$(ndims(val)),AS.Constant})
+    arr_typ = :(CuDeviceArray{$(eltype(val)),$(ndims(val)),AS.Constant,Int32})
     @eval @inline @generated function $gpu_var()
         ptr = emit_constant_array($(QuoteNode(var)), $val)
         Expr(:call, $arr_typ, ptr, $(size(val)))
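
The loop above generates one accessor per Ziggurat table, splicing the fully parameterized array type into the generated expression. The type-construction step in isolation (plain Julia; `val` is a stand-in for one of the `Random` tables):

    val = rand(Float64, 256)   # stand-in for one of the tables, e.g. Random.ki
    arr_typ = :(CuDeviceArray{$(eltype(val)),$(ndims(val)),AS.Constant,Int32})
    # arr_typ now spells out CuDeviceArray{Float64,1,AS.Constant,Int32}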
2 changes: 1 addition & 1 deletion test/core/codegen.jl
@@ -153,7 +153,7 @@ end
         return
     end
 
-    asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global}}))
+    asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global,Int32}}))
     @test !occursin(".local", asm)
 end

2 changes: 1 addition & 1 deletion test/core/device/intrinsics/math.jl
@@ -143,7 +143,7 @@ using SpecialFunctions
         @inbounds b[], c[] = @fastmath sincos(a[])
         return
     end
-    asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global}}))
+    asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global,Int32}}))
     @assert contains(asm, "sin.approx.f32")
     @assert contains(asm, "cos.approx.f32")
     @assert !contains(asm, "__nv") # from libdevice
2 changes: 1 addition & 1 deletion test/core/device/intrinsics/wmma.jl
@@ -344,7 +344,7 @@ end
         return
     end
 
-    ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global},)))
+    ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global,Int32},)))
 
     @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
     @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx)
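
As in the other tests, reflection through `CUDA.code_ptx` now has to spell out all four type parameters of `CuDeviceArray`. A sketch of that pattern (the kernel is illustrative, not from this commit):

    using CUDA

    function axpy!(y, x, a)
        i = threadIdx().x
        @inbounds y[i] += a * x[i]
        return
    end

    T = CuDeviceArray{Float32,1,CUDA.AS.Global,Int32}
    asm = sprint(io -> CUDA.code_ptx(io, axpy!, Tuple{T,T,Float32}))
    occursin("ld.global", asm)   # inspect the generated PTX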