Add an index typevar to CuDeviceArray.
maleadt committed Sep 11, 2023
1 parent c97bc77 commit 9b1d55d
Showing 6 changed files with 40 additions and 31 deletions.
53 changes: 30 additions & 23 deletions src/device/array.jl
@@ -6,31 +6,38 @@ export CuDeviceArray, CuDeviceVector, CuDeviceMatrix, ldg
 ## construction
 
 """
-    CuDeviceArray{T,N,A}(ptr, dims, [maxsize])
+    CuDeviceArray{T,N,A,I}(ptr, dims, [maxsize])
 
 Construct an `N`-dimensional dense CUDA device array with element type `T` wrapping a
-pointer, where `N` is determined from the length of `dims` and `T` is determined from the
-type of `ptr`. `dims` may be a single scalar, or a tuple of integers corresponding to the
-lengths in each dimension). If the rank `N` is supplied explicitly as in `Array{T,N}(dims)`,
-then it must match the length of `dims`. The same applies to the element type `T`, which
-should match the type of the pointer `ptr`.
+pointer `ptr` in address space `A`. `dims` should be a tuple of `N` integers corresponding
+to the lengths in each dimension. `maxsize` is the maximum number of bytes that can be
+stored in the array, and is determined automatically if not specified. `I` is the integer
+type used to store the size of the array, and is determined automatically if not specified.
 """
 CuDeviceArray
 
 # NOTE: we can't support the typical `tuple or series of integer` style construction,
 # because we're currently requiring a trailing pointer argument.
 
-struct CuDeviceArray{T,N,A} <: DenseArray{T,N}
+struct CuDeviceArray{T,N,A,I} <: DenseArray{T,N}
     ptr::LLVMPtr{T,A}
-    maxsize::Int
-
-    dims::Dims{N}
-    len::Int
+    maxsize::I
+
+    dims::NTuple{N,I}
+    len::I
+
+    # determine an index type based on the size of the array.
+    # this is type unstable, so only use this constructor from the host side.
+    function CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
+                                  maxsize::Integer=prod(dims)*sizeof(T)) where {T,A,N}
+        if maxsize <= typemax(Int32)
+            CuDeviceArray{T,N,A,Int32}(ptr, dims, maxsize)
+        else
+            CuDeviceArray{T,N,A,Int64}(ptr, dims, maxsize)
+        end
+    end
 
-    # inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
-    CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
-                         maxsize::Int=prod(dims)*sizeof(T)) where {T,A,N} =
-        new(ptr, maxsize, dims, prod(dims))
+    # fully typed, for use in device code
+    CuDeviceArray{T,N,A,I}(ptr::LLVMPtr{T,A}, dims::Tuple,
+                           maxsize::Integer=prod(dims)*sizeof(T)) where {T,A,N,I} =
+        new{T,N,A,I}(ptr, convert(I, maxsize), map(I, dims), convert(I, prod(dims)))
 end
 
 const CuDeviceVector = CuDeviceArray{T,1,A} where {T,A}
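
The new host-side constructor picks the narrowest index type that can address the buffer, so device code can index small arrays with 32-bit integers, which is generally cheaper on GPUs. A standalone sketch of that selection branch, using a hypothetical helper name purely for illustration:

    # Hypothetical helper mirroring the branch in the host-side constructor:
    # buffers that fit in typemax(Int32) bytes get 32-bit indices.
    index_type(maxsize::Integer) = maxsize <= typemax(Int32) ? Int32 : Int64

    index_type(1024)        # Int32: a small buffer
    index_type(4 * 2^30)    # Int64: a 4 GiB buffer exceeds typemax(Int32)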
@@ -224,18 +231,18 @@ Base.show(io::IO, mime::MIME"text/plain", a::CuDeviceArray) = show(io, a)
     end
 end
 
-function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A}) where {T,S,N,A}
+function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A,I}) where {T,S,N,A,I}
     err = GPUArrays._reinterpret_exception(T, a)
     err === nothing || throw(err)
 
     if sizeof(T) == sizeof(S) # fast case
-        return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
+        return CuDeviceArray{T,N,A,I}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
     end
 
     isize = size(a)
     size1 = div(isize[1]*sizeof(S), sizeof(T))
     osize = tuple(size1, Base.tail(isize)...)
-    return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
+    return CuDeviceArray{T,N,A,I}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
 end
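
The slow path above rescales only the leading dimension by the ratio of the element sizes. The same arithmetic in isolation, as plain Julia with no GPU required:

    # Reinterpreting a (16, 4) Float32 array as UInt8: the first dimension
    # grows by div(sizeof(Float32), sizeof(UInt8)) = 4; trailing dimensions stay.
    S, T = Float32, UInt8
    isize = (16, 4)
    size1 = div(isize[1] * sizeof(S), sizeof(T))
    osize = tuple(size1, Base.tail(isize)...)   # (64, 4)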


@@ -252,7 +259,7 @@ function Base.reshape(a::CuDeviceArray{T,M,A}, dims::NTuple{N,Int}) where {T,N,M,A}
 end
 
 # create a derived device array (reinterpreted or reshaped) that's still a CuDeviceArray
-@inline function _derived_array(a::CuDeviceArray{<:Any,<:Any,A}, ::Type{T},
-                                osize::Dims{N}) where {T, N, A}
-    return CuDeviceArray{T,N,A}(a.ptr, osize, a.maxsize)
+@inline function _derived_array(a::CuDeviceArray{<:Any,<:Any,A,I}, ::Type{T},
+                                osize::Dims{N}) where {T, N, A, I}
+    return CuDeviceArray{T,N,A,I}(a.ptr, osize, a.maxsize)
 end
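
Both `reinterpret` and `reshape` funnel through `_derived_array`, so derived views keep the parent's index type `I`. A quick host-side way to observe this (a sketch; `cudaconvert` is normally invoked by `@cuda` itself):

    using CUDA

    a  = CUDA.zeros(Float32, 16, 4)
    da = cudaconvert(a)          # device-side wrapper; small buffer, so I == Int32
    db = reshape(da, (4, 16))    # goes through _derived_array
    typeof(db)                   # still carries Int32 as its index type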
6 changes: 4 additions & 2 deletions src/device/intrinsics/memory_shared.jl
@@ -16,7 +16,8 @@ generator function will be called dynamically.
     # NOTE: this relies on const-prop to forward the literal length to the generator.
     # maybe we should include the size in the type, like StaticArrays does?
     ptr = emit_shmem(T, Val(len))
-    CuDeviceArray{T,N,AS.Shared}(ptr, dims)
+    # XXX: 4GB ought to be enough shared memory for anybody
+    CuDeviceArray{T,N,AS.Shared,Int32}(ptr, dims)
 end
 CuStaticSharedArray(::Type{T}, len::Integer) where {T} = CuStaticSharedArray(T, (len,))
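
Hard-coding `Int32` is safe here because shared memory per block is at most a few hundred kilobytes on current GPUs, far below `typemax(Int32)` bytes. A minimal usage sketch (the kernel is illustrative, not part of this commit, and assumes a working CUDA.jl setup):

    using CUDA

    function reverse_block!(out)
        buf = CuStaticSharedArray(Float32, 32)   # Int32-indexed device array
        i = threadIdx().x
        @inbounds buf[i] = Float32(i)
        sync_threads()
        @inbounds out[i] = buf[33 - i]
        return
    end

    out = CUDA.zeros(Float32, 32)
    @cuda threads=32 reverse_block!(out)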

@@ -53,7 +54,8 @@ shared memory; in the case of a homogeneous multi-part buffer it is preferred to
         end
     end
     ptr = emit_shmem(T) + offset
-    CuDeviceArray{T,N,AS.Shared}(ptr, dims)
+    # XXX: 4GB ought to be enough shared memory for anybody
+    CuDeviceArray{T,N,AS.Shared,Int32}(ptr, dims)
 end
 Base.@propagate_inbounds CuDynamicSharedArray(::Type{T}, len::Integer, offset) where {T} =
     CuDynamicSharedArray(T, (len,), offset)
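
Unlike the static variant, `CuDynamicSharedArray` draws on the dynamically sized shared-memory segment, whose byte size is supplied at launch time through the `shmem` keyword. A usage sketch (illustrative kernel, not from this commit):

    using CUDA

    function scale!(out, a)
        buf = CuDynamicSharedArray(Float32, length(out))
        i = threadIdx().x
        @inbounds buf[i] = a * out[i]
        sync_threads()
        @inbounds out[i] = buf[i]
        return
    end

    out = CUDA.rand(Float32, 32)
    @cuda threads=32 shmem=32*sizeof(Float32) scale!(out, 2f0)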
6 changes: 3 additions & 3 deletions src/device/random.jl
@@ -22,7 +22,7 @@ import RandomNumbers
         }
         attributes #0 = { alwaysinline }
         """, "entry"), LLVMPtr{UInt32, AS.Shared}, Tuple{})
-    CuDeviceArray{UInt32,1,AS.Shared}(ptr, (32,))
+    CuDeviceArray{UInt32,1,AS.Shared,Int32}(ptr, (32,))
 end
 
 # array with per-warp counters, incremented when generating numbers
@@ -36,7 +36,7 @@ end
         }
         attributes #0 = { alwaysinline }
         """, "entry"), LLVMPtr{UInt32, AS.Shared}, Tuple{})
-    CuDeviceArray{UInt32,1,AS.Shared}(ptr, (32,))
+    CuDeviceArray{UInt32,1,AS.Shared,Int32}(ptr, (32,))
 end
 
 # initialization function, called automatically at the start of each kernel because
@@ -192,7 +192,7 @@ end
 for var in [:ki, :wi, :fi, :ke, :we, :fe]
     val = getfield(Random, var)
     gpu_var = Symbol("gpu_$var")
-    arr_typ = :(CuDeviceArray{$(eltype(val)),$(ndims(val)),AS.Constant})
+    arr_typ = :(CuDeviceArray{$(eltype(val)),$(ndims(val)),AS.Constant,Int32})
     @eval @inline @generated function $gpu_var()
         ptr = emit_constant_array($(QuoteNode(var)), $val)
         Expr(:call, $arr_typ, ptr, $(size(val)))
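
The loop above generates one accessor per Ziggurat table, splicing the fully parameterized array type into the generated expression. The type-construction step in isolation (plain Julia; `val` is a stand-in for one of the `Random` tables):

    val = rand(Float64, 256)   # stand-in for one of the tables, e.g. Random.ki
    arr_typ = :(CuDeviceArray{$(eltype(val)),$(ndims(val)),AS.Constant,Int32})
    # arr_typ now spells out CuDeviceArray{Float64,1,AS.Constant,Int32}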
2 changes: 1 addition & 1 deletion test/core/codegen.jl
@@ -153,7 +153,7 @@ end
         return
     end
 
-    asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global}}))
+    asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global,Int32}}))
     @test !occursin(".local", asm)
 end

2 changes: 1 addition & 1 deletion test/core/device/intrinsics/math.jl
@@ -143,7 +143,7 @@ using SpecialFunctions
         @inbounds b[], c[] = @fastmath sincos(a[])
         return
     end
-    asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global}}))
+    asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global,Int32}}))
     @assert contains(asm, "sin.approx.f32")
     @assert contains(asm, "cos.approx.f32")
     @assert !contains(asm, "__nv") # from libdevice
2 changes: 1 addition & 1 deletion test/core/device/intrinsics/wmma.jl
@@ -344,7 +344,7 @@ end
         return
     end
 
-    ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global},)))
+    ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global,Int32},)))
 
     @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
     @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx)
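
As in the other tests, reflection through `CUDA.code_ptx` now has to spell out all four type parameters of `CuDeviceArray`. A sketch of that pattern (the kernel is illustrative, not from this commit):

    using CUDA

    function axpy!(y, x, a)
        i = threadIdx().x
        @inbounds y[i] += a * x[i]
        return
    end

    T = CuDeviceArray{Float32,1,CUDA.AS.Global,Int32}
    asm = sprint(io -> CUDA.code_ptx(io, axpy!, Tuple{T,T,Float32}))
    occursin("ld.global", asm)   # inspect the generated PTX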