WIP: Add an index typevar to CuDeviceArray. #1895

Draft · wants to merge 5 commits into base: master
32 changes: 16 additions & 16 deletions lib/cusparse/device.jl
@@ -11,9 +11,9 @@ using SparseArrays
export CuSparseDeviceVector, CuSparseDeviceMatrixCSC, CuSparseDeviceMatrixCSR,
CuSparseDeviceMatrixBSR, CuSparseDeviceMatrixCOO

struct CuSparseDeviceVector{Tv,Ti, A} <: AbstractSparseVector{Tv,Ti}
iPtr::CuDeviceVector{Ti, A}
nzVal::CuDeviceVector{Tv, A}
struct CuSparseDeviceVector{Tv,Ti,A} <: AbstractSparseVector{Tv,Ti}
iPtr::CuDeviceVector{Ti,A,Ti}
nzVal::CuDeviceVector{Tv,A,Ti}
len::Int
nnz::Ti
end
@@ -23,9 +23,9 @@ Base.size(g::CuSparseDeviceVector) = (g.len,)
SparseArrays.nnz(g::CuSparseDeviceVector) = g.nnz

struct CuSparseDeviceMatrixCSC{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti}
colPtr::CuDeviceVector{Ti, A}
rowVal::CuDeviceVector{Ti, A}
nzVal::CuDeviceVector{Tv, A}
colPtr::CuDeviceVector{Ti,A,Ti}
rowVal::CuDeviceVector{Ti,A,Ti}
nzVal::CuDeviceVector{Tv,A,Ti}
dims::NTuple{2,Int}
nnz::Ti
end
@@ -35,10 +35,10 @@ Base.size(g::CuSparseDeviceMatrixCSC) = g.dims
SparseArrays.nnz(g::CuSparseDeviceMatrixCSC) = g.nnz

struct CuSparseDeviceMatrixCSR{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti}
rowPtr::CuDeviceVector{Ti, A}
colVal::CuDeviceVector{Ti, A}
nzVal::CuDeviceVector{Tv, A}
dims::NTuple{2, Int}
rowPtr::CuDeviceVector{Ti,A,Ti}
colVal::CuDeviceVector{Ti,A,Ti}
nzVal::CuDeviceVector{Tv,A,Ti}
dims::NTuple{2,Int}
nnz::Ti
end

@@ -47,9 +47,9 @@ Base.size(g::CuSparseDeviceMatrixCSR) = g.dims
SparseArrays.nnz(g::CuSparseDeviceMatrixCSR) = g.nnz

struct CuSparseDeviceMatrixBSR{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti}
rowPtr::CuDeviceVector{Ti, A}
colVal::CuDeviceVector{Ti, A}
nzVal::CuDeviceVector{Tv, A}
rowPtr::CuDeviceVector{Ti,A,Ti}
colVal::CuDeviceVector{Ti,A,Ti}
nzVal::CuDeviceVector{Tv,A,Ti}
dims::NTuple{2,Int}
blockDim::Ti
dir::Char
@@ -61,9 +61,9 @@ Base.size(g::CuSparseDeviceMatrixBSR) = g.dims
SparseArrays.nnz(g::CuSparseDeviceMatrixBSR) = g.nnz

struct CuSparseDeviceMatrixCOO{Tv,Ti,A} <: AbstractSparseMatrix{Tv,Ti}
rowInd::CuDeviceVector{Ti, A}
colInd::CuDeviceVector{Ti, A}
nzVal::CuDeviceVector{Tv, A}
rowInd::CuDeviceVector{Ti,A,Ti}
colInd::CuDeviceVector{Ti,A,Ti}
nzVal::CuDeviceVector{Tv,A,Ti}
dims::NTuple{2,Int}
nnz::Ti
end
11 changes: 11 additions & 0 deletions src/broadcast.jl
@@ -19,3 +19,14 @@ BroadcastStyle(::CUDA.CuArrayStyle{N, B1},
# allocation of output arrays
Base.similar(bc::Broadcasted{CuArrayStyle{N,B}}, ::Type{T}, dims) where {T,N,B} =
similar(CuArray{T,length(dims),B}, dims)

# Base.Broadcast can't handle Int32 axes
# XXX: not using a quirk, as constprop/irinterpret is crucial here
# XXX: 1.11 uses to_index instead of CartesianIndex
Base.@propagate_inbounds Broadcast.newindex(arg::AnyCuDeviceArray, I::CartesianIndex) = CartesianIndex(_newindex(axes(arg), I.I))
Base.@propagate_inbounds Broadcast.newindex(arg::AnyCuDeviceArray, I::Integer) = CartesianIndex(_newindex(axes(arg), (I,)))
Base.@propagate_inbounds _newindex(ax::Tuple, I::Tuple) = # XXX: upstream this?
(ifelse(length(ax[1]) == 1, promote(ax[1][1], I[1])...), _newindex(Base.tail(ax), Base.tail(I))...)
Base.@propagate_inbounds _newindex(ax::Tuple{}, I::Tuple) = ()
Base.@propagate_inbounds _newindex(ax::Tuple, I::Tuple{}) = (ax[1][1], _newindex(Base.tail(ax), ())...)
Base.@propagate_inbounds _newindex(ax::Tuple{}, I::Tuple{}) = ()
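For context, here is a minimal host-side sketch of the remapping these helpers perform; the `newindex_sketch` name and the axis/index values are made up for illustration and are not part of the change:

```julia
# Sketch: broadcast index remapping over plain tuples. A length-1 (broadcasted)
# dimension collapses to its first index, otherwise the incoming index is kept;
# promote() keeps the result at one consistent integer type even with Int32 axes.
newindex_sketch(ax::Tuple{}, I::Tuple{}) = ()
newindex_sketch(ax::Tuple, I::Tuple) =
    (ifelse(length(ax[1]) == 1, promote(ax[1][1], I[1])...),
     newindex_sketch(Base.tail(ax), Base.tail(I))...)

ax = (Base.OneTo(Int32(1)), Base.OneTo(Int32(8)))  # first dimension is broadcasted
newindex_sketch(ax, (5, 3))                         # == (1, 3)
```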
61 changes: 37 additions & 24 deletions src/device/array.jl
@@ -6,36 +6,49 @@ export CuDeviceArray, CuDeviceVector, CuDeviceMatrix, ldg
## construction

"""
CuDeviceArray{T,N,A}(ptr, dims, [maxsize])
CuDeviceArray{T,N,A,I}(ptr, dims, [maxsize])

Construct an `N`-dimensional dense CUDA device array with element type `T` wrapping a
pointer, where `N` is determined from the length of `dims` and `T` is determined from the
type of `ptr`. `dims` may be a single scalar, or a tuple of integers corresponding to the
lengths in each dimension). If the rank `N` is supplied explicitly as in `Array{T,N}(dims)`,
then it must match the length of `dims`. The same applies to the element type `T`, which
should match the type of the pointer `ptr`.
pointer `ptr` in address space `A`. `dims` should be a tuple of `N` integers corresponding
to the lengths in each dimension. `maxsize` is the maximum number of bytes that can be
stored in the array, and is determined automatically if not specified. `I` is the integer
type used to store the size of the array, and is determined automatically if not specified.
"""
CuDeviceArray

# NOTE: we can't support the typical `tuple or series of integer` style construction,
# because we're currently requiring a trailing pointer argument.

struct CuDeviceArray{T,N,A} <: DenseArray{T,N}
struct CuDeviceArray{T,N,A,I} <: DenseArray{T,N}
ptr::LLVMPtr{T,A}
maxsize::Int

dims::Dims{N}
len::Int
maxsize::I

dims::NTuple{N,I}
len::I

# determine an index type based on the size of the array.
# this is type unstable, so only use this constructor from the host side.
function CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
maxsize::Integer=prod(dims)*sizeof(T)) where {T,A,N}
if maxsize <= typemax(Int32)
CuDeviceArray{T,N,A,Int32}(ptr, dims, maxsize)
else
CuDeviceArray{T,N,A,Int64}(ptr, dims, maxsize)
end
end

# inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
CuDeviceArray{T,N,A}(ptr::LLVMPtr{T,A}, dims::Tuple,
maxsize::Int=prod(dims)*sizeof(T)) where {T,A,N} =
new(ptr, maxsize, dims, prod(dims))
# fully typed, for use in device code
CuDeviceArray{T,N,A,I}(ptr::LLVMPtr{T,A}, dims::Tuple,
maxsize::Integer=prod(dims)*sizeof(T)) where {T,A,N,I} =
new{T,N,A,I}(ptr, convert(I, maxsize), map(I, dims), convert(I, prod(dims)))
end

const CuDeviceVector = CuDeviceArray{T,1,A} where {T,A}
const CuDeviceMatrix = CuDeviceArray{T,2,A} where {T,A}

# anything that's (secretly) backed by a CuArray
const AnyCuDeviceArray{T,N} = Union{CuDeviceArray{T,N}, WrappedArray{T,N,CuDeviceArray,CuDeviceArray{T,N}}}
const AnyCuDeviceVector{T} = AnyCuDeviceArray{T,1}
const AnyCuDeviceMatrix{T} = AnyCuDeviceArray{T,2}
const AnyCuDeviceVecOrMat{T} = Union{AnyCuDeviceVector{T}, AnyCuDeviceMatrix{T}}
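The index type is picked on the host, so kernels always see a concrete `I`. A rough sketch of the selection rule used by the convenience constructor above (the `pick_index_type` helper is hypothetical, for illustration only):

```julia
# Sketch: arrays whose byte size fits in an Int32 get Int32 indices, larger ones Int64.
pick_index_type(dims, ::Type{T}) where {T} =
    prod(dims) * sizeof(T) <= typemax(Int32) ? Int32 : Int64

pick_index_type((1024,), Float32)   # Int32 — 4 KiB fits comfortably
pick_index_type((2^30,), Float32)   # Int64 — 4 GiB exceeds typemax(Int32) bytes
```

The inner constructor then converts `maxsize`, `dims`, and the length to `I`, so size- and index-related arithmetic in device code can stay 32-bit in the common case.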


## array interface

@@ -224,18 +237,18 @@ Base.show(io::IO, mime::MIME"text/plain", a::CuDeviceArray) = show(io, a)
end
end

function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A}) where {T,S,N,A}
function Base.reinterpret(::Type{T}, a::CuDeviceArray{S,N,A,I}) where {T,S,N,A,I}
err = GPUArrays._reinterpret_exception(T, a)
err === nothing || throw(err)

if sizeof(T) == sizeof(S) # fast case
return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
return CuDeviceArray{T,N,A,I}(reinterpret(LLVMPtr{T,A}, a.ptr), size(a), a.maxsize)
end

isize = size(a)
size1 = div(isize[1]*sizeof(S), sizeof(T))
osize = tuple(size1, Base.tail(isize)...)
return CuDeviceArray{T,N,A}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
return CuDeviceArray{T,N,A,I}(reinterpret(LLVMPtr{T,A}, a.ptr), osize, a.maxsize)
end


@@ -252,7 +265,7 @@ function Base.reshape(a::CuDeviceArray{T,M,A}, dims::NTuple{N,Int}) where {T,N,M
end

# create a derived device array (reinterpreted or reshaped) that's still a CuDeviceArray
@inline function _derived_array(a::CuDeviceArray{<:Any,<:Any,A}, ::Type{T},
osize::Dims{N}) where {T, N, A}
return CuDeviceArray{T,N,A}(a.ptr, osize, a.maxsize)
@inline function _derived_array(a::CuDeviceArray{<:Any,<:Any,A,I}, ::Type{T},
osize::Dims{N}) where {T, N, A, I}
return CuDeviceArray{T,N,A,I}(a.ptr, osize, a.maxsize)
end
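A hypothetical kernel sketch of the consequence: device-side `reinterpret` and `reshape` go through `_derived_array`, so the derived array keeps the original index type parameter `I` (the kernel and data below are purely illustrative):

```julia
using CUDA

function derived_kernel(a)                # a::CuDeviceArray{Float32,1,AS.Global,I}
    b = reinterpret(UInt32, a)            # same I as `a`
    c = reshape(b, (2, length(b) ÷ 2))    # likewise
    @inbounds c[1, 1] = 0x3f800000        # bit pattern of 1.0f0
    return
end

x = CUDA.zeros(Float32, 8)
@cuda derived_kernel(x)
```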
6 changes: 4 additions & 2 deletions src/device/intrinsics/shared_memory.jl
@@ -16,7 +16,8 @@ generator function will be called dynamically.
# NOTE: this relies on const-prop to forward the literal length to the generator.
# maybe we should include the size in the type, like StaticArrays does?
ptr = emit_shmem(T, Val(len))
CuDeviceArray{T,N,AS.Shared}(ptr, dims)
# XXX: 4GB ought to be enough shared memory for anybody
CuDeviceArray{T,N,AS.Shared,Int32}(ptr, dims)
end
CuStaticSharedArray(::Type{T}, len::Integer) where {T} = CuStaticSharedArray(T, (len,))

@@ -53,7 +54,8 @@ shared memory; in the case of a homogeneous multi-part buffer it is preferred to
end
end
ptr = emit_shmem(T) + offset
CuDeviceArray{T,N,AS.Shared}(ptr, dims)
# XXX: 4GB ought to be enough shared memory for anybody
CuDeviceArray{T,N,AS.Shared,Int32}(ptr, dims)
end
Base.@propagate_inbounds CuDynamicSharedArray(::Type{T}, len::Integer, offset) where {T} =
CuDynamicSharedArray(T, (len,), offset)
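For context, a usage sketch: with this change both static and dynamic shared-memory arrays are indexed with `Int32`, which is safe since shared memory per block is far below 4 GB (the reversal kernel below is illustrative only):

```julia
using CUDA

function reverse_kernel(a)
    buf = CuStaticSharedArray(Float32, 64)   # CuDeviceArray{Float32,1,AS.Shared,Int32}
    i = threadIdx().x
    @inbounds buf[i] = a[i]
    sync_threads()
    @inbounds a[i] = buf[65 - i]
    return
end

a = CUDA.rand(Float32, 64)
@cuda threads=64 reverse_kernel(a)
```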
6 changes: 3 additions & 3 deletions src/device/random.jl
@@ -22,7 +22,7 @@ import RandomNumbers
}
attributes #0 = { alwaysinline }
""", "entry"), LLVMPtr{UInt32, AS.Shared}, Tuple{})
CuDeviceArray{UInt32,1,AS.Shared}(ptr, (32,))
CuDeviceArray{UInt32,1,AS.Shared,Int32}(ptr, (32,))
end

# array with per-warp counters, incremented when generating numbers
@@ -36,7 +36,7 @@ end
}
attributes #0 = { alwaysinline }
""", "entry"), LLVMPtr{UInt32, AS.Shared}, Tuple{})
CuDeviceArray{UInt32,1,AS.Shared}(ptr, (32,))
CuDeviceArray{UInt32,1,AS.Shared,Int32}(ptr, (32,))
end

# initialization function, called automatically at the start of each kernel because
@@ -204,7 +204,7 @@ end
for var in [:ki, :wi, :fi, :ke, :we, :fe]
val = getfield(Random, var)
gpu_var = Symbol("gpu_$var")
arr_typ = :(CuDeviceArray{$(eltype(val)),$(ndims(val)),AS.Constant})
arr_typ = :(CuDeviceArray{$(eltype(val)),$(ndims(val)),AS.Constant,Int32})
@eval @inline @generated function $gpu_var()
ptr = emit_constant_array($(QuoteNode(var)), $val)
Expr(:call, $arr_typ, ptr, $(size(val)))
2 changes: 1 addition & 1 deletion test/core/codegen.jl
@@ -153,7 +153,7 @@ end
return
end

asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global}}))
asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{2,CuDeviceArray{Float32,1,AS.Global,Int32}}))
@test !occursin(".local", asm)
end

2 changes: 1 addition & 1 deletion test/core/device/intrinsics/cooperative_groups.jl
@@ -45,7 +45,7 @@ if capability(device()) >= v"6.0" && attribute(device(), CUDA.DEVICE_ATTRIBUTE_C
# (the occupancy API could be used to calculate how many blocks can fit per SM,
# but that doesn't matter for the tests, so we assume a single block per SM.)
maxBlocks = attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)
kernel = cufunction(kernel_vadd, NTuple{3, CuDeviceArray{Float32,2,AS.Global}})
kernel = cufunction(kernel_vadd, NTuple{3, CuDeviceArray{Float32,2,AS.Global,Int}})
maxThreads = CUDA.maxthreads(kernel)

a = rand(Float32, maxBlocks, maxThreads)
2 changes: 1 addition & 1 deletion test/core/device/intrinsics/math.jl
@@ -143,7 +143,7 @@ using SpecialFunctions
@inbounds b[], c[] = @fastmath sincos(a[])
return
end
asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global}}))
asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global,Int32}}))
@assert contains(asm, "sin.approx.f32")
@assert contains(asm, "cos.approx.f32")
@assert !contains(asm, "__nv") # from libdevice
2 changes: 1 addition & 1 deletion test/core/device/intrinsics/wmma.jl
@@ -342,7 +342,7 @@ end
return
end

ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global},)))
ptx = sprint(io -> CUDA.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDA.AS.Global,Int32},)))

@test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
@test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx)