diff --git a/Project.toml b/Project.toml
index 1907f17205..7a5cc0a0f5 100644
--- a/Project.toml
+++ b/Project.toml
@@ -23,6 +23,7 @@ Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
+PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -65,6 +66,7 @@ Libdl = "1"
 LinearAlgebra = "1"
 Logging = "1"
 NVTX = "0.3.2"
+PrecompileTools = "1.2.1"
 Preferences = "1"
 PrettyTables = "2"
 Printf = "1"
diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index c81aa48421..0874adf770 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -151,7 +151,7 @@ for A in (AS.Generic, AS.Global, AS.Shared), T in (:Int16, :UInt16)
     end
 
     intr = "atom$scope.cas.b16 \$0, [\$1], \$2, \$3;"
-    @eval @inline atomic_cas!(ptr::LLVMPtr{$T,$A}, cmp::$T, val::$T) =
+    @eval @device_function @inline atomic_cas!(ptr::LLVMPtr{$T,$A}, cmp::$T, val::$T) =
         @asmcall($intr, "=h,l,h,h", true, $T, Tuple{Core.LLVMPtr{$T,$A},$T,$T}, ptr, cmp, val)
 end
@@ -172,7 +172,7 @@ for A in (AS.Generic, AS.Global, AS.Shared)
         nb = sizeof(T)*8
         fn = Symbol("atomic_$(op)!")
         intr = "llvm.nvvm.atomic.load.$op.$nb.p$(convert(Int, A))i$nb"
-        @eval @inline $fn(ptr::LLVMPtr{$T,$A}, val::$T) =
+        @eval @device_function @inline $fn(ptr::LLVMPtr{$T,$A}, val::$T) =
            @typed_ccall($intr, llvmcall, $T, (LLVMPtr{$T,$A}, $T), ptr, val)
     end
 end
@@ -192,7 +192,7 @@ for A in (AS.Generic, AS.Global, AS.Shared), T in (:Float16,)
     end
 
     intr = "atom$scope.add.noftz.f16 \$0, [\$1], \$2;"
-    @eval @inline atomic_add!(ptr::LLVMPtr{$T,$A}, val::$T) =
+    @eval @device_function @inline atomic_add!(ptr::LLVMPtr{$T,$A}, val::$T) =
         @asmcall($intr, "=h,l,h", true, $T, Tuple{Core.LLVMPtr{$T,$A},$T}, ptr, val)
 end
diff --git a/src/device/intrinsics/cooperative_groups.jl b/src/device/intrinsics/cooperative_groups.jl
index 4d87c7070e..ab8ece277c 100644
--- a/src/device/intrinsics/cooperative_groups.jl
+++ b/src/device/intrinsics/cooperative_groups.jl
@@ -24,7 +24,7 @@ Noteworthy missing functionality:
 module CG
 
 using ..CUDA
-using ..CUDA: i32, Aligned, alignment
+using ..CUDA: i32, Aligned, alignment, @device_function
 using ..LLVM.Interop
 using ..LLVMLoopInfo
@@ -70,7 +70,7 @@ const grid_workspace = Ptr{grid_workspace_st}
     end
 end
 
-function get_grid_workspace()
+@device_function function get_grid_workspace()
     # interpret the address from envreg 1 and 2 as the driver's grid workspace
     hi = ccall("llvm.nvvm.read.ptx.sreg.envreg1", llvmcall, UInt32, ())
     lo = ccall("llvm.nvvm.read.ptx.sreg.envreg2", llvmcall, UInt32, ())
@@ -370,7 +370,7 @@ end
     return oldArrive
 end
 
-@inline function barrier_wait(gg::grid_group, token)
+@device_function @inline function barrier_wait(gg::grid_group, token)
     arrived = gg.details.barrier
 
     if is_cta_master()
@@ -548,11 +548,12 @@ end
 
 ## pipeline operations
 
-pipeline_commit() = ccall("llvm.nvvm.cp.async.commit.group", llvmcall, Cvoid, ())
+@device_function pipeline_commit() = ccall("llvm.nvvm.cp.async.commit.group", llvmcall, Cvoid, ())
 
-pipeline_wait_prior(n) =
+@device_function pipeline_wait_prior(n) =
     ccall("llvm.nvvm.cp.async.wait.group", llvmcall, Cvoid, (Int32,), n)
 
+# TODO device function?
 @generated function pipeline_memcpy_async(dst::LLVMPtr{T}, src::LLVMPtr{T}) where T
     size_and_align = sizeof(T)
     size_and_align in (4, 8, 16) || :(return error($"Unsupported size $size_and_align"))
diff --git a/src/device/intrinsics/misc.jl b/src/device/intrinsics/misc.jl
index ff8bfaaeea..bd1219a053 100644
--- a/src/device/intrinsics/misc.jl
+++ b/src/device/intrinsics/misc.jl
@@ -1,5 +1,6 @@
 export clock, nanosleep
 
+@device_functions begin
 
 """
     exit()
@@ -34,3 +35,5 @@ Puts a thread for a given amount `t`(in nanoseconds).
     @asmcall("nanosleep.u32 \$0;", "r", true, Cvoid, Tuple{UInt32}, convert(UInt32, t))
 end
+
+end
diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl
index bb06e9dd09..7d1070ff51 100644
--- a/src/device/intrinsics/synchronization.jl
+++ b/src/device/intrinsics/synchronization.jl
@@ -6,6 +6,7 @@ export sync_threads, sync_warp
 export sync_threads_count, sync_threads_and, sync_threads_or
 
+@device_functions begin
 
 """
     sync_threads()
@@ -64,7 +65,7 @@ the warp.
 
 export barrier_sync
 
-barrier_sync(id=0) = ccall("llvm.nvvm.barrier.sync", llvmcall, Cvoid, (Int32,), id)
+@inline barrier_sync(id=0) = ccall("llvm.nvvm.barrier.sync", llvmcall, Cvoid, (Int32,), id)
 
 
 ## memory barriers (membar)
@@ -107,3 +108,5 @@ host threads, and all threads in peer devices as occurring before all writes to
 memory made by the calling thread after the call to `threadfence_system()`.
 """
 @inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ())
+
+end
diff --git a/src/device/intrinsics/version.jl b/src/device/intrinsics/version.jl
index ac74dff779..714ec2d88b 100644
--- a/src/device/intrinsics/version.jl
+++ b/src/device/intrinsics/version.jl
@@ -50,7 +50,7 @@ end
 export compute_capability, ptx_isa_version
 
 for var in ["sm_major", "sm_minor", "ptx_major", "ptx_minor"]
-    @eval @inline $(Symbol(var))() =
+    @eval @device_function @inline $(Symbol(var))() =
         Base.llvmcall(
             $("""@$var = external global i32
                  define i32 @entry() #0 {
diff --git a/src/device/intrinsics/warp.jl b/src/device/intrinsics/warp.jl
index 91f16cc75c..63913c5aa7 100644
--- a/src/device/intrinsics/warp.jl
+++ b/src/device/intrinsics/warp.jl
@@ -26,7 +26,7 @@ for (name, mode, mask, offset) in (("_up", :up, UInt32(0x00), src->src),
     for (T,typ) in ((Int32, "i32"), (UInt32, "i32"), (Float32, "f32"))
         intrinsic = "llvm.nvvm.shfl.sync.$mode.$typ"
         @eval begin
-            @inline $fname(mask, val::$T, src, width=$ws) =
+            @device_function @inline $fname(mask, val::$T, src, width=$ws) =
                 ccall($intrinsic, llvmcall, $T, (UInt32, $T, UInt32, UInt32), mask, val, $(offset(:src)), pack(width, $mask))
@@ -109,7 +109,7 @@ for mode in (:all, :any, :uni)
     @eval export $fname
 
     intrinsic = "llvm.nvvm.vote.$mode.sync"
-    @eval @inline $fname(mask, pred) =
+    @eval @device_function @inline $fname(mask, pred) =
         @typed_ccall($intrinsic, llvmcall, Bool, (UInt32, Bool), mask, pred)
 end
 
@@ -119,7 +119,7 @@ for mode in (:ballot, )
     @eval export $fname
 
     intrinsic = "llvm.nvvm.vote.$mode.sync"
-    @eval @inline $fname(mask, pred) =
+    @eval @device_function @inline $fname(mask, pred) =
         @typed_ccall($intrinsic, llvmcall, UInt32, (UInt32, Bool), mask, pred)
 end
diff --git a/src/device/intrinsics/wmma.jl b/src/device/intrinsics/wmma.jl
index fe8415af40..a7d449cada 100644
--- a/src/device/intrinsics/wmma.jl
+++ b/src/device/intrinsics/wmma.jl
@@ -1,7 +1,7 @@
 export WMMA
 module WMMA
 
-using ..CUDA: AS
+using ..CUDA: AS, @device_function
 using Core: LLVMPtr
 
 ################################################################################
@@ -196,10 +196,10 @@ for ops in all_ldst_ops,
     ptr_ty = :(LLVMPtr{$arr_ty, $addr_space_int})
 
     if sz == 1
-        @eval $func_name(src_addr, stride) = tuple(ccall($ccall_name, llvmcall, $frag_ty, ($ptr_ty, Int32), src_addr, stride))
+        @eval @device_function $func_name(src_addr, stride) = tuple(ccall($ccall_name, llvmcall, $frag_ty, ($ptr_ty, Int32), src_addr, stride))
     else
         struct_ty = Symbol("LLVMStruct$sz")
-        @eval $func_name(src_addr, stride) = convert(NTuple{$sz, $frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$frag_ty}, ($ptr_ty, Int32), src_addr, stride))
+        @eval @device_function $func_name(src_addr, stride) = convert(NTuple{$sz, $frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$frag_ty}, ($ptr_ty, Int32), src_addr, stride))
     end
     @eval export $func_name
     @eval @doc (@doc llvm_wmma_load) $func_name
@@ -263,7 +263,7 @@ export llvm_wmma_store
 
     ptr_ty = :(LLVMPtr{$arr_ty, $addr_space_int})
 
-    @eval $func_name(dst_addr, data, stride) = ccall($ccall_name, llvmcall, Nothing, ($ptr_ty, $(frag_types...), Int32), dst_addr, $(frag_vars...), stride)
+    @eval @device_function $func_name(dst_addr, data, stride) = ccall($ccall_name, llvmcall, Nothing, ($ptr_ty, $(frag_types...), Int32), dst_addr, $(frag_vars...), stride)
     @eval export $func_name
     @eval @doc (@doc llvm_wmma_store) $func_name
 end
@@ -340,10 +340,10 @@ for ops in all_wmma_ops,
     c_vars = ntuple(i -> :(c[$i]), c_sz)
 
     if d_sz == 1
-        @eval $func_name(a, b, c) = tuple(ccall($ccall_name, llvmcall, $d_frag_ty, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
+        @eval @device_function $func_name(a, b, c) = tuple(ccall($ccall_name, llvmcall, $d_frag_ty, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
     else
         struct_ty = Symbol("LLVMStruct$d_sz")
-        @eval $func_name(a, b, c) = convert(NTuple{$d_sz, $d_frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$d_frag_ty}, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
+        @eval @device_function $func_name(a, b, c) = convert(NTuple{$d_sz, $d_frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$d_frag_ty}, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
     end
     @eval export $func_name
     @eval @doc (@doc llvm_wmma_mma) $func_name
diff --git a/src/device/utils.jl b/src/device/utils.jl
index c6463360ca..d30e1bdffe 100644
--- a/src/device/utils.jl
+++ b/src/device/utils.jl
@@ -16,7 +16,7 @@ end
 macro device_override(ex)
     ex = macroexpand(__module__, ex)
     esc(quote
-        Base.Experimental.@overlay(CUDA.method_table, $ex)
+        Base.Experimental.@overlay($(CUDA).method_table, $ex)
     end)
 end
 
@@ -31,7 +31,7 @@ macro device_function(ex)
 
     esc(quote
         $(combinedef(def))
-        @device_override $ex
+        $(CUDA).@device_override $ex
     end)
 end
 
@@ -47,7 +47,7 @@ macro device_functions(ex)
             push!(out.args, rewrite(arg))
         elseif Meta.isexpr(arg, [:function, :(=)])
             # rewrite function definitions
-            push!(out.args, :(@device_function $arg))
+            push!(out.args, :($(CUDA).@device_function $arg))
         else
             # preserve all the rest
             push!(out.args, arg)
diff --git a/src/precompile.jl b/src/precompile.jl
index dc1b2ce2ec..6f5a46ff04 100644
--- a/src/precompile.jl
+++ b/src/precompile.jl
@@ -14,3 +14,17 @@ precompile(run_and_collect, (Cmd,))
 precompile(cudaconvert, (Function,))
 precompile(Core.kwfunc(cudacall), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(cudacall),CuFunction,Type{Tuple{}}))
 precompile(Core.kwfunc(launch), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(launch),CuFunction))
+
+using PrecompileTools: @setup_workload, @compile_workload
+@static if VERSION >= v"1.11.0-DEV.1603"
+@setup_workload let
+    @compile_workload begin
+        target = PTXCompilerTarget(; cap=v"7.5")
+        params = CUDACompilerParams(; cap=v"7.5", ptx=v"7.5")
+        config = CompilerConfig(target, params)
+        mi = GPUCompiler.methodinstance(typeof(identity), Tuple{Nothing})
+        job = CompilerJob(mi, config)
+        GPUCompiler.code_native(devnull, job)
+    end
+end
+end
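
Usage sketch (not part of the patch): a minimal illustration of the `@device_function` / `@device_functions` pattern that the hunks above apply to the intrinsic wrappers. The names `my_laneid` and `my_warpsize` are hypothetical, and the snippet assumes it is evaluated with access to CUDA.jl's unexported macros.

using CUDA: @device_function, @device_functions

# Single definition: the real body is registered behind CUDA.method_table via
# @device_override, so it is picked up when GPUCompiler compiles device code.
@device_function @inline my_laneid() =
    ccall("llvm.nvvm.read.ptx.sreg.laneid", llvmcall, UInt32, ()) + UInt32(1)

# Block form: every `function`/`=` definition inside the block is rewritten to
# go through @device_function; all other expressions are passed through as-is.
@device_functions begin
    my_warpsize() = ccall("llvm.nvvm.read.ptx.sreg.warpsize", llvmcall, UInt32, ())
end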