From fa168ef2695cf3c0e69b6ef54d91d62fcb126ae6 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Mon, 15 Apr 2024 13:24:31 -0400
Subject: [PATCH 1/8] Use PrecompileTools to warmup CUDA.jl

---
 Project.toml      |  2 ++
 src/precompile.jl | 14 ++++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/Project.toml b/Project.toml
index db564395a7..47217fe272 100644
--- a/Project.toml
+++ b/Project.toml
@@ -23,6 +23,7 @@ Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
+PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -65,6 +66,7 @@ Libdl = "1"
 LinearAlgebra = "1"
 Logging = "1"
 NVTX = "0.3.2"
+PrecompileTools = "1.2.1"
 Preferences = "1"
 PrettyTables = "2"
 Printf = "1"
diff --git a/src/precompile.jl b/src/precompile.jl
index dc1b2ce2ec..8cf8223f51 100644
--- a/src/precompile.jl
+++ b/src/precompile.jl
@@ -14,3 +14,17 @@ precompile(run_and_collect, (Cmd,))
 precompile(cudaconvert, (Function,))
 precompile(Core.kwfunc(cudacall), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(cudacall),CuFunction,Type{Tuple{}}))
 precompile(Core.kwfunc(launch), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(launch),CuFunction))
+
+@static if VERSION >= v"1.11.-"
+using PrecompileTools: @setup_workload, @compile_workload
+@setup_workload let
+    @compile_workload begin
+        target = PTXCompilerTarget(; cap=v"7.5")
+        params = CUDACompilerParams(; cap=v"7.5", ptx=v"7.5")
+        config = CompilerConfig(target, params)
+        mi = GPUCompiler.methodinstance(typeof(identity), Tuple{Nothing})
+        job = CompilerJob(mi, config)
+        GPUCompiler.code_native(devnull, job)
+    end
+end
+end

From d36c3022a4cefdb555ef309a62bc89e9c206ec83 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Wed, 17 Apr 2024 16:50:20 -0400
Subject: [PATCH 2/8] fixup! Use PrecompileTools to warmup CUDA.jl

---
 src/precompile.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/precompile.jl b/src/precompile.jl
index 8cf8223f51..d765847840 100644
--- a/src/precompile.jl
+++ b/src/precompile.jl
@@ -15,7 +15,7 @@ precompile(cudaconvert, (Function,))
 precompile(Core.kwfunc(cudacall), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(cudacall),CuFunction,Type{Tuple{}}))
 precompile(Core.kwfunc(launch), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(launch),CuFunction))
 
-@static if VERSION >= v"1.11.-"
+@static if VERSION >= v"1.11.0-DEV.1603"
 using PrecompileTools: @setup_workload, @compile_workload
 @setup_workload let
     @compile_workload begin

From 80ec8695576b8b5ef24df67d2a0b7f6df1f09828 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Wed, 17 Apr 2024 16:51:41 -0400
Subject: [PATCH 3/8] fixup! Use PrecompileTools to warmup CUDA.jl

---
 src/precompile.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/precompile.jl b/src/precompile.jl
index d765847840..6f5a46ff04 100644
--- a/src/precompile.jl
+++ b/src/precompile.jl
@@ -15,8 +15,8 @@ precompile(cudaconvert, (Function,))
 precompile(Core.kwfunc(cudacall), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(cudacall),CuFunction,Type{Tuple{}}))
 precompile(Core.kwfunc(launch), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(launch),CuFunction))
 
-@static if VERSION >= v"1.11.0-DEV.1603"
 using PrecompileTools: @setup_workload, @compile_workload
+@static if VERSION >= v"1.11.0-DEV.1603"
 @setup_workload let
     @compile_workload begin
         target = PTXCompilerTarget(; cap=v"7.5")

From 09efa88cfcc415ad0d5ded0f1ce98d766f1a1ca2 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Fri, 19 Apr 2024 14:19:02 -0400
Subject: [PATCH 7/8] Mark more methods as device methods

---
 src/device/intrinsics/atomics.jl            |  6 +++---
 src/device/intrinsics/cooperative_groups.jl |  9 +++++----
 src/device/intrinsics/indexing.jl           |  8 ++++----
 src/device/intrinsics/misc.jl               |  8 ++++----
 src/device/intrinsics/synchronization.jl    | 18 +++++++++---------
 src/device/intrinsics/version.jl            |  2 +-
 src/device/intrinsics/warp.jl               |  6 +++---
 src/device/intrinsics/wmma.jl               | 10 +++++-----
 8 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index c81aa48421..0874adf770 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -151,7 +151,7 @@ for A in (AS.Generic, AS.Global, AS.Shared), T in (:Int16, :UInt16)
     end
 
     intr = "atom$scope.cas.b16 \$0, [\$1], \$2, \$3;"
-    @eval @inline atomic_cas!(ptr::LLVMPtr{$T,$A}, cmp::$T, val::$T) =
+    @eval @device_function @inline atomic_cas!(ptr::LLVMPtr{$T,$A}, cmp::$T, val::$T) =
         @asmcall($intr, "=h,l,h,h", true, $T,
                  Tuple{Core.LLVMPtr{$T,$A},$T,$T}, ptr, cmp, val)
 end
@@ -172,7 +172,7 @@ for A in (AS.Generic, AS.Global, AS.Shared)
         nb = sizeof(T)*8
         fn = Symbol("atomic_$(op)!")
         intr = "llvm.nvvm.atomic.load.$op.$nb.p$(convert(Int, A))i$nb"
-        @eval @inline $fn(ptr::LLVMPtr{$T,$A}, val::$T) =
+        @eval @device_function @inline $fn(ptr::LLVMPtr{$T,$A}, val::$T) =
             @typed_ccall($intr, llvmcall, $T, (LLVMPtr{$T,$A}, $T), ptr, val)
     end
 end
@@ -192,7 +192,7 @@ for A in (AS.Generic, AS.Global, AS.Shared), T in (:Float16,)
     end
 
     intr = "atom$scope.add.noftz.f16 \$0, [\$1], \$2;"
-    @eval @inline atomic_add!(ptr::LLVMPtr{$T,$A}, val::$T) =
+    @eval @device_function @inline atomic_add!(ptr::LLVMPtr{$T,$A}, val::$T) =
         @asmcall($intr, "=h,l,h", true, $T,
                 Tuple{Core.LLVMPtr{$T,$A},$T}, ptr, val)
 end
diff --git a/src/device/intrinsics/cooperative_groups.jl b/src/device/intrinsics/cooperative_groups.jl
index 4d87c7070e..deddac3cb5 100644
--- a/src/device/intrinsics/cooperative_groups.jl
+++ b/src/device/intrinsics/cooperative_groups.jl
@@ -70,7 +70,7 @@ const grid_workspace = Ptr{grid_workspace_st}
     end
 end
 
-function get_grid_workspace()
+@device_function function get_grid_workspace()
     # interpret the address from envreg 1 and 2 as the driver's grid workspace
     hi = ccall("llvm.nvvm.read.ptx.sreg.envreg1", llvmcall, UInt32, ())
     lo = ccall("llvm.nvvm.read.ptx.sreg.envreg2", llvmcall, UInt32, ())
@@ -370,7 +370,7 @@ end
     return oldArrive
 end
 
-@inline function barrier_wait(gg::grid_group, token)
+@device_function @inline function barrier_wait(gg::grid_group, token)
     arrived = gg.details.barrier
 
     if is_cta_master()
@@ -548,11 +548,12 @@ end
 
 ## pipeline operations
 
-pipeline_commit() = ccall("llvm.nvvm.cp.async.commit.group", llvmcall, Cvoid, ())
+@device_function pipeline_commit() = ccall("llvm.nvvm.cp.async.commit.group", llvmcall, Cvoid, ())
 
-pipeline_wait_prior(n) =
+@device_function pipeline_wait_prior(n) =
     ccall("llvm.nvvm.cp.async.wait.group", llvmcall, Cvoid, (Int32,), n)
 
+# TODO device function?
 @generated function pipeline_memcpy_async(dst::LLVMPtr{T}, src::LLVMPtr{T}) where T
     size_and_align = sizeof(T)
     size_and_align in (4, 8, 16) || :(return error($"Unsupported size $size_and_align"))
diff --git a/src/device/intrinsics/indexing.jl b/src/device/intrinsics/indexing.jl
index 7677c553f5..e03568ffdb 100644
--- a/src/device/intrinsics/indexing.jl
+++ b/src/device/intrinsics/indexing.jl
@@ -96,14 +96,14 @@ Returns the thread index within the block.
 
 Returns the warp size (in threads).
 """
-@inline warpsize() = ccall("llvm.nvvm.read.ptx.sreg.warpsize", llvmcall, Int32, ())
+@device_function @inline warpsize() = ccall("llvm.nvvm.read.ptx.sreg.warpsize", llvmcall, Int32, ())
 
 """
     laneid()::Int32
 
 Returns the thread's lane within the warp.
 """
-@inline laneid() = ccall("llvm.nvvm.read.ptx.sreg.laneid", llvmcall, Int32, ()) + 1i32
+@device_function @inline laneid() = ccall("llvm.nvvm.read.ptx.sreg.laneid", llvmcall, Int32, ()) + 1i32
 
 """
     lanemask(pred)::UInt32
@@ -111,7 +111,7 @@ Returns the thread's lane within the warp.
 Returns a 32-bit mask indicating which threads in a warp satisfy the given
 predicate. Supported predicates are `==`, `<`, `<=`, `>=`, and `>`.
 """
-@inline function lanemask(pred::F) where F
+@device_function @inline function lanemask(pred::F) where F
     if pred === Base.:(==)
         ccall("llvm.nvvm.read.ptx.sreg.lanemask.eq", llvmcall, UInt32, ())
     elseif pred === Base.:(<)
@@ -133,7 +133,7 @@ end
 
 Returns a 32-bit mask indicating which threads in a warp are active with the current
 executing thread.
 """
-@inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", false, UInt32, Tuple{})
+@device_function @inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", false, UInt32, Tuple{})
 
 end
diff --git a/src/device/intrinsics/misc.jl b/src/device/intrinsics/misc.jl
index ff8bfaaeea..8184ee1d67 100644
--- a/src/device/intrinsics/misc.jl
+++ b/src/device/intrinsics/misc.jl
@@ -5,21 +5,21 @@ export clock, nanosleep
 
 Terminate a thread.
 """
-exit() = @asmcall("exit;")
+@device_function exit() = @asmcall("exit;")
 
 """
     clock(UInt32)
 
 Returns the value of a per-multiprocessor counter that is incremented every clock cycle.
 """
-clock(::Type{UInt32}) = ccall("llvm.nvvm.read.ptx.sreg.clock", llvmcall, UInt32, ())
+@device_function clock(::Type{UInt32}) = ccall("llvm.nvvm.read.ptx.sreg.clock", llvmcall, UInt32, ())
 
 """
    clock(UInt64)
 
 Returns the value of a per-multiprocessor counter that is incremented every clock cycle.
 """
-clock(::Type{UInt64}) = ccall("llvm.nvvm.read.ptx.sreg.clock64", llvmcall, UInt64, ())
+@device_function clock(::Type{UInt64}) = ccall("llvm.nvvm.read.ptx.sreg.clock64", llvmcall, UInt64, ())
 
 
 """
@@ -30,7 +30,7 @@ Puts a thread for a given amount `t`(in nanoseconds).
 
 !!! note
     Requires CUDA >= 10.0 and sm_6.2
 """
-@inline function nanosleep(t::Unsigned)
+@device_function @inline function nanosleep(t::Unsigned)
     @asmcall("nanosleep.u32 \$0;", "r", true, Cvoid, Tuple{UInt32}, convert(UInt32, t))
 end
diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl
index bb06e9dd09..4e25e94366 100644
--- a/src/device/intrinsics/synchronization.jl
+++ b/src/device/intrinsics/synchronization.jl
@@ -13,7 +13,7 @@ Waits until all threads in the thread block have reached this point and all global and
 shared memory accesses made by these threads prior to `sync_threads()` are visible to
 all threads in the block.
 """
-@inline sync_threads() = ccall("llvm.nvvm.barrier0", llvmcall, Cvoid, ())
+@device_function @inline sync_threads() = ccall("llvm.nvvm.barrier0", llvmcall, Cvoid, ())
 
 """
     sync_threads_count(predicate)
@@ -22,7 +22,7 @@ Identical to `sync_threads()` with the additional feature that it evaluates predicate for
 all threads of the block and returns the number of threads for which `predicate`
 evaluates to true.
 """
-@inline sync_threads_count(predicate) =
+@device_function @inline sync_threads_count(predicate) =
     ccall("llvm.nvvm.barrier0.popc", llvmcall, Int32, (Int32,), predicate)
 
 """
     sync_threads_and(predicate)
@@ -32,7 +32,7 @@ Identical to `sync_threads()` with the additional feature that it evaluates predicate for
 all threads of the block and returns `true` if and only if `predicate` evaluates to `true`
 for all of them.
 """
-@inline sync_threads_and(predicate) =
+@device_function @inline sync_threads_and(predicate) =
     ccall("llvm.nvvm.barrier0.and", llvmcall, Int32, (Int32,), predicate) != Int32(0)
 
 """
@@ -42,7 +42,7 @@ Identical to `sync_threads()` with the additional feature that it evaluates predicate for
 all threads of the block and returns `true` if and only if `predicate` evaluates to `true`
 for any of them.
 """
-@inline sync_threads_or(predicate) =
+@device_function @inline sync_threads_or(predicate) =
     ccall("llvm.nvvm.barrier0.or", llvmcall, Int32, (Int32,), predicate) != Int32(0)
 
 """
@@ -56,7 +56,7 @@ the warp.
 
 !!! note
     Requires CUDA >= 9.0 and sm_6.2
 """
-@inline sync_warp(mask=FULL_MASK) =
+@device_function @inline sync_warp(mask=FULL_MASK) =
     ccall("llvm.nvvm.bar.warp.sync", llvmcall, Cvoid, (UInt32,), mask)
 
@@ -64,7 +64,7 @@ the warp.
 
 export barrier_sync
 
-barrier_sync(id=0) = ccall("llvm.nvvm.barrier.sync", llvmcall, Cvoid, (Int32,), id)
+@device_function @inline barrier_sync(id=0) = ccall("llvm.nvvm.barrier.sync", llvmcall, Cvoid, (Int32,), id)
 
 
 ## memory barriers (membar)
@@ -81,7 +81,7 @@ A memory fence that ensures that:
 - All reads from all memory made by the calling thread before the call to `threadfence_block()`
   are ordered before all reads from all memory made by the calling thread after the call to
   `threadfence_block()`.
 """
-@inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ())
+@device_function @inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ())
 
 """
     threadfence()
@@ -95,7 +95,7 @@ Note that for this ordering guarantee to be true, the observing threads must truly observe the
 memory and not cached versions of it; this is requires the use of volatile loads and stores,
 which is not available from Julia right now.
 """
-@inline threadfence() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ())
+@device_function @inline threadfence() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ())
 
 """
     threadfence_system()
@@ -106,4 +106,4 @@ before the call to `threadfence_system()` are observed by all threads in the device,
 host threads, and all threads in peer devices as occurring before all writes to all memory made by
 the calling thread after the call to `threadfence_system()`.
 """
-@inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ())
+@device_function @inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ())
diff --git a/src/device/intrinsics/version.jl b/src/device/intrinsics/version.jl
index ac74dff779..714ec2d88b 100644
--- a/src/device/intrinsics/version.jl
+++ b/src/device/intrinsics/version.jl
@@ -50,7 +50,7 @@ end
 export compute_capability, ptx_isa_version
 
 for var in ["sm_major", "sm_minor", "ptx_major", "ptx_minor"]
-    @eval @inline $(Symbol(var))() =
+    @eval @device_function @inline $(Symbol(var))() =
         Base.llvmcall(
             $("""@$var = external global i32
                  define i32 @entry() #0 {
diff --git a/src/device/intrinsics/warp.jl b/src/device/intrinsics/warp.jl
index 91f16cc75c..63913c5aa7 100644
--- a/src/device/intrinsics/warp.jl
+++ b/src/device/intrinsics/warp.jl
@@ -26,7 +26,7 @@ for (name, mode, mask, offset) in (("_up", :up, UInt32(0x00), src->src),
     for (T,typ) in ((Int32, "i32"), (UInt32, "i32"), (Float32, "f32"))
         intrinsic = "llvm.nvvm.shfl.sync.$mode.$typ"
         @eval begin
-            @inline $fname(mask, val::$T, src, width=$ws) =
+            @device_function @inline $fname(mask, val::$T, src, width=$ws) =
                 ccall($intrinsic, llvmcall, $T,
                       (UInt32, $T, UInt32, UInt32),
                       mask, val, $(offset(:src)), pack(width, $mask))
@@ -109,7 +109,7 @@ for mode in (:all, :any, :uni)
     @eval export $fname
 
     intrinsic = "llvm.nvvm.vote.$mode.sync"
-    @eval @inline $fname(mask, pred) =
+    @eval @device_function @inline $fname(mask, pred) =
         @typed_ccall($intrinsic, llvmcall, Bool, (UInt32, Bool), mask, pred)
 end
 
@@ -119,7 +119,7 @@ for mode in (:ballot, )
     @eval export $fname
 
     intrinsic = "llvm.nvvm.vote.$mode.sync"
-    @eval @inline $fname(mask, pred) =
+    @eval @device_function @inline $fname(mask, pred) =
         @typed_ccall($intrinsic, llvmcall, UInt32, (UInt32, Bool), mask, pred)
 end
diff --git a/src/device/intrinsics/wmma.jl b/src/device/intrinsics/wmma.jl
index fe8415af40..0f30c32031 100644
--- a/src/device/intrinsics/wmma.jl
+++ b/src/device/intrinsics/wmma.jl
@@ -196,10 +196,10 @@ for ops in all_ldst_ops,
     ptr_ty = :(LLVMPtr{$arr_ty, $addr_space_int})
 
     if sz == 1
-        @eval $func_name(src_addr, stride) = tuple(ccall($ccall_name, llvmcall, $frag_ty, ($ptr_ty, Int32), src_addr, stride))
+        @eval @device_function $func_name(src_addr, stride) = tuple(ccall($ccall_name, llvmcall, $frag_ty, ($ptr_ty, Int32), src_addr, stride))
     else
         struct_ty = Symbol("LLVMStruct$sz")
-        @eval $func_name(src_addr, stride) = convert(NTuple{$sz, $frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$frag_ty}, ($ptr_ty, Int32), src_addr, stride))
+        @eval @device_function $func_name(src_addr, stride) = convert(NTuple{$sz, $frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$frag_ty}, ($ptr_ty, Int32), src_addr, stride))
     end
     @eval export $func_name
     @eval @doc (@doc llvm_wmma_load) $func_name
@@ -263,7 +263,7 @@ export llvm_wmma_store
 
     ptr_ty = :(LLVMPtr{$arr_ty, $addr_space_int})
 
-    @eval $func_name(dst_addr, data, stride) = ccall($ccall_name, llvmcall, Nothing, ($ptr_ty, $(frag_types...), Int32), dst_addr, $(frag_vars...), stride)
+    @eval @device_function $func_name(dst_addr, data, stride) = ccall($ccall_name, llvmcall, Nothing, ($ptr_ty, $(frag_types...), Int32), dst_addr, $(frag_vars...), stride)
     @eval export $func_name
     @eval @doc (@doc llvm_wmma_store) $func_name
 end
@@ -340,10 +340,10 @@ for ops in all_wmma_ops,
     c_vars = ntuple(i -> :(c[$i]), c_sz)
 
     if d_sz == 1
-        @eval $func_name(a, b, c) = tuple(ccall($ccall_name, llvmcall, $d_frag_ty, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
+        @eval @device_function $func_name(a, b, c) = tuple(ccall($ccall_name, llvmcall, $d_frag_ty, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
     else
         struct_ty = Symbol("LLVMStruct$d_sz")
-        @eval $func_name(a, b, c) = convert(NTuple{$d_sz, $d_frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$d_frag_ty}, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
+        @eval @device_function $func_name(a, b, c) = convert(NTuple{$d_sz, $d_frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$d_frag_ty}, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
     end
     @eval export $func_name
     @eval @doc (@doc llvm_wmma_mma) $func_name

From 36c125d3332f8c21e3730f07c43ae2cdd1101bd6 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Fri, 19 Apr 2024 15:07:57 -0400
Subject: [PATCH 8/8] fixup! Mark more methods as device methods

---
 src/device/intrinsics/cooperative_groups.jl |  2 +-
 src/device/intrinsics/indexing.jl           |  8 ++++----
 src/device/intrinsics/misc.jl               | 11 +++++++----
 src/device/intrinsics/synchronization.jl    | 21 ++++++++++++---------
 src/device/intrinsics/wmma.jl               |  2 +-
 src/device/utils.jl                         |  6 +++---
 6 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/src/device/intrinsics/cooperative_groups.jl b/src/device/intrinsics/cooperative_groups.jl
index deddac3cb5..ab8ece277c 100644
--- a/src/device/intrinsics/cooperative_groups.jl
+++ b/src/device/intrinsics/cooperative_groups.jl
@@ -24,7 +24,7 @@ Noteworthy missing functionality:
 module CG
 
 using ..CUDA
-using ..CUDA: i32, Aligned, alignment
+using ..CUDA: i32, Aligned, alignment, @device_function
 
 using ..LLVM.Interop
 using ..LLVMLoopInfo
diff --git a/src/device/intrinsics/indexing.jl b/src/device/intrinsics/indexing.jl
index e03568ffdb..7677c553f5 100644
--- a/src/device/intrinsics/indexing.jl
+++ b/src/device/intrinsics/indexing.jl
@@ -96,14 +96,14 @@ Returns the thread index within the block.
 
 Returns the warp size (in threads).
 """
-@device_function @inline warpsize() = ccall("llvm.nvvm.read.ptx.sreg.warpsize", llvmcall, Int32, ())
+@inline warpsize() = ccall("llvm.nvvm.read.ptx.sreg.warpsize", llvmcall, Int32, ())
 
 """
     laneid()::Int32
 
 Returns the thread's lane within the warp.
 """
-@device_function @inline laneid() = ccall("llvm.nvvm.read.ptx.sreg.laneid", llvmcall, Int32, ()) + 1i32
+@inline laneid() = ccall("llvm.nvvm.read.ptx.sreg.laneid", llvmcall, Int32, ()) + 1i32
 
 """
     lanemask(pred)::UInt32
@@ -111,7 +111,7 @@ Returns the thread's lane within the warp.
 Returns a 32-bit mask indicating which threads in a warp satisfy the given
 predicate. Supported predicates are `==`, `<`, `<=`, `>=`, and `>`.
 """
-@device_function @inline function lanemask(pred::F) where F
+@inline function lanemask(pred::F) where F
     if pred === Base.:(==)
         ccall("llvm.nvvm.read.ptx.sreg.lanemask.eq", llvmcall, UInt32, ())
     elseif pred === Base.:(<)
@@ -133,7 +133,7 @@ end
 
 Returns a 32-bit mask indicating which threads in a warp are active with the current
 executing thread.
 """
-@device_function @inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", false, UInt32, Tuple{})
+@inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", false, UInt32, Tuple{})
 
 end
diff --git a/src/device/intrinsics/misc.jl b/src/device/intrinsics/misc.jl
index 8184ee1d67..bd1219a053 100644
--- a/src/device/intrinsics/misc.jl
+++ b/src/device/intrinsics/misc.jl
@@ -1,25 +1,26 @@
 export clock, nanosleep
 
+@device_functions begin
 """
     exit()
 
 Terminate a thread.
 """
-@device_function exit() = @asmcall("exit;")
+exit() = @asmcall("exit;")
 
 """
     clock(UInt32)
 
 Returns the value of a per-multiprocessor counter that is incremented every clock cycle.
 """
-@device_function clock(::Type{UInt32}) = ccall("llvm.nvvm.read.ptx.sreg.clock", llvmcall, UInt32, ())
+clock(::Type{UInt32}) = ccall("llvm.nvvm.read.ptx.sreg.clock", llvmcall, UInt32, ())
 
 """
     clock(UInt64)
 
 Returns the value of a per-multiprocessor counter that is incremented every clock cycle.
 """
-@device_function clock(::Type{UInt64}) = ccall("llvm.nvvm.read.ptx.sreg.clock64", llvmcall, UInt64, ())
+clock(::Type{UInt64}) = ccall("llvm.nvvm.read.ptx.sreg.clock64", llvmcall, UInt64, ())
 
 
 """
@@ -30,7 +31,9 @@ Puts a thread for a given amount `t`(in nanoseconds).
 
 !!! note
     Requires CUDA >= 10.0 and sm_6.2
 """
-@device_function @inline function nanosleep(t::Unsigned)
+@inline function nanosleep(t::Unsigned)
     @asmcall("nanosleep.u32 \$0;", "r", true, Cvoid, Tuple{UInt32}, convert(UInt32, t))
 end
+
+end
diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl
index 4e25e94366..7d1070ff51 100644
--- a/src/device/intrinsics/synchronization.jl
+++ b/src/device/intrinsics/synchronization.jl
@@ -6,6 +6,7 @@ export sync_threads, sync_warp
 export sync_threads_count, sync_threads_and, sync_threads_or
 
+@device_functions begin
 
 """
     sync_threads()
@@ -13,7 +14,7 @@ Waits until all threads in the thread block have reached this point and all global and
 shared memory accesses made by these threads prior to `sync_threads()` are visible to
 all threads in the block.
 """
-@device_function @inline sync_threads() = ccall("llvm.nvvm.barrier0", llvmcall, Cvoid, ())
+@inline sync_threads() = ccall("llvm.nvvm.barrier0", llvmcall, Cvoid, ())
 
 """
     sync_threads_count(predicate)
@@ -22,7 +23,7 @@ Identical to `sync_threads()` with the additional feature that it evaluates predicate for
 all threads of the block and returns the number of threads for which `predicate`
 evaluates to true.
 """
-@device_function @inline sync_threads_count(predicate) =
+@inline sync_threads_count(predicate) =
     ccall("llvm.nvvm.barrier0.popc", llvmcall, Int32, (Int32,), predicate)
 
 """
     sync_threads_and(predicate)
@@ -32,7 +33,7 @@ Identical to `sync_threads()` with the additional feature that it evaluates predicate for
 all threads of the block and returns `true` if and only if `predicate` evaluates to `true`
 for all of them.
 """
-@device_function @inline sync_threads_and(predicate) =
+@inline sync_threads_and(predicate) =
     ccall("llvm.nvvm.barrier0.and", llvmcall, Int32, (Int32,), predicate) != Int32(0)
 
 """
@@ -42,7 +43,7 @@ Identical to `sync_threads()` with the additional feature that it evaluates predicate for
 all threads of the block and returns `true` if and only if `predicate` evaluates to `true`
 for any of them.
 """
-@device_function @inline sync_threads_or(predicate) =
+@inline sync_threads_or(predicate) =
     ccall("llvm.nvvm.barrier0.or", llvmcall, Int32, (Int32,), predicate) != Int32(0)
 
 """
@@ -56,7 +57,7 @@ the warp.
 
 !!! note
     Requires CUDA >= 9.0 and sm_6.2
 """
-@device_function @inline sync_warp(mask=FULL_MASK) =
+@inline sync_warp(mask=FULL_MASK) =
     ccall("llvm.nvvm.bar.warp.sync", llvmcall, Cvoid, (UInt32,), mask)
 
@@ -64,7 +65,7 @@ the warp.
 
 export barrier_sync
 
-@device_function @inline barrier_sync(id=0) = ccall("llvm.nvvm.barrier.sync", llvmcall, Cvoid, (Int32,), id)
+@inline barrier_sync(id=0) = ccall("llvm.nvvm.barrier.sync", llvmcall, Cvoid, (Int32,), id)
 
 
 ## memory barriers (membar)
@@ -81,7 +82,7 @@ A memory fence that ensures that:
 - All reads from all memory made by the calling thread before the call to `threadfence_block()`
   are ordered before all reads from all memory made by the calling thread after the call to
   `threadfence_block()`.
 """
-@device_function @inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ())
+@inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ())
 
 """
     threadfence()
@@ -95,7 +96,7 @@ Note that for this ordering guarantee to be true, the observing threads must truly observe the
 memory and not cached versions of it; this is requires the use of volatile loads and stores,
 which is not available from Julia right now.
 """
-@device_function @inline threadfence() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ())
+@inline threadfence() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ())
 
 """
     threadfence_system()
@@ -106,4 +107,6 @@ before the call to `threadfence_system()` are observed by all threads in the device,
 host threads, and all threads in peer devices as occurring before all writes to all memory made by
 the calling thread after the call to `threadfence_system()`.
 """
-@device_function @inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ())
+@inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ())
+
+end
diff --git a/src/device/intrinsics/wmma.jl b/src/device/intrinsics/wmma.jl
index 0f30c32031..a7d449cada 100644
--- a/src/device/intrinsics/wmma.jl
+++ b/src/device/intrinsics/wmma.jl
@@ -1,7 +1,7 @@
 export WMMA
 
 module WMMA
-using ..CUDA: AS
+using ..CUDA: AS, @device_function
 using Core: LLVMPtr
 
 ################################################################################
diff --git a/src/device/utils.jl b/src/device/utils.jl
index c6463360ca..d30e1bdffe 100644
--- a/src/device/utils.jl
+++ b/src/device/utils.jl
@@ -16,7 +16,7 @@ end
 macro device_override(ex)
     ex = macroexpand(__module__, ex)
     esc(quote
-        Base.Experimental.@overlay(CUDA.method_table, $ex)
+        Base.Experimental.@overlay($(CUDA).method_table, $ex)
     end)
 end
 
@@ -31,7 +31,7 @@ macro device_function(ex)
     esc(quote
         $(combinedef(def))
 
-        @device_override $ex
+        $(CUDA).@device_override $ex
     end)
 end
 
@@ -47,7 +47,7 @@ macro device_functions(ex)
             push!(out.args, rewrite(arg))
         elseif Meta.isexpr(arg, [:function, :(=)])
             # rewrite function definitions
-            push!(out.args, :(@device_function $arg))
+            push!(out.args, :($(CUDA).@device_function $arg))
         else
             # preserve all the rest
             push!(out.args, arg)