From fa168ef2695cf3c0e69b6ef54d91d62fcb126ae6 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Mon, 15 Apr 2024 13:24:31 -0400
Subject: [PATCH 1/8] Use PrecompileTools to warmup CUDA.jl

---
 Project.toml      |  2 ++
 src/precompile.jl | 14 ++++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/Project.toml b/Project.toml
index db564395a7..47217fe272 100644
--- a/Project.toml
+++ b/Project.toml
@@ -23,6 +23,7 @@ Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f"
+PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -65,6 +66,7 @@ Libdl = "1"
 LinearAlgebra = "1"
 Logging = "1"
 NVTX = "0.3.2"
+PrecompileTools = "1.2.1"
 Preferences = "1"
 PrettyTables = "2"
 Printf = "1"
diff --git a/src/precompile.jl b/src/precompile.jl
index dc1b2ce2ec..8cf8223f51 100644
--- a/src/precompile.jl
+++ b/src/precompile.jl
@@ -14,3 +14,17 @@ precompile(run_and_collect, (Cmd,))
 precompile(cudaconvert, (Function,))
 precompile(Core.kwfunc(cudacall), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(cudacall),CuFunction,Type{Tuple{}}))
 precompile(Core.kwfunc(launch), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(launch),CuFunction))
+
+@static if VERSION >= v"1.11.-"
+using PrecompileTools: @setup_workload, @compile_workload
+@setup_workload let
+    @compile_workload begin
+        target = PTXCompilerTarget(; cap=v"7.5")
+        params = CUDACompilerParams(; cap=v"7.5", ptx=v"7.5")
+        config = CompilerConfig(target, params)
+        mi = GPUCompiler.methodinstance(typeof(identity), Tuple{Nothing})
+        job = CompilerJob(mi, config)
+        GPUCompiler.code_native(devnull, job)
+    end
+end
+end

From d36c3022a4cefdb555ef309a62bc89e9c206ec83 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Wed, 17 Apr 2024 16:50:20 -0400
Subject: [PATCH 2/8] fixup! Use PrecompileTools to warmup CUDA.jl

---
 src/precompile.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/precompile.jl b/src/precompile.jl
index 8cf8223f51..d765847840 100644
--- a/src/precompile.jl
+++ b/src/precompile.jl
@@ -15,7 +15,7 @@ precompile(cudaconvert, (Function,))
 precompile(Core.kwfunc(cudacall), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(cudacall),CuFunction,Type{Tuple{}}))
 precompile(Core.kwfunc(launch), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(launch),CuFunction))
 
-@static if VERSION >= v"1.11.-"
+@static if VERSION >= v"1.11.0-DEV.1603"
 using PrecompileTools: @setup_workload, @compile_workload
 @setup_workload let
     @compile_workload begin

From 80ec8695576b8b5ef24df67d2a0b7f6df1f09828 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Wed, 17 Apr 2024 16:51:41 -0400
Subject: [PATCH 3/8] fixup! Use PrecompileTools to warmup CUDA.jl

---
 src/precompile.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/precompile.jl b/src/precompile.jl
index d765847840..6f5a46ff04 100644
--- a/src/precompile.jl
+++ b/src/precompile.jl
@@ -15,8 +15,8 @@ precompile(cudaconvert, (Function,))
 precompile(Core.kwfunc(cudacall), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(cudacall),CuFunction,Type{Tuple{}}))
 precompile(Core.kwfunc(launch), (NamedTuple{(:threads, :blocks), Tuple{Int64, Int64}},typeof(launch),CuFunction))
 
-@static if VERSION >= v"1.11.0-DEV.1603"
 using PrecompileTools: @setup_workload, @compile_workload
+@static if VERSION >= v"1.11.0-DEV.1603"
 @setup_workload let
     @compile_workload begin
         target = PTXCompilerTarget(; cap=v"7.5")

From 09efa88cfcc415ad0d5ded0f1ce98d766f1a1ca2 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Fri, 19 Apr 2024 14:19:02 -0400
Subject: [PATCH 7/8] Mark more methods as device methods

---
 src/device/intrinsics/atomics.jl            |  6 +++---
 src/device/intrinsics/cooperative_groups.jl |  9 +++++----
 src/device/intrinsics/indexing.jl           |  8 ++++----
 src/device/intrinsics/misc.jl               |  8 ++++----
 src/device/intrinsics/synchronization.jl    | 18 +++++++++---------
 src/device/intrinsics/version.jl            |  2 +-
 src/device/intrinsics/warp.jl               |  6 +++---
 src/device/intrinsics/wmma.jl               | 10 +++++-----
 8 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/src/device/intrinsics/atomics.jl b/src/device/intrinsics/atomics.jl
index c81aa48421..0874adf770 100644
--- a/src/device/intrinsics/atomics.jl
+++ b/src/device/intrinsics/atomics.jl
@@ -151,7 +151,7 @@ for A in (AS.Generic, AS.Global, AS.Shared), T in (:Int16, :UInt16)
     end
 
     intr = "atom$scope.cas.b16 \$0, [\$1], \$2, \$3;"
-    @eval @inline atomic_cas!(ptr::LLVMPtr{$T,$A}, cmp::$T, val::$T) =
+    @eval @device_function @inline atomic_cas!(ptr::LLVMPtr{$T,$A}, cmp::$T, val::$T) =
         @asmcall($intr, "=h,l,h,h", true, $T,
                  Tuple{Core.LLVMPtr{$T,$A},$T,$T}, ptr, cmp, val)
 end
@@ -172,7 +172,7 @@ for A in (AS.Generic, AS.Global, AS.Shared)
         nb = sizeof(T)*8
         fn = Symbol("atomic_$(op)!")
         intr = "llvm.nvvm.atomic.load.$op.$nb.p$(convert(Int, A))i$nb"
-        @eval @inline $fn(ptr::LLVMPtr{$T,$A}, val::$T) =
+        @eval @device_function @inline $fn(ptr::LLVMPtr{$T,$A}, val::$T) =
             @typed_ccall($intr, llvmcall, $T, (LLVMPtr{$T,$A}, $T), ptr, val)
     end
 end
@@ -192,7 +192,7 @@ for A in (AS.Generic, AS.Global, AS.Shared), T in (:Float16,)
     end
 
     intr = "atom$scope.add.noftz.f16 \$0, [\$1], \$2;"
-    @eval @inline atomic_add!(ptr::LLVMPtr{$T,$A}, val::$T) =
+    @eval @device_function @inline atomic_add!(ptr::LLVMPtr{$T,$A}, val::$T) =
         @asmcall($intr, "=h,l,h", true, $T,
                 Tuple{Core.LLVMPtr{$T,$A},$T}, ptr, val)
 end
diff --git a/src/device/intrinsics/cooperative_groups.jl b/src/device/intrinsics/cooperative_groups.jl
index 4d87c7070e..deddac3cb5 100644
--- a/src/device/intrinsics/cooperative_groups.jl
+++ b/src/device/intrinsics/cooperative_groups.jl
@@ -70,7 +70,7 @@ const grid_workspace = Ptr{grid_workspace_st}
     end
 end
 
-function get_grid_workspace()
+@device_function function get_grid_workspace()
     # interpret the address from envreg 1 and 2 as the driver's grid workspace
     hi = ccall("llvm.nvvm.read.ptx.sreg.envreg1", llvmcall, UInt32, ())
     lo = ccall("llvm.nvvm.read.ptx.sreg.envreg2", llvmcall, UInt32, ())
@@ -370,7 +370,7 @@ end
     return oldArrive
 end
 
-@inline function barrier_wait(gg::grid_group, token)
+@device_function @inline function barrier_wait(gg::grid_group, token)
     arrived = gg.details.barrier
 
     if is_cta_master()
@@ -548,11 +548,12 @@ end
 
 ## pipeline operations
 
-pipeline_commit() = ccall("llvm.nvvm.cp.async.commit.group", llvmcall, Cvoid, ())
+@device_function pipeline_commit() = ccall("llvm.nvvm.cp.async.commit.group", llvmcall, Cvoid, ())
 
-pipeline_wait_prior(n) =
+@device_function pipeline_wait_prior(n) =
     ccall("llvm.nvvm.cp.async.wait.group", llvmcall, Cvoid, (Int32,), n)
 
+# TODO device function?
 @generated function pipeline_memcpy_async(dst::LLVMPtr{T}, src::LLVMPtr{T}) where T
     size_and_align = sizeof(T)
     size_and_align in (4, 8, 16) || :(return error($"Unsupported size $size_and_align"))
diff --git a/src/device/intrinsics/indexing.jl b/src/device/intrinsics/indexing.jl
index 7677c553f5..e03568ffdb 100644
--- a/src/device/intrinsics/indexing.jl
+++ b/src/device/intrinsics/indexing.jl
@@ -96,14 +96,14 @@ Returns the thread index within the block.
 
 Returns the warp size (in threads).
 """
-@inline warpsize() = ccall("llvm.nvvm.read.ptx.sreg.warpsize", llvmcall, Int32, ())
+@device_function @inline warpsize() = ccall("llvm.nvvm.read.ptx.sreg.warpsize", llvmcall, Int32, ())
 
 """
     laneid()::Int32
 
 Returns the thread's lane within the warp.
 """
-@inline laneid() = ccall("llvm.nvvm.read.ptx.sreg.laneid", llvmcall, Int32, ()) + 1i32
+@device_function @inline laneid() = ccall("llvm.nvvm.read.ptx.sreg.laneid", llvmcall, Int32, ()) + 1i32
 
 """
     lanemask(pred)::UInt32
@@ -111,7 +111,7 @@ Returns the thread's lane within the warp.
 Returns a 32-bit mask indicating which threads in a warp satisfy the given
 predicate. Supported predicates are `==`, `<`, `<=`, `>=`, and `>`.
 """
-@inline function lanemask(pred::F) where F
+@device_function @inline function lanemask(pred::F) where F
     if pred === Base.:(==)
         ccall("llvm.nvvm.read.ptx.sreg.lanemask.eq", llvmcall, UInt32, ())
     elseif pred === Base.:(<)
@@ -133,7 +133,7 @@ end
 
 Returns a 32-bit mask indicating which threads in a warp are active with the current
 executing thread.
 """
-@inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", false, UInt32, Tuple{})
+@device_function @inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", false, UInt32, Tuple{})
 
 end
diff --git a/src/device/intrinsics/misc.jl b/src/device/intrinsics/misc.jl
index ff8bfaaeea..8184ee1d67 100644
--- a/src/device/intrinsics/misc.jl
+++ b/src/device/intrinsics/misc.jl
@@ -5,21 +5,21 @@ export clock, nanosleep
 
 Terminate a thread.
 """
-exit() = @asmcall("exit;")
+@device_function exit() = @asmcall("exit;")
 
 """
     clock(UInt32)
 
 Returns the value of a per-multiprocessor counter that is incremented every clock cycle.
 """
-clock(::Type{UInt32}) = ccall("llvm.nvvm.read.ptx.sreg.clock", llvmcall, UInt32, ())
+@device_function clock(::Type{UInt32}) = ccall("llvm.nvvm.read.ptx.sreg.clock", llvmcall, UInt32, ())
 
 """
    clock(UInt64)
 
 Returns the value of a per-multiprocessor counter that is incremented every clock cycle.
 """
-clock(::Type{UInt64}) = ccall("llvm.nvvm.read.ptx.sreg.clock64", llvmcall, UInt64, ())
+@device_function clock(::Type{UInt64}) = ccall("llvm.nvvm.read.ptx.sreg.clock64", llvmcall, UInt64, ())
 
 
 """
@@ -30,7 +30,7 @@ Puts a thread for a given amount `t`(in nanoseconds).
 
 !!! note
     Requires CUDA >= 10.0 and sm_6.2
 """
-@inline function nanosleep(t::Unsigned)
+@device_function @inline function nanosleep(t::Unsigned)
     @asmcall("nanosleep.u32 \$0;", "r", true, Cvoid, Tuple{UInt32}, convert(UInt32, t))
 end
diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl
index bb06e9dd09..4e25e94366 100644
--- a/src/device/intrinsics/synchronization.jl
+++ b/src/device/intrinsics/synchronization.jl
@@ -13,7 +13,7 @@ Waits until all threads in the thread block have reached this point and all global and
 shared memory accesses made by these threads prior to `sync_threads()` are visible to
 all threads in the block.
 """
-@inline sync_threads() = ccall("llvm.nvvm.barrier0", llvmcall, Cvoid, ())
+@device_function @inline sync_threads() = ccall("llvm.nvvm.barrier0", llvmcall, Cvoid, ())
 
 """
     sync_threads_count(predicate)
@@ -22,7 +22,7 @@ Identical to `sync_threads()` with the additional feature that it evaluates predicate for
 all threads of the block and returns the number of threads for which `predicate`
 evaluates to true.
 """
-@inline sync_threads_count(predicate) =
+@device_function @inline sync_threads_count(predicate) =
     ccall("llvm.nvvm.barrier0.popc", llvmcall, Int32, (Int32,), predicate)
 
 """
     sync_threads_and(predicate)
@@ -32,7 +32,7 @@ Identical to `sync_threads()` with the additional feature that it evaluates predicate for
 all threads of the block and returns `true` if and only if `predicate` evaluates to `true`
 for all of them.
 """
-@inline sync_threads_and(predicate) =
+@device_function @inline sync_threads_and(predicate) =
     ccall("llvm.nvvm.barrier0.and", llvmcall, Int32, (Int32,), predicate) != Int32(0)
 
 """
@@ -42,7 +42,7 @@ Identical to `sync_threads()` with the additional feature that it evaluates predicate for
 all threads of the block and returns `true` if and only if `predicate` evaluates to `true`
 for any of them.
 """
-@inline sync_threads_or(predicate) =
+@device_function @inline sync_threads_or(predicate) =
     ccall("llvm.nvvm.barrier0.or", llvmcall, Int32, (Int32,), predicate) != Int32(0)
 
 """
@@ -56,7 +56,7 @@ the warp.
 
 !!! note
     Requires CUDA >= 9.0 and sm_6.2
 """
-@inline sync_warp(mask=FULL_MASK) =
+@device_function @inline sync_warp(mask=FULL_MASK) =
     ccall("llvm.nvvm.bar.warp.sync", llvmcall, Cvoid, (UInt32,), mask)
 
@@ -64,7 +64,7 @@ the warp.
 
 export barrier_sync
 
-barrier_sync(id=0) = ccall("llvm.nvvm.barrier.sync", llvmcall, Cvoid, (Int32,), id)
+@device_function @inline barrier_sync(id=0) = ccall("llvm.nvvm.barrier.sync", llvmcall, Cvoid, (Int32,), id)
 
 
 ## memory barriers (membar)
@@ -81,7 +81,7 @@ A memory fence that ensures that:
 - All reads from all memory made by the calling thread before the call to `threadfence_block()`
   are ordered before all reads from all memory made by the calling thread after the call to
   `threadfence_block()`.
 """
-@inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ())
+@device_function @inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ())
 
 """
     threadfence()
@@ -95,7 +95,7 @@ Note that for this ordering guarantee to be true, the observing threads must truly observe the
 memory and not cached versions of it; this is requires the use of volatile loads and stores,
 which is not available from Julia right now.
 """
-@inline threadfence() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ())
+@device_function @inline threadfence() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ())
 
 """
     threadfence_system()
@@ -106,4 +106,4 @@ before the call to `threadfence_system()` are observed by all threads in the device,
 host threads, and all threads in peer devices as occurring before all writes to all memory made by
 the calling thread after the call to `threadfence_system()`.
 """
-@inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ())
+@device_function @inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ())
diff --git a/src/device/intrinsics/version.jl b/src/device/intrinsics/version.jl
index ac74dff779..714ec2d88b 100644
--- a/src/device/intrinsics/version.jl
+++ b/src/device/intrinsics/version.jl
@@ -50,7 +50,7 @@ end
 export compute_capability, ptx_isa_version
 
 for var in ["sm_major", "sm_minor", "ptx_major", "ptx_minor"]
-    @eval @inline $(Symbol(var))() =
+    @eval @device_function @inline $(Symbol(var))() =
         Base.llvmcall(
             $("""@$var = external global i32
                  define i32 @entry() #0 {
diff --git a/src/device/intrinsics/warp.jl b/src/device/intrinsics/warp.jl
index 91f16cc75c..63913c5aa7 100644
--- a/src/device/intrinsics/warp.jl
+++ b/src/device/intrinsics/warp.jl
@@ -26,7 +26,7 @@ for (name, mode, mask, offset) in (("_up", :up, UInt32(0x00), src->src),
     for (T,typ) in ((Int32, "i32"), (UInt32, "i32"), (Float32, "f32"))
         intrinsic = "llvm.nvvm.shfl.sync.$mode.$typ"
         @eval begin
-            @inline $fname(mask, val::$T, src, width=$ws) =
+            @device_function @inline $fname(mask, val::$T, src, width=$ws) =
                 ccall($intrinsic, llvmcall, $T,
                       (UInt32, $T, UInt32, UInt32),
                       mask, val, $(offset(:src)), pack(width, $mask))
@@ -109,7 +109,7 @@ for mode in (:all, :any, :uni)
     @eval export $fname
 
     intrinsic = "llvm.nvvm.vote.$mode.sync"
-    @eval @inline $fname(mask, pred) =
+    @eval @device_function @inline $fname(mask, pred) =
         @typed_ccall($intrinsic, llvmcall, Bool, (UInt32, Bool), mask, pred)
 end
 
@@ -119,7 +119,7 @@ for mode in (:ballot, )
     @eval export $fname
 
     intrinsic = "llvm.nvvm.vote.$mode.sync"
-    @eval @inline $fname(mask, pred) =
+    @eval @device_function @inline $fname(mask, pred) =
         @typed_ccall($intrinsic, llvmcall, UInt32, (UInt32, Bool), mask, pred)
 end
diff --git a/src/device/intrinsics/wmma.jl b/src/device/intrinsics/wmma.jl
index fe8415af40..0f30c32031 100644
--- a/src/device/intrinsics/wmma.jl
+++ b/src/device/intrinsics/wmma.jl
@@ -196,10 +196,10 @@ for ops in all_ldst_ops,
     ptr_ty = :(LLVMPtr{$arr_ty, $addr_space_int})
 
     if sz == 1
-        @eval $func_name(src_addr, stride) = tuple(ccall($ccall_name, llvmcall, $frag_ty, ($ptr_ty, Int32), src_addr, stride))
+        @eval @device_function $func_name(src_addr, stride) = tuple(ccall($ccall_name, llvmcall, $frag_ty, ($ptr_ty, Int32), src_addr, stride))
     else
         struct_ty = Symbol("LLVMStruct$sz")
-        @eval $func_name(src_addr, stride) = convert(NTuple{$sz, $frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$frag_ty}, ($ptr_ty, Int32), src_addr, stride))
+        @eval @device_function $func_name(src_addr, stride) = convert(NTuple{$sz, $frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$frag_ty}, ($ptr_ty, Int32), src_addr, stride))
     end
     @eval export $func_name
     @eval @doc (@doc llvm_wmma_load) $func_name
@@ -263,7 +263,7 @@ export llvm_wmma_store
 
     ptr_ty = :(LLVMPtr{$arr_ty, $addr_space_int})
 
-    @eval $func_name(dst_addr, data, stride) = ccall($ccall_name, llvmcall, Nothing, ($ptr_ty, $(frag_types...), Int32), dst_addr, $(frag_vars...), stride)
+    @eval @device_function $func_name(dst_addr, data, stride) = ccall($ccall_name, llvmcall, Nothing, ($ptr_ty, $(frag_types...), Int32), dst_addr, $(frag_vars...), stride)
     @eval export $func_name
     @eval @doc (@doc llvm_wmma_store) $func_name
 end
@@ -340,10 +340,10 @@ for ops in all_wmma_ops,
     c_vars = ntuple(i -> :(c[$i]), c_sz)
 
     if d_sz == 1
-        @eval $func_name(a, b, c) = tuple(ccall($ccall_name, llvmcall, $d_frag_ty, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
+        @eval @device_function $func_name(a, b, c) = tuple(ccall($ccall_name, llvmcall, $d_frag_ty, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
     else
         struct_ty = Symbol("LLVMStruct$d_sz")
-        @eval $func_name(a, b, c) = convert(NTuple{$d_sz, $d_frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$d_frag_ty}, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
+        @eval @device_function $func_name(a, b, c) = convert(NTuple{$d_sz, $d_frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$d_frag_ty}, ($(a_types...), $(b_types...), $(c_types...)), $(a_vars...), $(b_vars...), $(c_vars...)))
     end
     @eval export $func_name
     @eval @doc (@doc llvm_wmma_mma) $func_name

From 36c125d3332f8c21e3730f07c43ae2cdd1101bd6 Mon Sep 17 00:00:00 2001
From: Valentin Churavy
Date: Fri, 19 Apr 2024 15:07:57 -0400
Subject: [PATCH 8/8] fixup! Mark more methods as device methods

---
 src/device/intrinsics/cooperative_groups.jl |  2 +-
 src/device/intrinsics/indexing.jl           |  8 ++++----
 src/device/intrinsics/misc.jl               | 11 +++++++----
 src/device/intrinsics/synchronization.jl    | 21 ++++++++++++---------
 src/device/intrinsics/wmma.jl               |  2 +-
 src/device/utils.jl                         |  6 +++---
 6 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/src/device/intrinsics/cooperative_groups.jl b/src/device/intrinsics/cooperative_groups.jl
index deddac3cb5..ab8ece277c 100644
--- a/src/device/intrinsics/cooperative_groups.jl
+++ b/src/device/intrinsics/cooperative_groups.jl
@@ -24,7 +24,7 @@ Noteworthy missing functionality:
 module CG
 
 using ..CUDA
-using ..CUDA: i32, Aligned, alignment
+using ..CUDA: i32, Aligned, alignment, @device_function
 
 using ..LLVM.Interop
 using ..LLVMLoopInfo
diff --git a/src/device/intrinsics/indexing.jl b/src/device/intrinsics/indexing.jl
index e03568ffdb..7677c553f5 100644
--- a/src/device/intrinsics/indexing.jl
+++ b/src/device/intrinsics/indexing.jl
@@ -96,14 +96,14 @@ Returns the thread index within the block.
 
 Returns the warp size (in threads).
 """
-@device_function @inline warpsize() = ccall("llvm.nvvm.read.ptx.sreg.warpsize", llvmcall, Int32, ())
+@inline warpsize() = ccall("llvm.nvvm.read.ptx.sreg.warpsize", llvmcall, Int32, ())
 
 """
     laneid()::Int32
 
 Returns the thread's lane within the warp.
 """
-@device_function @inline laneid() = ccall("llvm.nvvm.read.ptx.sreg.laneid", llvmcall, Int32, ()) + 1i32
+@inline laneid() = ccall("llvm.nvvm.read.ptx.sreg.laneid", llvmcall, Int32, ()) + 1i32
 
 """
     lanemask(pred)::UInt32
@@ -111,7 +111,7 @@ Returns the thread's lane within the warp.
 Returns a 32-bit mask indicating which threads in a warp satisfy the given
 predicate. Supported predicates are `==`, `<`, `<=`, `>=`, and `>`.
 """
-@device_function @inline function lanemask(pred::F) where F
+@inline function lanemask(pred::F) where F
     if pred === Base.:(==)
         ccall("llvm.nvvm.read.ptx.sreg.lanemask.eq", llvmcall, UInt32, ())
     elseif pred === Base.:(<)
@@ -133,7 +133,7 @@ end
 
 Returns a 32-bit mask indicating which threads in a warp are active with the current
 executing thread.
 """
-@device_function @inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", false, UInt32, Tuple{})
+@inline active_mask() = @asmcall("activemask.b32 \$0;", "=r", false, UInt32, Tuple{})
 
 end
diff --git a/src/device/intrinsics/misc.jl b/src/device/intrinsics/misc.jl
index 8184ee1d67..bd1219a053 100644
--- a/src/device/intrinsics/misc.jl
+++ b/src/device/intrinsics/misc.jl
@@ -1,25 +1,26 @@
 export clock, nanosleep
 
+@device_functions begin
 """
     exit()
 
 Terminate a thread.
 """
-@device_function exit() = @asmcall("exit;")
+exit() = @asmcall("exit;")
 
 """
     clock(UInt32)
 
 Returns the value of a per-multiprocessor counter that is incremented every clock cycle.
 """
-@device_function clock(::Type{UInt32}) = ccall("llvm.nvvm.read.ptx.sreg.clock", llvmcall, UInt32, ())
+clock(::Type{UInt32}) = ccall("llvm.nvvm.read.ptx.sreg.clock", llvmcall, UInt32, ())
 
 """
     clock(UInt64)
 
 Returns the value of a per-multiprocessor counter that is incremented every clock cycle.
 """
-@device_function clock(::Type{UInt64}) = ccall("llvm.nvvm.read.ptx.sreg.clock64", llvmcall, UInt64, ())
+clock(::Type{UInt64}) = ccall("llvm.nvvm.read.ptx.sreg.clock64", llvmcall, UInt64, ())
 
 
 """
@@ -30,7 +31,9 @@ Puts a thread for a given amount `t`(in nanoseconds).
 
 !!! note
     Requires CUDA >= 10.0 and sm_6.2
 """
-@device_function @inline function nanosleep(t::Unsigned)
+@inline function nanosleep(t::Unsigned)
     @asmcall("nanosleep.u32 \$0;", "r", true, Cvoid, Tuple{UInt32}, convert(UInt32, t))
 end
+
+end
diff --git a/src/device/intrinsics/synchronization.jl b/src/device/intrinsics/synchronization.jl
index 4e25e94366..7d1070ff51 100644
--- a/src/device/intrinsics/synchronization.jl
+++ b/src/device/intrinsics/synchronization.jl
@@ -6,6 +6,7 @@ export sync_threads, sync_warp
 export sync_threads_count, sync_threads_and, sync_threads_or
 
+@device_functions begin
 
 """
     sync_threads()
@@ -13,7 +14,7 @@ Waits until all threads in the thread block have reached this point and all global and
 shared memory accesses made by these threads prior to `sync_threads()` are visible to
 all threads in the block.
 """
-@device_function @inline sync_threads() = ccall("llvm.nvvm.barrier0", llvmcall, Cvoid, ())
+@inline sync_threads() = ccall("llvm.nvvm.barrier0", llvmcall, Cvoid, ())
 
 """
     sync_threads_count(predicate)
@@ -22,7 +23,7 @@ Identical to `sync_threads()` with the additional feature that it evaluates predicate for
 all threads of the block and returns the number of threads for which `predicate`
 evaluates to true.
 """
-@device_function @inline sync_threads_count(predicate) =
+@inline sync_threads_count(predicate) =
     ccall("llvm.nvvm.barrier0.popc", llvmcall, Int32, (Int32,), predicate)
 
 """
     sync_threads_and(predicate)
@@ -32,7 +33,7 @@ Identical to `sync_threads()` with the additional feature that it evaluates predicate for
 all threads of the block and returns `true` if and only if `predicate` evaluates to `true`
 for all of them.
 """
-@device_function @inline sync_threads_and(predicate) =
+@inline sync_threads_and(predicate) =
     ccall("llvm.nvvm.barrier0.and", llvmcall, Int32, (Int32,), predicate) != Int32(0)
 
 """
@@ -42,7 +43,7 @@ Identical to `sync_threads()` with the additional feature that it evaluates predicate for
 all threads of the block and returns `true` if and only if `predicate` evaluates to `true`
 for any of them.
 """
-@device_function @inline sync_threads_or(predicate) =
+@inline sync_threads_or(predicate) =
     ccall("llvm.nvvm.barrier0.or", llvmcall, Int32, (Int32,), predicate) != Int32(0)
 
 """
@@ -56,7 +57,7 @@ the warp.
 
 !!! note
     Requires CUDA >= 9.0 and sm_6.2
 """
-@device_function @inline sync_warp(mask=FULL_MASK) =
+@inline sync_warp(mask=FULL_MASK) =
     ccall("llvm.nvvm.bar.warp.sync", llvmcall, Cvoid, (UInt32,), mask)
 
@@ -64,7 +65,7 @@ the warp.
 
 export barrier_sync
 
-@device_function @inline barrier_sync(id=0) = ccall("llvm.nvvm.barrier.sync", llvmcall, Cvoid, (Int32,), id)
+@inline barrier_sync(id=0) = ccall("llvm.nvvm.barrier.sync", llvmcall, Cvoid, (Int32,), id)
 
 
 ## memory barriers (membar)
@@ -81,7 +82,7 @@ A memory fence that ensures that:
 - All reads from all memory made by the calling thread before the call to `threadfence_block()`
   are ordered before all reads from all memory made by the calling thread after the call to
   `threadfence_block()`.
 """
-@device_function @inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ())
+@inline threadfence_block() = ccall("llvm.nvvm.membar.cta", llvmcall, Cvoid, ())
 
 """
     threadfence()
@@ -95,7 +96,7 @@ Note that for this ordering guarantee to be true, the observing threads must truly observe the
 memory and not cached versions of it; this is requires the use of volatile loads and stores,
 which is not available from Julia right now.
 """
-@device_function @inline threadfence() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ())
+@inline threadfence() = ccall("llvm.nvvm.membar.gl", llvmcall, Cvoid, ())
 
 """
     threadfence_system()
@@ -106,4 +107,6 @@ before the call to `threadfence_system()` are observed by all threads in the device,
 host threads, and all threads in peer devices as occurring before all writes to all memory made by
 the calling thread after the call to `threadfence_system()`.
 """
-@device_function @inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ())
+@inline threadfence_system() = ccall("llvm.nvvm.membar.sys", llvmcall, Cvoid, ())
+
+end
diff --git a/src/device/intrinsics/wmma.jl b/src/device/intrinsics/wmma.jl
index 0f30c32031..a7d449cada 100644
--- a/src/device/intrinsics/wmma.jl
+++ b/src/device/intrinsics/wmma.jl
@@ -1,7 +1,7 @@
 export WMMA
 
 module WMMA
-using ..CUDA: AS
+using ..CUDA: AS, @device_function
 using Core: LLVMPtr
 
 ################################################################################
diff --git a/src/device/utils.jl b/src/device/utils.jl
index c6463360ca..d30e1bdffe 100644
--- a/src/device/utils.jl
+++ b/src/device/utils.jl
@@ -16,7 +16,7 @@ end
 macro device_override(ex)
     ex = macroexpand(__module__, ex)
     esc(quote
-        Base.Experimental.@overlay(CUDA.method_table, $ex)
+        Base.Experimental.@overlay($(CUDA).method_table, $ex)
     end)
 end
 
@@ -31,7 +31,7 @@ macro device_function(ex)
     esc(quote
         $(combinedef(def))
 
-        @device_override $ex
+        $(CUDA).@device_override $ex
     end)
 end
 
@@ -47,7 +47,7 @@ macro device_functions(ex)
             push!(out.args, rewrite(arg))
         elseif Meta.isexpr(arg, [:function, :(=)])
             # rewrite function definitions
-            push!(out.args, :(@device_function $arg))
+            push!(out.args, :($(CUDA).@device_function $arg))
         else
             # preserve all the rest
             push!(out.args, arg)