From b48762d86fe213ae5d924c222f71c94b0196f6f8 Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Thu, 19 Dec 2024 11:47:46 -0800
Subject: [PATCH 1/8] support ptx code type for program

---
 cuda_core/cuda/core/experimental/_program.py | 11 +++++++++--
 cuda_core/tests/test_program.py              | 14 +++++++++-----
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 0dac79eb..8f7e6a9a 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -39,22 +39,29 @@ def close(self):
                 self.handle = None
 
     __slots__ = ("__weakref__", "_mnff", "_backend")
-    _supported_code_type = ("c++",)
+    _supported_code_type = ("c++", "ptx")
     _supported_target_type = ("ptx", "cubin", "ltoir")
 
     def __init__(self, code, code_type):
         self._mnff = Program._MembersNeededForFinalize(self, None)
 
+        code_type = code_type.lower()
         if code_type not in self._supported_code_type:
             raise NotImplementedError
 
-        if code_type.lower() == "c++":
+        if code_type == "c++":
             if not isinstance(code, str):
                 raise TypeError
             # TODO: support pre-loaded headers & include names
             # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
             self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], []))
             self._backend = "nvrtc"
+
+        elif code_type == "ptx":
+            if not isinstance(code, str):
+                raise TypeError
+            self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], []))
+            self._backend = "nvrtc"
         else:
             raise NotImplementedError
 
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index cca01af5..553ffef3 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -38,11 +38,15 @@ def test_program_compile_valid_target_type():
     code = 'extern "C" __global__ void my_kernel() {}'
     program = Program(code, "c++")
     arch = "".join(str(i) for i in Device().compute_capability)
-    object_code = program.compile("ptx", options=(f"-arch=compute_{arch}",))
-    print(object_code._module.decode())
-    kernel = object_code.get_kernel("my_kernel")
-    assert isinstance(object_code, ObjectCode)
-    assert isinstance(kernel, Kernel)
+    ptx_object_code = program.compile("ptx", options=(f"-arch=compute_{arch}",))
+    program = Program(ptx_object_code.code, "ptx")
+    cubin_object_code = program.compile("cubin", options=(f"-arch=compute_{arch}",))
+    ptx_kernel = ptx_object_code.get_kernel("my_kernel")
+    cubin_kernel = cubin_object_code.get_kernel("my_kernel")
+    assert isinstance(ptx_object_code, ObjectCode)
+    assert isinstance(cubin_object_code, ObjectCode)
+    assert isinstance(ptx_kernel, Kernel)
+    assert isinstance(cubin_kernel, Kernel)
 
 
 def test_program_compile_invalid_target_type():

From 7c39f7b80a58f77ff7b8587d049d3769b5f5d777 Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Fri, 20 Dec 2024 12:53:47 -0800
Subject: [PATCH 2/8] use linker as the backend when Program takes ptx. WIP
 needs the ProgramOptions change to go in

---
 cuda_core/cuda/core/experimental/_program.py | 16 ++++++++++++----
 cuda_core/tests/test_program.py              |  3 ++-
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 8f7e6a9a..d947cfd2 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -5,6 +5,7 @@
 import weakref
 
 from cuda import nvrtc
+from cuda.core.experimental._linker import Linker, LinkerOptions
 from cuda.core.experimental._module import ObjectCode
 from cuda.core.experimental._utils import handle_return
 
@@ -27,7 +28,7 @@ class Program:
     """
 
     class _MembersNeededForFinalize:
-        __slots__ = ("handle",)
+        __slots__ = "handle"
 
         def __init__(self, program_obj, handle):
             self.handle = handle
@@ -38,7 +39,7 @@ def close(self):
                 handle_return(nvrtc.nvrtcDestroyProgram(self.handle))
                 self.handle = None
 
-    __slots__ = ("__weakref__", "_mnff", "_backend")
+    __slots__ = ("__weakref__", "_mnff", "_backend", "_linker")
     _supported_code_type = ("c++", "ptx")
     _supported_target_type = ("ptx", "cubin", "ltoir")
 
@@ -60,11 +61,15 @@ def __init__(self, code, code_type):
         elif code_type == "ptx":
             if not isinstance(code, str):
                 raise TypeError
-            self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], []))
-            self._backend = "nvrtc"
+            # TODO: support pre-loaded headers & include names
+            # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
+            self._linker = Linker(ObjectCode(code.encode(), code_type), options=LinkerOptions(arch="sm_89"))
+            self._backend = "linker"
         else:
             raise NotImplementedError
 
+        print(self._backend)
+
     def close(self):
         """Destroy this program."""
         self._mnff.close()
@@ -129,6 +134,9 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None):
 
             return ObjectCode(data, target_type, symbol_mapping=symbol_mapping)
 
+        if self._backend == "linker":
+            return self._linker.link(target_type)
+
     @property
     def backend(self):
         """Return the backend type string associated with this program."""
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 553ffef3..01ab2724 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -39,7 +39,8 @@ def test_program_compile_valid_target_type():
     program = Program(code, "c++")
     arch = "".join(str(i) for i in Device().compute_capability)
     ptx_object_code = program.compile("ptx", options=(f"-arch=compute_{arch}",))
-    program = Program(ptx_object_code.code, "ptx")
+    print(ptx_object_code._module.decode())
+    program = Program(ptx_object_code._module.decode(), "ptx")
     cubin_object_code = program.compile("cubin", options=(f"-arch=compute_{arch}",))
     ptx_kernel = ptx_object_code.get_kernel("my_kernel")
     cubin_kernel = cubin_object_code.get_kernel("my_kernel")

From 7a8f7735dd4d1955ac36f5272573b276c665ff3c Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Mon, 20 Jan 2025 15:39:15 -0800
Subject: [PATCH 3/8] add plumbing

---
 cuda_core/cuda/core/experimental/_linker.py  | 39 +++--------
 cuda_core/cuda/core/experimental/_program.py | 37 -----------
 cuda_core/tests/test_linker.py               | 12 ++--
 cuda_core/tests/test_program.py              | 69 +++++++++++++++-----
 4 files changed, 67 insertions(+), 90 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py
index b5a6b675..0f737805 100644
--- a/cuda_core/cuda/core/experimental/_linker.py
+++ b/cuda_core/cuda/core/experimental/_linker.py
@@ -3,11 +3,11 @@
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 import ctypes
-import warnings
 import weakref
 from contextlib import contextmanager
 from dataclasses import dataclass
 from typing import List, Optional
+from warnings import warn
 
 from cuda.core.experimental._device import Device
 from cuda.core.experimental._module import ObjectCode
@@ -43,7 +43,7 @@ def _decide_nvjitlink_or_driver():
             _nvjitlink = None
 
     if _nvjitlink is None:
-        warnings.warn(
+        warn(
             "nvJitLink is not installed or too old (<12.3). Therefore it is not usable "
             "and the culink APIs will be used instead.",
             stacklevel=3,
@@ -98,78 +98,59 @@ class LinkerOptions:
         will be used.
     max_register_count : int, optional
         Maximum register count.
-        Maps to: ``-maxrregcount=<N>``.
     time : bool, optional
         Print timing information to the info log.
-        Maps to ``-time``.
         Default: False.
     verbose : bool, optional
         Print verbose messages to the info log.
-        Maps to ``-verbose``.
         Default: False.
     link_time_optimization : bool, optional
         Perform link time optimization.
-        Maps to: ``-lto``.
         Default: False.
     ptx : bool, optional
-        Emit PTX after linking instead of CUBIN; only supported with ``-lto``.
-        Maps to ``-ptx``.
+        Emit PTX after linking instead of CUBIN; only supported with ``link_time_optimization=True``.
         Default: False.
     optimization_level : int, optional
         Set optimization level. Only 0 and 3 are accepted.
-        Maps to ``-O<N>``.
     debug : bool, optional
         Generate debug information.
-        Maps to ``-g``
         Default: False.
     lineinfo : bool, optional
         Generate line information.
-        Maps to ``-lineinfo``.
         Default: False.
     ftz : bool, optional
         Flush denormal values to zero.
-        Maps to ``-ftz=<n>``.
         Default: False.
     prec_div : bool, optional
         Use precise division.
-        Maps to ``-prec-div=<n>``.
         Default: True.
     prec_sqrt : bool, optional
         Use precise square root.
-        Maps to ``-prec-sqrt=<n>``.
         Default: True.
     fma : bool, optional
         Use fast multiply-add.
-        Maps to ``-fma=<n>``.
         Default: True.
     kernels_used : List[str], optional
         Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple
         times.
-        Maps to ``-kernels-used=<name>``.
     variables_used : List[str], optional
         Pass a list of variables that are used; any not in the list can be removed.
-        Maps to ``-variables-used=<name>``
     optimize_unused_variables : bool, optional
         Assume that if a variable is not referenced in device code, it can be removed.
-        Maps to: ``-optimize-unused-variables``
         Default: False.
     xptxas : List[str], optional
         Pass options to PTXAS.
-        Maps to: ``-Xptxas=<opt>``.
     split_compile : int, optional
         Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split
         compilation (default).
-        Maps to ``-split-compile=<N>``.
         Default: 1.
     split_compile_extended : int, optional
         A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value.
         Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This
         option can potentially impact performance of the compiled binary.
-        Maps to ``-split-compile-extended=<N>``.
         Default: 1.
     no_cache : bool, optional
         Do not cache the intermediate steps of nvJitLink.
-        Maps to ``-no-cache``.
         Default: False.
     """
 
@@ -290,19 +271,19 @@ def _init_driver(self):
             self.formatted_options.append(1)
             self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO)
         if self.ftz is not None:
-            raise ValueError("ftz option is deprecated in the driver API")
+            warn("ftz option is deprecated in the driver API", DeprecationWarning, stacklevel=3)
         if self.prec_div is not None:
-            raise ValueError("prec_div option is deprecated in the driver API")
+            warn("prec_div option is deprecated in the driver API", DeprecationWarning, stacklevel=3)
         if self.prec_sqrt is not None:
-            raise ValueError("prec_sqrt option is deprecated in the driver API")
+            warn("prec_sqrt option is deprecated in the driver API", DeprecationWarning, stacklevel=3)
         if self.fma is not None:
-            raise ValueError("fma options is deprecated in the driver API")
+            warn("fma options is deprecated in the driver API", DeprecationWarning, stacklevel=3)
         if self.kernels_used is not None:
-            raise ValueError("kernels_used is deprecated in the driver API")
+            warn("kernels_used is deprecated in the driver API", DeprecationWarning, stacklevel=3)
         if self.variables_used is not None:
-            raise ValueError("variables_used is deprecated in the driver API")
+            warn("variables_used is deprecated in the driver API", DeprecationWarning, stacklevel=3)
         if self.optimize_unused_variables is not None:
-            raise ValueError("optimize_unused_variables is deprecated in the driver API")
+            warn("optimize_unused_variables is deprecated in the driver API", DeprecationWarning, stacklevel=3)
         if self.xptxas is not None:
             raise ValueError("xptxas option is not supported by the driver API")
         if self.split_compile is not None:
diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index f0870548..ed7c38c2 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -32,157 +32,120 @@ class ProgramOptions:
     relocatable_device_code : bool, optional
         Enable (disable) the generation of relocatable device code.
         Default: False
-        Maps to: ``--relocatable-device-code={true|false}`` (``-rdc``)
     extensible_whole_program : bool, optional
         Do extensible whole program compilation of device code.
         Default: False
-        Maps to: ``--extensible-whole-program`` (``-ewp``)
     debug : bool, optional
         Generate debug information. If --dopt is not specified, then turns off all optimizations.
         Default: False
-        Maps to: ``--device-debug`` (``-G``)
     lineinfo: bool, optional
         Generate line-number information.
         Default: False
-        Maps to: ``--generate-line-info`` (``-lineinfo``)
     device_code_optimize : bool, optional
         Enable device code optimization. When specified along with ‘-G’, enables limited debug information generation
         for optimized device code.
         Default: None
-        Maps to: ``--dopt on`` (``-dopt``)
     ptxas_options : Union[str, List[str]], optional
         Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings.
         For example ["-v", "-O2"].
         Default: None
-        Maps to: ``--ptxas-options <options>`` (``-Xptxas``)
     max_register_count : int, optional
         Specify the maximum amount of registers that GPU functions can use.
         Default: None
-        Maps to: ``--maxrregcount=<N>`` (``-maxrregcount``)
     ftz : bool, optional
         When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal
         values.
         Default: False
-        Maps to: ``--ftz={true|false}`` (``-ftz``)
     prec_sqrt : bool, optional
         For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation.
         Default: True
-        Maps to: ``--prec-sqrt={true|false}`` (``-prec-sqrt``)
     prec_div : bool, optional
         For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster
         approximation.
         Default: True
-        Maps to: ``--prec-div={true|false}`` (``-prec-div``)
     fma : bool, optional
         Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point
         multiply-add operations.
         Default: True
-        Maps to: ``--fmad={true|false}`` (``-fmad``)
     use_fast_math : bool, optional
         Make use of fast math operations.
         Default: False
-        Maps to: ``--use_fast_math`` (``-use_fast_math``)
     extra_device_vectorization : bool, optional
         Enables more aggressive device code vectorization in the NVVM optimizer.
         Default: False
-        Maps to: ``--extra-device-vectorization`` (``-extra-device-vectorization``)
     link_time_optimization : bool, optional
         Generate intermediate code for later link-time optimization.
         Default: False
-        Maps to: ``--dlink-time-opt`` (``-dlto``)
     gen_opt_lto : bool, optional
         Run the optimizer passes before generating the LTO IR.
         Default: False
-        Maps to: ``--gen-opt-lto`` (``-gen-opt-lto``)
     define_macro : Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]], optional
         Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2 element tuple of
         strings, in which case the first element is defined as the second, or a list of strings or tuples.
         Default: None
-        Maps to: ``--define-macro=<def>`` (``-D``)
     undefine_macro : Union[str, List[str]], optional
         Cancel any previous definition of a macro, or list of macros.
         Default: None
-        Maps to: ``--undefine-macro=<def>`` (``-U``)
     include_path : Union[str, List[str]], optional
         Add the directory or directories to the list of directories to be searched for headers.
         Default: None
-        Maps to: ``--include-path=<dir>`` (``-I``)
     pre_include : Union[str, List[str]], optional
         Preinclude one or more headers during preprocessing. Can be either a string or a list of strings.
         Default: None
-        Maps to: ``--pre-include=<header>`` (``-include``)
     no_source_include : bool, optional
         Disable the default behavior of adding the directory of each input source to the include path.
         Default: False
-        Maps to: ``--no-source-include`` (``-no-source-include``)
     std : str, optional
         Set language dialect to C++03, C++11, C++14, C++17 or C++20.
         Default: c++17
-        Maps to: ``--std={c++03|c++11|c++14|c++17|c++20}`` (``-std``)
     builtin_move_forward : bool, optional
         Provide builtin definitions of std::move and std::forward.
         Default: True
-        Maps to: ``--builtin-move-forward={true|false}`` (``-builtin-move-forward``)
     builtin_initializer_list : bool, optional
         Provide builtin definitions of std::initializer_list class and member functions.
         Default: True
-        Maps to: ``--builtin-initializer-list={true|false}`` (``-builtin-initializer-list``)
     disable_warnings : bool, optional
         Inhibit all warning messages.
         Default: False
-        Maps to: ``--disable-warnings`` (``-w``)
     restrict : bool, optional
         Programmer assertion that all kernel pointer parameters are restrict pointers.
         Default: False
-        Maps to: ``--restrict`` (``-restrict``)
     device_as_default_execution_space : bool, optional
         Treat entities with no execution space annotation as __device__ entities.
         Default: False
-        Maps to: ``--device-as-default-execution-space`` (``-default-device``)
     device_int128 : bool, optional
         Allow the __int128 type in device code.
         Default: False
-        Maps to: ``--device-int128`` (``-device-int128``)
     optimization_info : str, optional
         Provide optimization reports for the specified kind of optimization.
         Default: None
-        Maps to: ``--optimization-info=<kind>`` (``-opt-info``)
     no_display_error_number : bool, optional
         Disable the display of a diagnostic number for warning messages.
         Default: False
-        Maps to: ``--no-display-error-number`` (``-no-err-no``)
     diag_error : Union[int, List[int]], optional
         Emit error for a specified diagnostic message number or comma separated list of numbers.
         Default: None
-        Maps to: ``--diag-error=<error-number>, ...`` (``-diag-error``)
     diag_suppress : Union[int, List[int]], optional
         Suppress a specified diagnostic message number or comma separated list of numbers.
         Default: None
-        Maps to: ``--diag-suppress=<error-number>,…`` (``-diag-suppress``)
     diag_warn : Union[int, List[int]], optional
         Emit warning for a specified diagnostic message number or comma separated lis of numbers.
         Default: None
-        Maps to: ``--diag-warn=<error-number>,…`` (``-diag-warn``)
     brief_diagnostics : bool, optional
         Disable or enable showing source line and column info in a diagnostic.
         Default: False
-        Maps to: ``--brief-diagnostics={true|false}`` (``-brief-diag``)
     time : str, optional
         Generate a CSV table with the time taken by each compilation phase.
         Default: None
-        Maps to: ``--time=<file-name>`` (``-time``)
     split_compile : int, optional
         Perform compiler optimizations in parallel.
         Default: 1
-        Maps to: ``--split-compile= <number of threads>`` (``-split-compile``)
     fdevice_syntax_only : bool, optional
         Ends device compilation after front-end syntax checking.
         Default: False
-        Maps to: ``--fdevice-syntax-only`` (``-fdevice-syntax-only``)
     minimal : bool, optional
         Omit certain language features to reduce compile time for small programs.
         Default: False
-        Maps to: ``--minimal`` (``-minimal``)
     """
 
     arch: Optional[str] = None
diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py
index b81c1654..0a1fffc5 100644
--- a/cuda_core/tests/test_linker.py
+++ b/cuda_core/tests/test_linker.py
@@ -52,8 +52,7 @@ def compile_ltoir_functions(init_cuda):
     LinkerOptions(arch=ARCH, max_register_count=32),
     LinkerOptions(arch=ARCH, optimization_level=3),
     LinkerOptions(arch=ARCH, debug=True),
-    LinkerOptions(arch=ARCH, lineinfo=True),
-    LinkerOptions(arch=ARCH, no_cache=True),  # TODO: consider adding cuda 12.4 to test matrix in which case this
+    LinkerOptions(arch=ARCH, lineinfo=True),  # TODO: consider adding cuda 12.4 to test matrix in which case this
     # will fail. Tracked in issue #337
 ]
 
@@ -65,6 +64,11 @@ def compile_ltoir_functions(init_cuda):
     else culink_options
     + [
         LinkerOptions(arch=ARCH, time=True),
+        LinkerOptions(arch=ARCH, optimize_unused_variables=True),
+        LinkerOptions(arch=ARCH, xptxas=["-v"]),
+        LinkerOptions(arch=ARCH, split_compile=0),
+        LinkerOptions(arch=ARCH, split_compile_extended=1),
+        # The following options are supported by nvjitlink and deprecated by culink
         LinkerOptions(arch=ARCH, ftz=True),
         LinkerOptions(arch=ARCH, prec_div=True),
         LinkerOptions(arch=ARCH, prec_sqrt=True),
@@ -73,10 +77,6 @@ def compile_ltoir_functions(init_cuda):
         LinkerOptions(arch=ARCH, kernels_used=["C", "B"]),
         LinkerOptions(arch=ARCH, variables_used=["var1"]),
         LinkerOptions(arch=ARCH, variables_used=["var1", "var2"]),
-        LinkerOptions(arch=ARCH, optimize_unused_variables=True),
-        LinkerOptions(arch=ARCH, xptxas=["-v"]),
-        LinkerOptions(arch=ARCH, split_compile=0),
-        LinkerOptions(arch=ARCH, split_compile_extended=1),
     ],
 )
 def test_linker_init(compile_ptx_functions, options):
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 0f9b8e3b..c0769ad1 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -9,16 +9,14 @@
 import pytest
 from conftest import can_load_generated_ptx
 
+from cuda.core.experimental import _linker
 from cuda.core.experimental._module import Kernel, ObjectCode
 from cuda.core.experimental._program import Program, ProgramOptions
 
 
-# TODO handle and test options whcih are only supported on more modern CUDA versions
-# tracked in #337
-def test_program_with_various_options(init_cuda):
-    code = 'extern "C" __global__ void my_kernel() {}'
-
-    options_list = [
+@pytest.mark.parametrize(
+    "options",
+    [
         ProgramOptions(device_code_optimize=True, debug=True),
         ProgramOptions(relocatable_device_code=True, max_register_count=32),
         ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False),
@@ -38,14 +36,44 @@ def test_program_with_various_options(init_cuda):
         ProgramOptions(diag_error=1234, diag_suppress=1234),
         ProgramOptions(diag_error=[1234, 1223], diag_suppress=(1234, 1223)),
         ProgramOptions(diag_warn=1000),
-    ]
+    ],
+)
+def test_cpp_program_with_various_options(init_cuda, options):
+    code = 'extern "C" __global__ void my_kernel() {}'
+    program = Program(code, "c++", options)
+    assert program.backend == "nvrtc"
+    program.compile("ptx")
+    program.close()
+    assert program.handle is None
 
-    for options in options_list:
-        program = Program(code, "c++", options)
-        assert program.backend == "nvrtc"
-        program.compile("ptx")
-        program.close()
-        assert program.handle is None
+
+culink_options = [
+    ProgramOptions(max_register_count=32),
+    ProgramOptions(debug=True),
+    ProgramOptions(lineinfo=True),
+    ProgramOptions(ftz=True),
+    ProgramOptions(prec_div=True),
+    ProgramOptions(prec_sqrt=True),
+    ProgramOptions(fma=True),
+]
+nvjitlink_options = [
+    ProgramOptions(time=True),
+    ProgramOptions(split_compile=True),
+]
+
+
+@pytest.mark.parametrize(
+    "options", culink_options if _linker._decide_nvjitlink_or_driver() else culink_options + nvjitlink_options
+)
+def test_ptx_program_with_various_options(init_cuda, options):
+    code = 'extern "C" __global__ void my_kernel() {}'
+    program = Program(code, "c++")
+    ptx_object_code = program.compile("ptx")
+    program = Program(ptx_object_code._module.decode(), "ptx", options=options)
+    assert program.backend == "linker"
+    program.compile("cubin")
+    program.close()
+    assert program.handle is None
 
 
 def test_program_init_valid_code_type():
@@ -70,13 +98,18 @@ def test_program_init_invalid_code_format():
 # TODO: incorporate this check in Program
 # This is tested against the current device's arch
 @pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new")
-def test_program_compile_valid_target_type():
+def test_program_compile_valid_target_type(init_cuda):
     code = 'extern "C" __global__ void my_kernel() {}'
     program = Program(code, "c++")
-    object_code = program.compile("ptx")
-    kernel = object_code.get_kernel("my_kernel")
-    assert isinstance(object_code, ObjectCode)
-    assert isinstance(kernel, Kernel)
+    ptx_object_code = program.compile("ptx")
+    program = Program(ptx_object_code._module.decode(), "ptx")
+    cubin_object_code = program.compile("cubin")
+    ptx_kernel = ptx_object_code.get_kernel("my_kernel")
+    cubin_kernel = cubin_object_code.get_kernel("my_kernel")
+    assert isinstance(ptx_object_code, ObjectCode)
+    assert isinstance(cubin_object_code, ObjectCode)
+    assert isinstance(ptx_kernel, Kernel)
+    assert isinstance(cubin_kernel, Kernel)
 
 
 def test_program_compile_invalid_target_type():

From 41bfab7f33e17b46b24b3a494518780c49f3f411 Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Mon, 20 Jan 2025 15:41:25 -0800
Subject: [PATCH 4/8] remove redundant todo

---
 cuda_core/cuda/core/experimental/_program.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index ed7c38c2..2f4ead64 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -384,8 +384,6 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
         elif code_type == "ptx":
             if not isinstance(code, str):
                 raise TypeError
-            # TODO: support pre-loaded headers & include names
-            # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
             self._linker = Linker(
                 ObjectCode(code.encode(), code_type), options=self._translate_program_options(options)
             )

From af9b5477d67692c9522f21e2bf9b8e61205f16a5 Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Tue, 28 Jan 2025 11:54:39 -0800
Subject: [PATCH 5/8] fix bug

---
 cuda_core/cuda/core/experimental/_linker.py |  4 ++--
 cuda_core/tests/test_program.py             | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py
index 0f737805..3eb71f4e 100644
--- a/cuda_core/cuda/core/experimental/_linker.py
+++ b/cuda_core/cuda/core/experimental/_linker.py
@@ -23,11 +23,11 @@
 
 
 # Note: this function is reused in the tests
-def _decide_nvjitlink_or_driver():
+def _decide_nvjitlink_or_driver() -> bool:
     """Returns True if falling back to the cuLink* driver APIs."""
     global _driver_ver, _driver, _nvjitlink
     if _driver or _nvjitlink:
-        return
+        return _driver is not None
 
     _driver_ver = handle_return(driver.cuDriverGetVersion())
     _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10)
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index c0769ad1..11b2e854 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -47,7 +47,7 @@ def test_cpp_program_with_various_options(init_cuda, options):
     assert program.handle is None
 
 
-culink_options = [
+options = [
     ProgramOptions(max_register_count=32),
     ProgramOptions(debug=True),
     ProgramOptions(lineinfo=True),
@@ -56,15 +56,15 @@ def test_cpp_program_with_various_options(init_cuda, options):
     ProgramOptions(prec_sqrt=True),
     ProgramOptions(fma=True),
 ]
-nvjitlink_options = [
-    ProgramOptions(time=True),
-    ProgramOptions(split_compile=True),
-]
+if not _linker._decide_nvjitlink_or_driver():
+    print("Using nvjitlink as the backend because decide() returned false")
+    options += [
+        ProgramOptions(time=True),
+        ProgramOptions(split_compile=True),
+    ]
 
 
-@pytest.mark.parametrize(
-    "options", culink_options if _linker._decide_nvjitlink_or_driver() else culink_options + nvjitlink_options
-)
+@pytest.mark.parametrize("options", options)
 def test_ptx_program_with_various_options(init_cuda, options):
     code = 'extern "C" __global__ void my_kernel() {}'
     program = Program(code, "c++")

From cde6fdff695f893f58ae11631d71284e32b28028 Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Mon, 3 Feb 2025 11:30:45 -0800
Subject: [PATCH 6/8] various changes to address review comments

---
 cuda_core/cuda/core/experimental/_linker.py  | 12 ++---
 cuda_core/cuda/core/experimental/_program.py | 49 +++++++++++---------
 cuda_core/tests/test_linker.py               |  4 +-
 cuda_core/tests/test_program.py              | 15 ++++--
 4 files changed, 47 insertions(+), 33 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py
index 3eb71f4e..976d739f 100644
--- a/cuda_core/cuda/core/experimental/_linker.py
+++ b/cuda_core/cuda/core/experimental/_linker.py
@@ -138,7 +138,7 @@ class LinkerOptions:
     optimize_unused_variables : bool, optional
         Assume that if a variable is not referenced in device code, it can be removed.
         Default: False.
-    xptxas : List[str], optional
+    ptxas_options : List[str], optional
         Pass options to PTXAS.
     split_compile : int, optional
         Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split
@@ -170,7 +170,7 @@ class LinkerOptions:
     kernels_used: Optional[List[str]] = None
     variables_used: Optional[List[str]] = None
     optimize_unused_variables: Optional[bool] = None
-    xptxas: Optional[List[str]] = None
+    ptxas_options: Optional[List[str]] = None
     split_compile: Optional[int] = None
     split_compile_extended: Optional[int] = None
     no_cache: Optional[bool] = None
@@ -220,8 +220,8 @@ def _init_nvjitlink(self):
                 self.formatted_options.append(f"-variables-used={variable}")
         if self.optimize_unused_variables is not None:
             self.formatted_options.append("-optimize-unused-variables")
-        if self.xptxas is not None:
-            for opt in self.xptxas:
+        if self.ptxas_options is not None:
+            for opt in self.ptxas_options:
                 self.formatted_options.append(f"-Xptxas={opt}")
         if self.split_compile is not None:
             self.formatted_options.append(f"-split-compile={self.split_compile}")
@@ -284,8 +284,8 @@ def _init_driver(self):
             warn("variables_used is deprecated in the driver API", DeprecationWarning, stacklevel=3)
         if self.optimize_unused_variables is not None:
             warn("optimize_unused_variables is deprecated in the driver API", DeprecationWarning, stacklevel=3)
-        if self.xptxas is not None:
-            raise ValueError("xptxas option is not supported by the driver API")
+        if self.ptxas_options is not None:
+            raise ValueError("ptxas_options option is not supported by the driver API")
         if self.split_compile is not None:
             raise ValueError("split_compile option is not supported by the driver API")
         if self.split_compile_extended is not None:
diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index 2f4ead64..fc47ff85 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -342,23 +342,23 @@ class Program:
     code : Any
         String of the CUDA Runtime Compilation program.
     code_type : Any
-        String of the code type. Currently only ``"c++"`` is supported.
+        String of the code type. Currently ``"ptx"`` and ``"c++"`` are supported.
     options : ProgramOptions, optional
         A ProgramOptions object to customize the compilation process.
         See :obj:`ProgramOptions` for more information.
     """
 
     class _MembersNeededForFinalize:
-        __slots__ = "handle"
+        __slots__ = "nvrtc_handle"
 
-        def __init__(self, program_obj, handle):
-            self.handle = handle
+        def __init__(self, program_obj, nvrtc_handle):
+            self.nvrtc_handle = nvrtc_handle
             weakref.finalize(program_obj, self.close)
 
         def close(self):
-            if self.handle is not None:
-                handle_return(nvrtc.nvrtcDestroyProgram(self.handle))
-                self.handle = None
+            if self.nvrtc_handle is not None:
+                handle_return(nvrtc.nvrtcDestroyProgram(self.nvrtc_handle))
+                self.nvrtc_handle = None
 
     __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options")
     _supported_code_type = ("c++", "ptx")
@@ -375,15 +375,16 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
 
         if code_type == "c++":
             if not isinstance(code, str):
-                raise TypeError
+                raise TypeError("c++ Program expects code argument to be a string")
             # TODO: support pre-loaded headers & include names
             # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
-            self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], []))
+            self._mnff.nvrtc_handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], []))
             self._backend = "nvrtc"
+            self._linker = None
 
         elif code_type == "ptx":
             if not isinstance(code, str):
-                raise TypeError
+                raise TypeError("ptx Program expects code argument to be a string")
             self._linker = Linker(
                 ObjectCode(code.encode(), code_type), options=self._translate_program_options(options)
             )
@@ -404,6 +405,7 @@ def _translate_program_options(self, options: ProgramOptions) -> LinkerOptions:
             fma=options.fma,
             link_time_optimization=options.link_time_optimization,
             split_compile=options.split_compile,
+            ptxas_options=options.ptxas_options,
         )
 
     def close(self):
@@ -438,35 +440,40 @@ def compile(self, target_type, name_expressions=(), logs=None):
         if self._backend == "nvrtc":
             if name_expressions:
                 for n in name_expressions:
-                    handle_return(nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()), handle=self._mnff.handle)
+                    handle_return(
+                        nvrtc.nvrtcAddNameExpression(self._mnff.nvrtc_handle, n.encode()),
+                        handle=self._mnff.nvrtc_handle,
+                    )
             options = self._options._as_bytes()
             handle_return(
-                nvrtc.nvrtcCompileProgram(self._mnff.handle, len(options), options),
-                handle=self._mnff.handle,
+                nvrtc.nvrtcCompileProgram(self._mnff.nvrtc_handle, len(options), options),
+                handle=self._mnff.nvrtc_handle,
             )
 
             size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size")
             comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}")
-            size = handle_return(size_func(self._mnff.handle), handle=self._mnff.handle)
+            size = handle_return(size_func(self._mnff.nvrtc_handle), handle=self._mnff.nvrtc_handle)
             data = b" " * size
-            handle_return(comp_func(self._mnff.handle, data), handle=self._mnff.handle)
+            handle_return(comp_func(self._mnff.nvrtc_handle, data), handle=self._mnff.nvrtc_handle)
 
             symbol_mapping = {}
             if name_expressions:
                 for n in name_expressions:
                     symbol_mapping[n] = handle_return(
-                        nvrtc.nvrtcGetLoweredName(self._mnff.handle, n.encode()), handle=self._mnff.handle
+                        nvrtc.nvrtcGetLoweredName(self._mnff.nvrtc_handle, n.encode()), handle=self._mnff.nvrtc_handle
                     )
 
             if logs is not None:
-                logsize = handle_return(nvrtc.nvrtcGetProgramLogSize(self._mnff.handle), handle=self._mnff.handle)
+                logsize = handle_return(
+                    nvrtc.nvrtcGetProgramLogSize(self._mnff.nvrtc_handle), handle=self._mnff.nvrtc_handle
+                )
                 if logsize > 1:
                     log = b" " * logsize
-                    handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle)
+                    handle_return(
+                        nvrtc.nvrtcGetProgramLog(self._mnff.nvrtc_handle, log), handle=self._mnff.nvrtc_handle
+                    )
                     logs.write(log.decode())
 
-            # TODO: handle jit_options for ptx?
-
             return ObjectCode(data, target_type, symbol_mapping=symbol_mapping)
 
         if self._backend == "linker":
@@ -480,4 +487,4 @@ def backend(self):
     @property
     def handle(self):
         """Return the program handle object."""
-        return self._mnff.handle
+        return self._mnff.nvrtc_handle
diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py
index 0a1fffc5..627eb2b2 100644
--- a/cuda_core/tests/test_linker.py
+++ b/cuda_core/tests/test_linker.py
@@ -52,7 +52,9 @@ def compile_ltoir_functions(init_cuda):
     LinkerOptions(arch=ARCH, max_register_count=32),
     LinkerOptions(arch=ARCH, optimization_level=3),
     LinkerOptions(arch=ARCH, debug=True),
-    LinkerOptions(arch=ARCH, lineinfo=True),  # TODO: consider adding cuda 12.4 to test matrix in which case this
+    LinkerOptions(arch=ARCH, lineinfo=True),
+    LinkerOptions(arch=ARCH, lineinfo=True),
+    LinkerOptions(arch=ARCH, no_cache=True),  # TODO: consider adding cuda 12.4 to test matrix in which case this
     # will fail. Tracked in issue #337
 ]
 
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index 11b2e854..d9c5cbde 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -14,6 +14,14 @@
 from cuda.core.experimental._program import Program, ProgramOptions
 
 
+@pytest.fixture(scope="module")
+def ptx_code_object():
+    code = 'extern "C" __global__ void my_kernel() {}'
+    program = Program(code, "c++")
+    ptx_object_code = program.compile("ptx")
+    return ptx_object_code
+
+
 @pytest.mark.parametrize(
     "options",
     [
@@ -65,11 +73,8 @@ def test_cpp_program_with_various_options(init_cuda, options):
 
 
 @pytest.mark.parametrize("options", options)
-def test_ptx_program_with_various_options(init_cuda, options):
-    code = 'extern "C" __global__ void my_kernel() {}'
-    program = Program(code, "c++")
-    ptx_object_code = program.compile("ptx")
-    program = Program(ptx_object_code._module.decode(), "ptx", options=options)
+def test_ptx_program_with_various_options(init_cuda, ptx_code_object, options):
+    program = Program(ptx_code_object._module.decode(), "ptx", options=options)
     assert program.backend == "linker"
     program.compile("cubin")
     program.close()

From e079135a728001982291a1e0b50b5f3eac2c14ea Mon Sep 17 00:00:00 2001
From: ksimpson <ksimpson@nvidia.com>
Date: Mon, 3 Feb 2025 14:02:57 -0800
Subject: [PATCH 7/8] fix test bug and forec finalize the linker

---
 cuda_core/cuda/core/experimental/_program.py | 40 ++++++++++----------
 cuda_core/tests/test_linker.py               |  2 +-
 2 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
index fc47ff85..f938895e 100644
--- a/cuda_core/cuda/core/experimental/_program.py
+++ b/cuda_core/cuda/core/experimental/_program.py
@@ -349,16 +349,16 @@ class Program:
     """
 
     class _MembersNeededForFinalize:
-        __slots__ = "nvrtc_handle"
+        __slots__ = "handle"
 
-        def __init__(self, program_obj, nvrtc_handle):
-            self.nvrtc_handle = nvrtc_handle
+        def __init__(self, program_obj, handle):
+            self.handle = handle
             weakref.finalize(program_obj, self.close)
 
         def close(self):
-            if self.nvrtc_handle is not None:
-                handle_return(nvrtc.nvrtcDestroyProgram(self.nvrtc_handle))
-                self.nvrtc_handle = None
+            if self.handle is not None:
+                handle_return(nvrtc.nvrtcDestroyProgram(self.handle))
+                self.handle = None
 
     __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options")
     _supported_code_type = ("c++", "ptx")
@@ -378,7 +378,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None):
                 raise TypeError("c++ Program expects code argument to be a string")
             # TODO: support pre-loaded headers & include names
             # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
-            self._mnff.nvrtc_handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], []))
+            self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], []))
             self._backend = "nvrtc"
             self._linker = None
 
@@ -410,6 +410,8 @@ def _translate_program_options(self, options: ProgramOptions) -> LinkerOptions:
 
     def close(self):
         """Destroy this program."""
+        if self._linker:
+            self._linker.close()
         self._mnff.close()
 
     def compile(self, target_type, name_expressions=(), logs=None):
@@ -441,37 +443,33 @@ def compile(self, target_type, name_expressions=(), logs=None):
             if name_expressions:
                 for n in name_expressions:
                     handle_return(
-                        nvrtc.nvrtcAddNameExpression(self._mnff.nvrtc_handle, n.encode()),
-                        handle=self._mnff.nvrtc_handle,
+                        nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()),
+                        handle=self._mnff.handle,
                     )
             options = self._options._as_bytes()
             handle_return(
-                nvrtc.nvrtcCompileProgram(self._mnff.nvrtc_handle, len(options), options),
-                handle=self._mnff.nvrtc_handle,
+                nvrtc.nvrtcCompileProgram(self._mnff.handle, len(options), options),
+                handle=self._mnff.handle,
             )
 
             size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size")
             comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}")
-            size = handle_return(size_func(self._mnff.nvrtc_handle), handle=self._mnff.nvrtc_handle)
+            size = handle_return(size_func(self._mnff.handle), handle=self._mnff.handle)
             data = b" " * size
-            handle_return(comp_func(self._mnff.nvrtc_handle, data), handle=self._mnff.nvrtc_handle)
+            handle_return(comp_func(self._mnff.handle, data), handle=self._mnff.handle)
 
             symbol_mapping = {}
             if name_expressions:
                 for n in name_expressions:
                     symbol_mapping[n] = handle_return(
-                        nvrtc.nvrtcGetLoweredName(self._mnff.nvrtc_handle, n.encode()), handle=self._mnff.nvrtc_handle
+                        nvrtc.nvrtcGetLoweredName(self._mnff.handle, n.encode()), handle=self._mnff.handle
                     )
 
             if logs is not None:
-                logsize = handle_return(
-                    nvrtc.nvrtcGetProgramLogSize(self._mnff.nvrtc_handle), handle=self._mnff.nvrtc_handle
-                )
+                logsize = handle_return(nvrtc.nvrtcGetProgramLogSize(self._mnff.handle), handle=self._mnff.handle)
                 if logsize > 1:
                     log = b" " * logsize
-                    handle_return(
-                        nvrtc.nvrtcGetProgramLog(self._mnff.nvrtc_handle, log), handle=self._mnff.nvrtc_handle
-                    )
+                    handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle)
                     logs.write(log.decode())
 
             return ObjectCode(data, target_type, symbol_mapping=symbol_mapping)
@@ -487,4 +485,4 @@ def backend(self):
     @property
     def handle(self):
         """Return the program handle object."""
-        return self._mnff.nvrtc_handle
+        return self._mnff.handle
diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py
index 627eb2b2..ef6273e7 100644
--- a/cuda_core/tests/test_linker.py
+++ b/cuda_core/tests/test_linker.py
@@ -67,7 +67,7 @@ def compile_ltoir_functions(init_cuda):
     + [
         LinkerOptions(arch=ARCH, time=True),
         LinkerOptions(arch=ARCH, optimize_unused_variables=True),
-        LinkerOptions(arch=ARCH, xptxas=["-v"]),
+        LinkerOptions(arch=ARCH, ptxas_options=["-v"]),
         LinkerOptions(arch=ARCH, split_compile=0),
         LinkerOptions(arch=ARCH, split_compile_extended=1),
         # The following options are supported by nvjitlink and deprecated by culink

From ec111b97942e92beed275e27ec969ab91cd0e983 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Tue, 4 Feb 2025 00:23:05 -0500
Subject: [PATCH 8/8] fix typo

---
 cuda_core/tests/test_linker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py
index a581e75d..556fe9f7 100644
--- a/cuda_core/tests/test_linker.py
+++ b/cuda_core/tests/test_linker.py
@@ -70,7 +70,7 @@ def compile_ltoir_functions(init_cuda):
         LinkerOptions(arch=ARCH, kernels_used=["C", "B"]),
         LinkerOptions(arch=ARCH, variables_used=["var1"]),
         LinkerOptions(arch=ARCH, variables_used=["var1", "var2"]),
-    ],
+    ]
     version = nvjitlink.version()
     if version >= (12, 5):
         options.append(LinkerOptions(arch=ARCH, no_cache=True))