From b48762d86fe213ae5d924c222f71c94b0196f6f8 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Thu, 19 Dec 2024 11:47:46 -0800 Subject: [PATCH 1/8] support ptx code type for program --- cuda_core/cuda/core/experimental/_program.py | 11 +++++++++-- cuda_core/tests/test_program.py | 14 +++++++++----- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 0dac79eb..8f7e6a9a 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -39,22 +39,29 @@ def close(self): self.handle = None __slots__ = ("__weakref__", "_mnff", "_backend") - _supported_code_type = ("c++",) + _supported_code_type = ("c++", "ptx") _supported_target_type = ("ptx", "cubin", "ltoir") def __init__(self, code, code_type): self._mnff = Program._MembersNeededForFinalize(self, None) + code_type = code_type.lower() if code_type not in self._supported_code_type: raise NotImplementedError - if code_type.lower() == "c++": + if code_type == "c++": if not isinstance(code, str): raise TypeError # TODO: support pre-loaded headers & include names # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) self._backend = "nvrtc" + + elif code_type == "ptx": + if not isinstance(code, str): + raise TypeError + self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) + self._backend = "nvrtc" else: raise NotImplementedError diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index cca01af5..553ffef3 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -38,11 +38,15 @@ def test_program_compile_valid_target_type(): code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") arch = "".join(str(i) for i in Device().compute_capability) - object_code = program.compile("ptx", options=(f"-arch=compute_{arch}",)) - print(object_code._module.decode()) - kernel = object_code.get_kernel("my_kernel") - assert isinstance(object_code, ObjectCode) - assert isinstance(kernel, Kernel) + ptx_object_code = program.compile("ptx", options=(f"-arch=compute_{arch}",)) + program = Program(ptx_object_code.code, "ptx") + cubin_object_code = program.compile("cubin", options=(f"-arch=compute_{arch}",)) + ptx_kernel = ptx_object_code.get_kernel("my_kernel") + cubin_kernel = cubin_object_code.get_kernel("my_kernel") + assert isinstance(ptx_object_code, ObjectCode) + assert isinstance(cubin_object_code, ObjectCode) + assert isinstance(ptx_kernel, Kernel) + assert isinstance(cubin_kernel, Kernel) def test_program_compile_invalid_target_type(): From 7c39f7b80a58f77ff7b8587d049d3769b5f5d777 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Fri, 20 Dec 2024 12:53:47 -0800 Subject: [PATCH 2/8] use linker as the backend when Program takes ptx. WIP needs the ProgramOptions change to go in --- cuda_core/cuda/core/experimental/_program.py | 16 ++++++++++++---- cuda_core/tests/test_program.py | 3 ++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 8f7e6a9a..d947cfd2 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -5,6 +5,7 @@ import weakref from cuda import nvrtc +from cuda.core.experimental._linker import Linker, LinkerOptions from cuda.core.experimental._module import ObjectCode from cuda.core.experimental._utils import handle_return @@ -27,7 +28,7 @@ class Program: """ class _MembersNeededForFinalize: - __slots__ = ("handle",) + __slots__ = "handle" def __init__(self, program_obj, handle): self.handle = handle @@ -38,7 +39,7 @@ def close(self): handle_return(nvrtc.nvrtcDestroyProgram(self.handle)) self.handle = None - __slots__ = ("__weakref__", "_mnff", "_backend") + __slots__ = ("__weakref__", "_mnff", "_backend", "_linker") _supported_code_type = ("c++", "ptx") _supported_target_type = ("ptx", "cubin", "ltoir") @@ -60,11 +61,15 @@ def __init__(self, code, code_type): elif code_type == "ptx": if not isinstance(code, str): raise TypeError - self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) - self._backend = "nvrtc" + # TODO: support pre-loaded headers & include names + # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved + self._linker = Linker(ObjectCode(code.encode(), code_type), options=LinkerOptions(arch="sm_89")) + self._backend = "linker" else: raise NotImplementedError + print(self._backend) + def close(self): """Destroy this program.""" self._mnff.close() @@ -129,6 +134,9 @@ def compile(self, target_type, options=(), name_expressions=(), logs=None): return ObjectCode(data, target_type, symbol_mapping=symbol_mapping) + if self._backend == "linker": + return self._linker.link(target_type) + @property def backend(self): """Return the backend type string associated with this program.""" diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 553ffef3..01ab2724 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -39,7 +39,8 @@ def test_program_compile_valid_target_type(): program = Program(code, "c++") arch = "".join(str(i) for i in Device().compute_capability) ptx_object_code = program.compile("ptx", options=(f"-arch=compute_{arch}",)) - program = Program(ptx_object_code.code, "ptx") + print(ptx_object_code._module.decode()) + program = Program(ptx_object_code._module.decode(), "ptx") cubin_object_code = program.compile("cubin", options=(f"-arch=compute_{arch}",)) ptx_kernel = ptx_object_code.get_kernel("my_kernel") cubin_kernel = cubin_object_code.get_kernel("my_kernel") From 7a8f7735dd4d1955ac36f5272573b276c665ff3c Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 20 Jan 2025 15:39:15 -0800 Subject: [PATCH 3/8] add plumbing --- cuda_core/cuda/core/experimental/_linker.py | 39 +++-------- cuda_core/cuda/core/experimental/_program.py | 37 ----------- cuda_core/tests/test_linker.py | 12 ++-- cuda_core/tests/test_program.py | 69 +++++++++++++++----- 4 files changed, 67 insertions(+), 90 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index b5a6b675..0f737805 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import ctypes -import warnings import weakref from contextlib import contextmanager from dataclasses import dataclass from typing import List, Optional +from warnings import warn from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode @@ -43,7 +43,7 @@ def _decide_nvjitlink_or_driver(): _nvjitlink = None if _nvjitlink is None: - warnings.warn( + warn( "nvJitLink is not installed or too old (<12.3). Therefore it is not usable " "and the culink APIs will be used instead.", stacklevel=3, @@ -98,78 +98,59 @@ class LinkerOptions: will be used. max_register_count : int, optional Maximum register count. - Maps to: ``-maxrregcount=``. time : bool, optional Print timing information to the info log. - Maps to ``-time``. Default: False. verbose : bool, optional Print verbose messages to the info log. - Maps to ``-verbose``. Default: False. link_time_optimization : bool, optional Perform link time optimization. - Maps to: ``-lto``. Default: False. ptx : bool, optional - Emit PTX after linking instead of CUBIN; only supported with ``-lto``. - Maps to ``-ptx``. + Emit PTX after linking instead of CUBIN; only supported with ``link_time_optimization=True``. Default: False. optimization_level : int, optional Set optimization level. Only 0 and 3 are accepted. - Maps to ``-O``. debug : bool, optional Generate debug information. - Maps to ``-g`` Default: False. lineinfo : bool, optional Generate line information. - Maps to ``-lineinfo``. Default: False. ftz : bool, optional Flush denormal values to zero. - Maps to ``-ftz=``. Default: False. prec_div : bool, optional Use precise division. - Maps to ``-prec-div=``. Default: True. prec_sqrt : bool, optional Use precise square root. - Maps to ``-prec-sqrt=``. Default: True. fma : bool, optional Use fast multiply-add. - Maps to ``-fma=``. Default: True. kernels_used : List[str], optional Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple times. - Maps to ``-kernels-used=``. variables_used : List[str], optional Pass a list of variables that are used; any not in the list can be removed. - Maps to ``-variables-used=`` optimize_unused_variables : bool, optional Assume that if a variable is not referenced in device code, it can be removed. - Maps to: ``-optimize-unused-variables`` Default: False. xptxas : List[str], optional Pass options to PTXAS. - Maps to: ``-Xptxas=``. split_compile : int, optional Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split compilation (default). - Maps to ``-split-compile=``. Default: 1. split_compile_extended : int, optional A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value. Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This option can potentially impact performance of the compiled binary. - Maps to ``-split-compile-extended=``. Default: 1. no_cache : bool, optional Do not cache the intermediate steps of nvJitLink. - Maps to ``-no-cache``. Default: False. """ @@ -290,19 +271,19 @@ def _init_driver(self): self.formatted_options.append(1) self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO) if self.ftz is not None: - raise ValueError("ftz option is deprecated in the driver API") + warn("ftz option is deprecated in the driver API", DeprecationWarning, stacklevel=3) if self.prec_div is not None: - raise ValueError("prec_div option is deprecated in the driver API") + warn("prec_div option is deprecated in the driver API", DeprecationWarning, stacklevel=3) if self.prec_sqrt is not None: - raise ValueError("prec_sqrt option is deprecated in the driver API") + warn("prec_sqrt option is deprecated in the driver API", DeprecationWarning, stacklevel=3) if self.fma is not None: - raise ValueError("fma options is deprecated in the driver API") + warn("fma options is deprecated in the driver API", DeprecationWarning, stacklevel=3) if self.kernels_used is not None: - raise ValueError("kernels_used is deprecated in the driver API") + warn("kernels_used is deprecated in the driver API", DeprecationWarning, stacklevel=3) if self.variables_used is not None: - raise ValueError("variables_used is deprecated in the driver API") + warn("variables_used is deprecated in the driver API", DeprecationWarning, stacklevel=3) if self.optimize_unused_variables is not None: - raise ValueError("optimize_unused_variables is deprecated in the driver API") + warn("optimize_unused_variables is deprecated in the driver API", DeprecationWarning, stacklevel=3) if self.xptxas is not None: raise ValueError("xptxas option is not supported by the driver API") if self.split_compile is not None: diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index f0870548..ed7c38c2 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -32,157 +32,120 @@ class ProgramOptions: relocatable_device_code : bool, optional Enable (disable) the generation of relocatable device code. Default: False - Maps to: ``--relocatable-device-code={true|false}`` (``-rdc``) extensible_whole_program : bool, optional Do extensible whole program compilation of device code. Default: False - Maps to: ``--extensible-whole-program`` (``-ewp``) debug : bool, optional Generate debug information. If --dopt is not specified, then turns off all optimizations. Default: False - Maps to: ``--device-debug`` (``-G``) lineinfo: bool, optional Generate line-number information. Default: False - Maps to: ``--generate-line-info`` (``-lineinfo``) device_code_optimize : bool, optional Enable device code optimization. When specified along with ā€˜-Gā€™, enables limited debug information generation for optimized device code. Default: None - Maps to: ``--dopt on`` (``-dopt``) ptxas_options : Union[str, List[str]], optional Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings. For example ["-v", "-O2"]. Default: None - Maps to: ``--ptxas-options `` (``-Xptxas``) max_register_count : int, optional Specify the maximum amount of registers that GPU functions can use. Default: None - Maps to: ``--maxrregcount=`` (``-maxrregcount``) ftz : bool, optional When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal values. Default: False - Maps to: ``--ftz={true|false}`` (``-ftz``) prec_sqrt : bool, optional For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation. Default: True - Maps to: ``--prec-sqrt={true|false}`` (``-prec-sqrt``) prec_div : bool, optional For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster approximation. Default: True - Maps to: ``--prec-div={true|false}`` (``-prec-div``) fma : bool, optional Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point multiply-add operations. Default: True - Maps to: ``--fmad={true|false}`` (``-fmad``) use_fast_math : bool, optional Make use of fast math operations. Default: False - Maps to: ``--use_fast_math`` (``-use_fast_math``) extra_device_vectorization : bool, optional Enables more aggressive device code vectorization in the NVVM optimizer. Default: False - Maps to: ``--extra-device-vectorization`` (``-extra-device-vectorization``) link_time_optimization : bool, optional Generate intermediate code for later link-time optimization. Default: False - Maps to: ``--dlink-time-opt`` (``-dlto``) gen_opt_lto : bool, optional Run the optimizer passes before generating the LTO IR. Default: False - Maps to: ``--gen-opt-lto`` (``-gen-opt-lto``) define_macro : Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]], optional Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2 element tuple of strings, in which case the first element is defined as the second, or a list of strings or tuples. Default: None - Maps to: ``--define-macro=`` (``-D``) undefine_macro : Union[str, List[str]], optional Cancel any previous definition of a macro, or list of macros. Default: None - Maps to: ``--undefine-macro=`` (``-U``) include_path : Union[str, List[str]], optional Add the directory or directories to the list of directories to be searched for headers. Default: None - Maps to: ``--include-path=`` (``-I``) pre_include : Union[str, List[str]], optional Preinclude one or more headers during preprocessing. Can be either a string or a list of strings. Default: None - Maps to: ``--pre-include=
`` (``-include``) no_source_include : bool, optional Disable the default behavior of adding the directory of each input source to the include path. Default: False - Maps to: ``--no-source-include`` (``-no-source-include``) std : str, optional Set language dialect to C++03, C++11, C++14, C++17 or C++20. Default: c++17 - Maps to: ``--std={c++03|c++11|c++14|c++17|c++20}`` (``-std``) builtin_move_forward : bool, optional Provide builtin definitions of std::move and std::forward. Default: True - Maps to: ``--builtin-move-forward={true|false}`` (``-builtin-move-forward``) builtin_initializer_list : bool, optional Provide builtin definitions of std::initializer_list class and member functions. Default: True - Maps to: ``--builtin-initializer-list={true|false}`` (``-builtin-initializer-list``) disable_warnings : bool, optional Inhibit all warning messages. Default: False - Maps to: ``--disable-warnings`` (``-w``) restrict : bool, optional Programmer assertion that all kernel pointer parameters are restrict pointers. Default: False - Maps to: ``--restrict`` (``-restrict``) device_as_default_execution_space : bool, optional Treat entities with no execution space annotation as __device__ entities. Default: False - Maps to: ``--device-as-default-execution-space`` (``-default-device``) device_int128 : bool, optional Allow the __int128 type in device code. Default: False - Maps to: ``--device-int128`` (``-device-int128``) optimization_info : str, optional Provide optimization reports for the specified kind of optimization. Default: None - Maps to: ``--optimization-info=`` (``-opt-info``) no_display_error_number : bool, optional Disable the display of a diagnostic number for warning messages. Default: False - Maps to: ``--no-display-error-number`` (``-no-err-no``) diag_error : Union[int, List[int]], optional Emit error for a specified diagnostic message number or comma separated list of numbers. Default: None - Maps to: ``--diag-error=, ...`` (``-diag-error``) diag_suppress : Union[int, List[int]], optional Suppress a specified diagnostic message number or comma separated list of numbers. Default: None - Maps to: ``--diag-suppress=,ā€¦`` (``-diag-suppress``) diag_warn : Union[int, List[int]], optional Emit warning for a specified diagnostic message number or comma separated lis of numbers. Default: None - Maps to: ``--diag-warn=,ā€¦`` (``-diag-warn``) brief_diagnostics : bool, optional Disable or enable showing source line and column info in a diagnostic. Default: False - Maps to: ``--brief-diagnostics={true|false}`` (``-brief-diag``) time : str, optional Generate a CSV table with the time taken by each compilation phase. Default: None - Maps to: ``--time=`` (``-time``) split_compile : int, optional Perform compiler optimizations in parallel. Default: 1 - Maps to: ``--split-compile= `` (``-split-compile``) fdevice_syntax_only : bool, optional Ends device compilation after front-end syntax checking. Default: False - Maps to: ``--fdevice-syntax-only`` (``-fdevice-syntax-only``) minimal : bool, optional Omit certain language features to reduce compile time for small programs. Default: False - Maps to: ``--minimal`` (``-minimal``) """ arch: Optional[str] = None diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index b81c1654..0a1fffc5 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -52,8 +52,7 @@ def compile_ltoir_functions(init_cuda): LinkerOptions(arch=ARCH, max_register_count=32), LinkerOptions(arch=ARCH, optimization_level=3), LinkerOptions(arch=ARCH, debug=True), - LinkerOptions(arch=ARCH, lineinfo=True), - LinkerOptions(arch=ARCH, no_cache=True), # TODO: consider adding cuda 12.4 to test matrix in which case this + LinkerOptions(arch=ARCH, lineinfo=True), # TODO: consider adding cuda 12.4 to test matrix in which case this # will fail. Tracked in issue #337 ] @@ -65,6 +64,11 @@ def compile_ltoir_functions(init_cuda): else culink_options + [ LinkerOptions(arch=ARCH, time=True), + LinkerOptions(arch=ARCH, optimize_unused_variables=True), + LinkerOptions(arch=ARCH, xptxas=["-v"]), + LinkerOptions(arch=ARCH, split_compile=0), + LinkerOptions(arch=ARCH, split_compile_extended=1), + # The following options are supported by nvjitlink and deprecated by culink LinkerOptions(arch=ARCH, ftz=True), LinkerOptions(arch=ARCH, prec_div=True), LinkerOptions(arch=ARCH, prec_sqrt=True), @@ -73,10 +77,6 @@ def compile_ltoir_functions(init_cuda): LinkerOptions(arch=ARCH, kernels_used=["C", "B"]), LinkerOptions(arch=ARCH, variables_used=["var1"]), LinkerOptions(arch=ARCH, variables_used=["var1", "var2"]), - LinkerOptions(arch=ARCH, optimize_unused_variables=True), - LinkerOptions(arch=ARCH, xptxas=["-v"]), - LinkerOptions(arch=ARCH, split_compile=0), - LinkerOptions(arch=ARCH, split_compile_extended=1), ], ) def test_linker_init(compile_ptx_functions, options): diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 0f9b8e3b..c0769ad1 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -9,16 +9,14 @@ import pytest from conftest import can_load_generated_ptx +from cuda.core.experimental import _linker from cuda.core.experimental._module import Kernel, ObjectCode from cuda.core.experimental._program import Program, ProgramOptions -# TODO handle and test options whcih are only supported on more modern CUDA versions -# tracked in #337 -def test_program_with_various_options(init_cuda): - code = 'extern "C" __global__ void my_kernel() {}' - - options_list = [ +@pytest.mark.parametrize( + "options", + [ ProgramOptions(device_code_optimize=True, debug=True), ProgramOptions(relocatable_device_code=True, max_register_count=32), ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), @@ -38,14 +36,44 @@ def test_program_with_various_options(init_cuda): ProgramOptions(diag_error=1234, diag_suppress=1234), ProgramOptions(diag_error=[1234, 1223], diag_suppress=(1234, 1223)), ProgramOptions(diag_warn=1000), - ] + ], +) +def test_cpp_program_with_various_options(init_cuda, options): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++", options) + assert program.backend == "nvrtc" + program.compile("ptx") + program.close() + assert program.handle is None - for options in options_list: - program = Program(code, "c++", options) - assert program.backend == "nvrtc" - program.compile("ptx") - program.close() - assert program.handle is None + +culink_options = [ + ProgramOptions(max_register_count=32), + ProgramOptions(debug=True), + ProgramOptions(lineinfo=True), + ProgramOptions(ftz=True), + ProgramOptions(prec_div=True), + ProgramOptions(prec_sqrt=True), + ProgramOptions(fma=True), +] +nvjitlink_options = [ + ProgramOptions(time=True), + ProgramOptions(split_compile=True), +] + + +@pytest.mark.parametrize( + "options", culink_options if _linker._decide_nvjitlink_or_driver() else culink_options + nvjitlink_options +) +def test_ptx_program_with_various_options(init_cuda, options): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + ptx_object_code = program.compile("ptx") + program = Program(ptx_object_code._module.decode(), "ptx", options=options) + assert program.backend == "linker" + program.compile("cubin") + program.close() + assert program.handle is None def test_program_init_valid_code_type(): @@ -70,13 +98,18 @@ def test_program_init_invalid_code_format(): # TODO: incorporate this check in Program # This is tested against the current device's arch @pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new") -def test_program_compile_valid_target_type(): +def test_program_compile_valid_target_type(init_cuda): code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") - object_code = program.compile("ptx") - kernel = object_code.get_kernel("my_kernel") - assert isinstance(object_code, ObjectCode) - assert isinstance(kernel, Kernel) + ptx_object_code = program.compile("ptx") + program = Program(ptx_object_code._module.decode(), "ptx") + cubin_object_code = program.compile("cubin") + ptx_kernel = ptx_object_code.get_kernel("my_kernel") + cubin_kernel = cubin_object_code.get_kernel("my_kernel") + assert isinstance(ptx_object_code, ObjectCode) + assert isinstance(cubin_object_code, ObjectCode) + assert isinstance(ptx_kernel, Kernel) + assert isinstance(cubin_kernel, Kernel) def test_program_compile_invalid_target_type(): From 41bfab7f33e17b46b24b3a494518780c49f3f411 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 20 Jan 2025 15:41:25 -0800 Subject: [PATCH 4/8] remove redundant todo --- cuda_core/cuda/core/experimental/_program.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index ed7c38c2..2f4ead64 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -384,8 +384,6 @@ def __init__(self, code, code_type, options: ProgramOptions = None): elif code_type == "ptx": if not isinstance(code, str): raise TypeError - # TODO: support pre-loaded headers & include names - # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved self._linker = Linker( ObjectCode(code.encode(), code_type), options=self._translate_program_options(options) ) From af9b5477d67692c9522f21e2bf9b8e61205f16a5 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Tue, 28 Jan 2025 11:54:39 -0800 Subject: [PATCH 5/8] fix bug --- cuda_core/cuda/core/experimental/_linker.py | 4 ++-- cuda_core/tests/test_program.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 0f737805..3eb71f4e 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -23,11 +23,11 @@ # Note: this function is reused in the tests -def _decide_nvjitlink_or_driver(): +def _decide_nvjitlink_or_driver() -> bool: """Returns True if falling back to the cuLink* driver APIs.""" global _driver_ver, _driver, _nvjitlink if _driver or _nvjitlink: - return + return _driver is not None _driver_ver = handle_return(driver.cuDriverGetVersion()) _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index c0769ad1..11b2e854 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -47,7 +47,7 @@ def test_cpp_program_with_various_options(init_cuda, options): assert program.handle is None -culink_options = [ +options = [ ProgramOptions(max_register_count=32), ProgramOptions(debug=True), ProgramOptions(lineinfo=True), @@ -56,15 +56,15 @@ def test_cpp_program_with_various_options(init_cuda, options): ProgramOptions(prec_sqrt=True), ProgramOptions(fma=True), ] -nvjitlink_options = [ - ProgramOptions(time=True), - ProgramOptions(split_compile=True), -] +if not _linker._decide_nvjitlink_or_driver(): + print("Using nvjitlink as the backend because decide() returned false") + options += [ + ProgramOptions(time=True), + ProgramOptions(split_compile=True), + ] -@pytest.mark.parametrize( - "options", culink_options if _linker._decide_nvjitlink_or_driver() else culink_options + nvjitlink_options -) +@pytest.mark.parametrize("options", options) def test_ptx_program_with_various_options(init_cuda, options): code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") From cde6fdff695f893f58ae11631d71284e32b28028 Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 3 Feb 2025 11:30:45 -0800 Subject: [PATCH 6/8] various changes to address review comments --- cuda_core/cuda/core/experimental/_linker.py | 12 ++--- cuda_core/cuda/core/experimental/_program.py | 49 +++++++++++--------- cuda_core/tests/test_linker.py | 4 +- cuda_core/tests/test_program.py | 15 ++++-- 4 files changed, 47 insertions(+), 33 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index 3eb71f4e..976d739f 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -138,7 +138,7 @@ class LinkerOptions: optimize_unused_variables : bool, optional Assume that if a variable is not referenced in device code, it can be removed. Default: False. - xptxas : List[str], optional + ptxas_options : List[str], optional Pass options to PTXAS. split_compile : int, optional Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split @@ -170,7 +170,7 @@ class LinkerOptions: kernels_used: Optional[List[str]] = None variables_used: Optional[List[str]] = None optimize_unused_variables: Optional[bool] = None - xptxas: Optional[List[str]] = None + ptxas_options: Optional[List[str]] = None split_compile: Optional[int] = None split_compile_extended: Optional[int] = None no_cache: Optional[bool] = None @@ -220,8 +220,8 @@ def _init_nvjitlink(self): self.formatted_options.append(f"-variables-used={variable}") if self.optimize_unused_variables is not None: self.formatted_options.append("-optimize-unused-variables") - if self.xptxas is not None: - for opt in self.xptxas: + if self.ptxas_options is not None: + for opt in self.ptxas_options: self.formatted_options.append(f"-Xptxas={opt}") if self.split_compile is not None: self.formatted_options.append(f"-split-compile={self.split_compile}") @@ -284,8 +284,8 @@ def _init_driver(self): warn("variables_used is deprecated in the driver API", DeprecationWarning, stacklevel=3) if self.optimize_unused_variables is not None: warn("optimize_unused_variables is deprecated in the driver API", DeprecationWarning, stacklevel=3) - if self.xptxas is not None: - raise ValueError("xptxas option is not supported by the driver API") + if self.ptxas_options is not None: + raise ValueError("ptxas_options option is not supported by the driver API") if self.split_compile is not None: raise ValueError("split_compile option is not supported by the driver API") if self.split_compile_extended is not None: diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 2f4ead64..fc47ff85 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -342,23 +342,23 @@ class Program: code : Any String of the CUDA Runtime Compilation program. code_type : Any - String of the code type. Currently only ``"c++"`` is supported. + String of the code type. Currently ``"ptx"`` and ``"c++"`` are supported. options : ProgramOptions, optional A ProgramOptions object to customize the compilation process. See :obj:`ProgramOptions` for more information. """ class _MembersNeededForFinalize: - __slots__ = "handle" + __slots__ = "nvrtc_handle" - def __init__(self, program_obj, handle): - self.handle = handle + def __init__(self, program_obj, nvrtc_handle): + self.nvrtc_handle = nvrtc_handle weakref.finalize(program_obj, self.close) def close(self): - if self.handle is not None: - handle_return(nvrtc.nvrtcDestroyProgram(self.handle)) - self.handle = None + if self.nvrtc_handle is not None: + handle_return(nvrtc.nvrtcDestroyProgram(self.nvrtc_handle)) + self.nvrtc_handle = None __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options") _supported_code_type = ("c++", "ptx") @@ -375,15 +375,16 @@ def __init__(self, code, code_type, options: ProgramOptions = None): if code_type == "c++": if not isinstance(code, str): - raise TypeError + raise TypeError("c++ Program expects code argument to be a string") # TODO: support pre-loaded headers & include names # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved - self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) + self._mnff.nvrtc_handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) self._backend = "nvrtc" + self._linker = None elif code_type == "ptx": if not isinstance(code, str): - raise TypeError + raise TypeError("ptx Program expects code argument to be a string") self._linker = Linker( ObjectCode(code.encode(), code_type), options=self._translate_program_options(options) ) @@ -404,6 +405,7 @@ def _translate_program_options(self, options: ProgramOptions) -> LinkerOptions: fma=options.fma, link_time_optimization=options.link_time_optimization, split_compile=options.split_compile, + ptxas_options=options.ptxas_options, ) def close(self): @@ -438,35 +440,40 @@ def compile(self, target_type, name_expressions=(), logs=None): if self._backend == "nvrtc": if name_expressions: for n in name_expressions: - handle_return(nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()), handle=self._mnff.handle) + handle_return( + nvrtc.nvrtcAddNameExpression(self._mnff.nvrtc_handle, n.encode()), + handle=self._mnff.nvrtc_handle, + ) options = self._options._as_bytes() handle_return( - nvrtc.nvrtcCompileProgram(self._mnff.handle, len(options), options), - handle=self._mnff.handle, + nvrtc.nvrtcCompileProgram(self._mnff.nvrtc_handle, len(options), options), + handle=self._mnff.nvrtc_handle, ) size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size") comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}") - size = handle_return(size_func(self._mnff.handle), handle=self._mnff.handle) + size = handle_return(size_func(self._mnff.nvrtc_handle), handle=self._mnff.nvrtc_handle) data = b" " * size - handle_return(comp_func(self._mnff.handle, data), handle=self._mnff.handle) + handle_return(comp_func(self._mnff.nvrtc_handle, data), handle=self._mnff.nvrtc_handle) symbol_mapping = {} if name_expressions: for n in name_expressions: symbol_mapping[n] = handle_return( - nvrtc.nvrtcGetLoweredName(self._mnff.handle, n.encode()), handle=self._mnff.handle + nvrtc.nvrtcGetLoweredName(self._mnff.nvrtc_handle, n.encode()), handle=self._mnff.nvrtc_handle ) if logs is not None: - logsize = handle_return(nvrtc.nvrtcGetProgramLogSize(self._mnff.handle), handle=self._mnff.handle) + logsize = handle_return( + nvrtc.nvrtcGetProgramLogSize(self._mnff.nvrtc_handle), handle=self._mnff.nvrtc_handle + ) if logsize > 1: log = b" " * logsize - handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle) + handle_return( + nvrtc.nvrtcGetProgramLog(self._mnff.nvrtc_handle, log), handle=self._mnff.nvrtc_handle + ) logs.write(log.decode()) - # TODO: handle jit_options for ptx? - return ObjectCode(data, target_type, symbol_mapping=symbol_mapping) if self._backend == "linker": @@ -480,4 +487,4 @@ def backend(self): @property def handle(self): """Return the program handle object.""" - return self._mnff.handle + return self._mnff.nvrtc_handle diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 0a1fffc5..627eb2b2 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -52,7 +52,9 @@ def compile_ltoir_functions(init_cuda): LinkerOptions(arch=ARCH, max_register_count=32), LinkerOptions(arch=ARCH, optimization_level=3), LinkerOptions(arch=ARCH, debug=True), - LinkerOptions(arch=ARCH, lineinfo=True), # TODO: consider adding cuda 12.4 to test matrix in which case this + LinkerOptions(arch=ARCH, lineinfo=True), + LinkerOptions(arch=ARCH, lineinfo=True), + LinkerOptions(arch=ARCH, no_cache=True), # TODO: consider adding cuda 12.4 to test matrix in which case this # will fail. Tracked in issue #337 ] diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 11b2e854..d9c5cbde 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -14,6 +14,14 @@ from cuda.core.experimental._program import Program, ProgramOptions +@pytest.fixture(scope="module") +def ptx_code_object(): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + ptx_object_code = program.compile("ptx") + return ptx_object_code + + @pytest.mark.parametrize( "options", [ @@ -65,11 +73,8 @@ def test_cpp_program_with_various_options(init_cuda, options): @pytest.mark.parametrize("options", options) -def test_ptx_program_with_various_options(init_cuda, options): - code = 'extern "C" __global__ void my_kernel() {}' - program = Program(code, "c++") - ptx_object_code = program.compile("ptx") - program = Program(ptx_object_code._module.decode(), "ptx", options=options) +def test_ptx_program_with_various_options(init_cuda, ptx_code_object, options): + program = Program(ptx_code_object._module.decode(), "ptx", options=options) assert program.backend == "linker" program.compile("cubin") program.close() From e079135a728001982291a1e0b50b5f3eac2c14ea Mon Sep 17 00:00:00 2001 From: ksimpson Date: Mon, 3 Feb 2025 14:02:57 -0800 Subject: [PATCH 7/8] fix test bug and forec finalize the linker --- cuda_core/cuda/core/experimental/_program.py | 40 ++++++++++---------- cuda_core/tests/test_linker.py | 2 +- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index fc47ff85..f938895e 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -349,16 +349,16 @@ class Program: """ class _MembersNeededForFinalize: - __slots__ = "nvrtc_handle" + __slots__ = "handle" - def __init__(self, program_obj, nvrtc_handle): - self.nvrtc_handle = nvrtc_handle + def __init__(self, program_obj, handle): + self.handle = handle weakref.finalize(program_obj, self.close) def close(self): - if self.nvrtc_handle is not None: - handle_return(nvrtc.nvrtcDestroyProgram(self.nvrtc_handle)) - self.nvrtc_handle = None + if self.handle is not None: + handle_return(nvrtc.nvrtcDestroyProgram(self.handle)) + self.handle = None __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options") _supported_code_type = ("c++", "ptx") @@ -378,7 +378,7 @@ def __init__(self, code, code_type, options: ProgramOptions = None): raise TypeError("c++ Program expects code argument to be a string") # TODO: support pre-loaded headers & include names # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved - self._mnff.nvrtc_handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) + self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) self._backend = "nvrtc" self._linker = None @@ -410,6 +410,8 @@ def _translate_program_options(self, options: ProgramOptions) -> LinkerOptions: def close(self): """Destroy this program.""" + if self._linker: + self._linker.close() self._mnff.close() def compile(self, target_type, name_expressions=(), logs=None): @@ -441,37 +443,33 @@ def compile(self, target_type, name_expressions=(), logs=None): if name_expressions: for n in name_expressions: handle_return( - nvrtc.nvrtcAddNameExpression(self._mnff.nvrtc_handle, n.encode()), - handle=self._mnff.nvrtc_handle, + nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()), + handle=self._mnff.handle, ) options = self._options._as_bytes() handle_return( - nvrtc.nvrtcCompileProgram(self._mnff.nvrtc_handle, len(options), options), - handle=self._mnff.nvrtc_handle, + nvrtc.nvrtcCompileProgram(self._mnff.handle, len(options), options), + handle=self._mnff.handle, ) size_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}Size") comp_func = getattr(nvrtc, f"nvrtcGet{target_type.upper()}") - size = handle_return(size_func(self._mnff.nvrtc_handle), handle=self._mnff.nvrtc_handle) + size = handle_return(size_func(self._mnff.handle), handle=self._mnff.handle) data = b" " * size - handle_return(comp_func(self._mnff.nvrtc_handle, data), handle=self._mnff.nvrtc_handle) + handle_return(comp_func(self._mnff.handle, data), handle=self._mnff.handle) symbol_mapping = {} if name_expressions: for n in name_expressions: symbol_mapping[n] = handle_return( - nvrtc.nvrtcGetLoweredName(self._mnff.nvrtc_handle, n.encode()), handle=self._mnff.nvrtc_handle + nvrtc.nvrtcGetLoweredName(self._mnff.handle, n.encode()), handle=self._mnff.handle ) if logs is not None: - logsize = handle_return( - nvrtc.nvrtcGetProgramLogSize(self._mnff.nvrtc_handle), handle=self._mnff.nvrtc_handle - ) + logsize = handle_return(nvrtc.nvrtcGetProgramLogSize(self._mnff.handle), handle=self._mnff.handle) if logsize > 1: log = b" " * logsize - handle_return( - nvrtc.nvrtcGetProgramLog(self._mnff.nvrtc_handle, log), handle=self._mnff.nvrtc_handle - ) + handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle) logs.write(log.decode()) return ObjectCode(data, target_type, symbol_mapping=symbol_mapping) @@ -487,4 +485,4 @@ def backend(self): @property def handle(self): """Return the program handle object.""" - return self._mnff.nvrtc_handle + return self._mnff.handle diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 627eb2b2..ef6273e7 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -67,7 +67,7 @@ def compile_ltoir_functions(init_cuda): + [ LinkerOptions(arch=ARCH, time=True), LinkerOptions(arch=ARCH, optimize_unused_variables=True), - LinkerOptions(arch=ARCH, xptxas=["-v"]), + LinkerOptions(arch=ARCH, ptxas_options=["-v"]), LinkerOptions(arch=ARCH, split_compile=0), LinkerOptions(arch=ARCH, split_compile_extended=1), # The following options are supported by nvjitlink and deprecated by culink From ec111b97942e92beed275e27ec969ab91cd0e983 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 4 Feb 2025 00:23:05 -0500 Subject: [PATCH 8/8] fix typo --- cuda_core/tests/test_linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index a581e75d..556fe9f7 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -70,7 +70,7 @@ def compile_ltoir_functions(init_cuda): LinkerOptions(arch=ARCH, kernels_used=["C", "B"]), LinkerOptions(arch=ARCH, variables_used=["var1"]), LinkerOptions(arch=ARCH, variables_used=["var1", "var2"]), - ], + ] version = nvjitlink.version() if version >= (12, 5): options.append(LinkerOptions(arch=ARCH, no_cache=True))