diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py index b5a6b675..976d739f 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/experimental/_linker.py @@ -3,11 +3,11 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import ctypes -import warnings import weakref from contextlib import contextmanager from dataclasses import dataclass from typing import List, Optional +from warnings import warn from cuda.core.experimental._device import Device from cuda.core.experimental._module import ObjectCode @@ -23,11 +23,11 @@ # Note: this function is reused in the tests -def _decide_nvjitlink_or_driver(): +def _decide_nvjitlink_or_driver() -> bool: """Returns True if falling back to the cuLink* driver APIs.""" global _driver_ver, _driver, _nvjitlink if _driver or _nvjitlink: - return + return _driver is not None _driver_ver = handle_return(driver.cuDriverGetVersion()) _driver_ver = (_driver_ver // 1000, (_driver_ver % 1000) // 10) @@ -43,7 +43,7 @@ def _decide_nvjitlink_or_driver(): _nvjitlink = None if _nvjitlink is None: - warnings.warn( + warn( "nvJitLink is not installed or too old (<12.3). Therefore it is not usable " "and the culink APIs will be used instead.", stacklevel=3, @@ -98,78 +98,59 @@ class LinkerOptions: will be used. max_register_count : int, optional Maximum register count. - Maps to: ``-maxrregcount=``. time : bool, optional Print timing information to the info log. - Maps to ``-time``. Default: False. verbose : bool, optional Print verbose messages to the info log. - Maps to ``-verbose``. Default: False. link_time_optimization : bool, optional Perform link time optimization. - Maps to: ``-lto``. Default: False. ptx : bool, optional - Emit PTX after linking instead of CUBIN; only supported with ``-lto``. - Maps to ``-ptx``. + Emit PTX after linking instead of CUBIN; only supported with ``link_time_optimization=True``. Default: False. optimization_level : int, optional Set optimization level. Only 0 and 3 are accepted. - Maps to ``-O``. debug : bool, optional Generate debug information. - Maps to ``-g`` Default: False. lineinfo : bool, optional Generate line information. - Maps to ``-lineinfo``. Default: False. ftz : bool, optional Flush denormal values to zero. - Maps to ``-ftz=``. Default: False. prec_div : bool, optional Use precise division. - Maps to ``-prec-div=``. Default: True. prec_sqrt : bool, optional Use precise square root. - Maps to ``-prec-sqrt=``. Default: True. fma : bool, optional Use fast multiply-add. - Maps to ``-fma=``. Default: True. kernels_used : List[str], optional Pass list of kernels that are used; any not in the list can be removed. This option can be specified multiple times. - Maps to ``-kernels-used=``. variables_used : List[str], optional Pass a list of variables that are used; any not in the list can be removed. - Maps to ``-variables-used=`` optimize_unused_variables : bool, optional Assume that if a variable is not referenced in device code, it can be removed. - Maps to: ``-optimize-unused-variables`` Default: False. - xptxas : List[str], optional + ptxas_options : List[str], optional Pass options to PTXAS. - Maps to: ``-Xptxas=``. split_compile : int, optional Split compilation maximum thread count. Use 0 to use all available processors. Value of 1 disables split compilation (default). - Maps to ``-split-compile=``. Default: 1. split_compile_extended : int, optional A more aggressive form of split compilation available in LTO mode only. Accepts a maximum thread count value. Use 0 to use all available processors. Value of 1 disables extended split compilation (default). Note: This option can potentially impact performance of the compiled binary. - Maps to ``-split-compile-extended=``. Default: 1. no_cache : bool, optional Do not cache the intermediate steps of nvJitLink. - Maps to ``-no-cache``. Default: False. """ @@ -189,7 +170,7 @@ class LinkerOptions: kernels_used: Optional[List[str]] = None variables_used: Optional[List[str]] = None optimize_unused_variables: Optional[bool] = None - xptxas: Optional[List[str]] = None + ptxas_options: Optional[List[str]] = None split_compile: Optional[int] = None split_compile_extended: Optional[int] = None no_cache: Optional[bool] = None @@ -239,8 +220,8 @@ def _init_nvjitlink(self): self.formatted_options.append(f"-variables-used={variable}") if self.optimize_unused_variables is not None: self.formatted_options.append("-optimize-unused-variables") - if self.xptxas is not None: - for opt in self.xptxas: + if self.ptxas_options is not None: + for opt in self.ptxas_options: self.formatted_options.append(f"-Xptxas={opt}") if self.split_compile is not None: self.formatted_options.append(f"-split-compile={self.split_compile}") @@ -290,21 +271,21 @@ def _init_driver(self): self.formatted_options.append(1) self.option_keys.append(_driver.CUjit_option.CU_JIT_GENERATE_LINE_INFO) if self.ftz is not None: - raise ValueError("ftz option is deprecated in the driver API") + warn("ftz option is deprecated in the driver API", DeprecationWarning, stacklevel=3) if self.prec_div is not None: - raise ValueError("prec_div option is deprecated in the driver API") + warn("prec_div option is deprecated in the driver API", DeprecationWarning, stacklevel=3) if self.prec_sqrt is not None: - raise ValueError("prec_sqrt option is deprecated in the driver API") + warn("prec_sqrt option is deprecated in the driver API", DeprecationWarning, stacklevel=3) if self.fma is not None: - raise ValueError("fma options is deprecated in the driver API") + warn("fma options is deprecated in the driver API", DeprecationWarning, stacklevel=3) if self.kernels_used is not None: - raise ValueError("kernels_used is deprecated in the driver API") + warn("kernels_used is deprecated in the driver API", DeprecationWarning, stacklevel=3) if self.variables_used is not None: - raise ValueError("variables_used is deprecated in the driver API") + warn("variables_used is deprecated in the driver API", DeprecationWarning, stacklevel=3) if self.optimize_unused_variables is not None: - raise ValueError("optimize_unused_variables is deprecated in the driver API") - if self.xptxas is not None: - raise ValueError("xptxas option is not supported by the driver API") + warn("optimize_unused_variables is deprecated in the driver API", DeprecationWarning, stacklevel=3) + if self.ptxas_options is not None: + raise ValueError("ptxas_options option is not supported by the driver API") if self.split_compile is not None: raise ValueError("split_compile option is not supported by the driver API") if self.split_compile_extended is not None: diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index c10fd077..f938895e 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -7,6 +7,7 @@ from typing import List, Optional, Tuple, Union from cuda.core.experimental._device import Device +from cuda.core.experimental._linker import Linker, LinkerOptions from cuda.core.experimental._module import ObjectCode from cuda.core.experimental._utils import ( _handle_boolean_option, @@ -31,157 +32,120 @@ class ProgramOptions: relocatable_device_code : bool, optional Enable (disable) the generation of relocatable device code. Default: False - Maps to: ``--relocatable-device-code={true|false}`` (``-rdc``) extensible_whole_program : bool, optional Do extensible whole program compilation of device code. Default: False - Maps to: ``--extensible-whole-program`` (``-ewp``) debug : bool, optional Generate debug information. If --dopt is not specified, then turns off all optimizations. Default: False - Maps to: ``--device-debug`` (``-G``) lineinfo: bool, optional Generate line-number information. Default: False - Maps to: ``--generate-line-info`` (``-lineinfo``) device_code_optimize : bool, optional Enable device code optimization. When specified along with ā€˜-Gā€™, enables limited debug information generation for optimized device code. Default: None - Maps to: ``--dopt on`` (``-dopt``) ptxas_options : Union[str, List[str]], optional Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings. For example ["-v", "-O2"]. Default: None - Maps to: ``--ptxas-options `` (``-Xptxas``) max_register_count : int, optional Specify the maximum amount of registers that GPU functions can use. Default: None - Maps to: ``--maxrregcount=`` (``-maxrregcount``) ftz : bool, optional When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal values. Default: False - Maps to: ``--ftz={true|false}`` (``-ftz``) prec_sqrt : bool, optional For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation. Default: True - Maps to: ``--prec-sqrt={true|false}`` (``-prec-sqrt``) prec_div : bool, optional For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster approximation. Default: True - Maps to: ``--prec-div={true|false}`` (``-prec-div``) fma : bool, optional Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point multiply-add operations. Default: True - Maps to: ``--fmad={true|false}`` (``-fmad``) use_fast_math : bool, optional Make use of fast math operations. Default: False - Maps to: ``--use_fast_math`` (``-use_fast_math``) extra_device_vectorization : bool, optional Enables more aggressive device code vectorization in the NVVM optimizer. Default: False - Maps to: ``--extra-device-vectorization`` (``-extra-device-vectorization``) link_time_optimization : bool, optional Generate intermediate code for later link-time optimization. Default: False - Maps to: ``--dlink-time-opt`` (``-dlto``) gen_opt_lto : bool, optional Run the optimizer passes before generating the LTO IR. Default: False - Maps to: ``--gen-opt-lto`` (``-gen-opt-lto``) define_macro : Union[str, Tuple[str, str], List[Union[str, Tuple[str, str]]]], optional Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2 element tuple of strings, in which case the first element is defined as the second, or a list of strings or tuples. Default: None - Maps to: ``--define-macro=`` (``-D``) undefine_macro : Union[str, List[str]], optional Cancel any previous definition of a macro, or list of macros. Default: None - Maps to: ``--undefine-macro=`` (``-U``) include_path : Union[str, List[str]], optional Add the directory or directories to the list of directories to be searched for headers. Default: None - Maps to: ``--include-path=`` (``-I``) pre_include : Union[str, List[str]], optional Preinclude one or more headers during preprocessing. Can be either a string or a list of strings. Default: None - Maps to: ``--pre-include=
`` (``-include``) no_source_include : bool, optional Disable the default behavior of adding the directory of each input source to the include path. Default: False - Maps to: ``--no-source-include`` (``-no-source-include``) std : str, optional Set language dialect to C++03, C++11, C++14, C++17 or C++20. Default: c++17 - Maps to: ``--std={c++03|c++11|c++14|c++17|c++20}`` (``-std``) builtin_move_forward : bool, optional Provide builtin definitions of std::move and std::forward. Default: True - Maps to: ``--builtin-move-forward={true|false}`` (``-builtin-move-forward``) builtin_initializer_list : bool, optional Provide builtin definitions of std::initializer_list class and member functions. Default: True - Maps to: ``--builtin-initializer-list={true|false}`` (``-builtin-initializer-list``) disable_warnings : bool, optional Inhibit all warning messages. Default: False - Maps to: ``--disable-warnings`` (``-w``) restrict : bool, optional Programmer assertion that all kernel pointer parameters are restrict pointers. Default: False - Maps to: ``--restrict`` (``-restrict``) device_as_default_execution_space : bool, optional Treat entities with no execution space annotation as __device__ entities. Default: False - Maps to: ``--device-as-default-execution-space`` (``-default-device``) device_int128 : bool, optional Allow the __int128 type in device code. Default: False - Maps to: ``--device-int128`` (``-device-int128``) optimization_info : str, optional Provide optimization reports for the specified kind of optimization. Default: None - Maps to: ``--optimization-info=`` (``-opt-info``) no_display_error_number : bool, optional Disable the display of a diagnostic number for warning messages. Default: False - Maps to: ``--no-display-error-number`` (``-no-err-no``) diag_error : Union[int, List[int]], optional Emit error for a specified diagnostic message number or comma separated list of numbers. Default: None - Maps to: ``--diag-error=, ...`` (``-diag-error``) diag_suppress : Union[int, List[int]], optional Suppress a specified diagnostic message number or comma separated list of numbers. Default: None - Maps to: ``--diag-suppress=,ā€¦`` (``-diag-suppress``) diag_warn : Union[int, List[int]], optional Emit warning for a specified diagnostic message number or comma separated lis of numbers. Default: None - Maps to: ``--diag-warn=,ā€¦`` (``-diag-warn``) brief_diagnostics : bool, optional Disable or enable showing source line and column info in a diagnostic. Default: False - Maps to: ``--brief-diagnostics={true|false}`` (``-brief-diag``) time : str, optional Generate a CSV table with the time taken by each compilation phase. Default: None - Maps to: ``--time=`` (``-time``) split_compile : int, optional Perform compiler optimizations in parallel. Default: 1 - Maps to: ``--split-compile= `` (``-split-compile``) fdevice_syntax_only : bool, optional Ends device compilation after front-end syntax checking. Default: False - Maps to: ``--fdevice-syntax-only`` (``-fdevice-syntax-only``) minimal : bool, optional Omit certain language features to reduce compile time for small programs. Default: False - Maps to: ``--minimal`` (``-minimal``) """ arch: Optional[str] = None @@ -378,14 +342,14 @@ class Program: code : Any String of the CUDA Runtime Compilation program. code_type : Any - String of the code type. Currently only ``"c++"`` is supported. + String of the code type. Currently ``"ptx"`` and ``"c++"`` are supported. options : ProgramOptions, optional A ProgramOptions object to customize the compilation process. See :obj:`ProgramOptions` for more information. """ class _MembersNeededForFinalize: - __slots__ = ("handle",) + __slots__ = "handle" def __init__(self, program_obj, handle): self.handle = handle @@ -396,30 +360,58 @@ def close(self): handle_return(nvrtc.nvrtcDestroyProgram(self.handle)) self.handle = None - __slots__ = ("__weakref__", "_mnff", "_backend", "_options") - _supported_code_type = ("c++",) + __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options") + _supported_code_type = ("c++", "ptx") _supported_target_type = ("ptx", "cubin", "ltoir") def __init__(self, code, code_type, options: ProgramOptions = None): self._mnff = Program._MembersNeededForFinalize(self, None) self._options = options = check_or_create_options(ProgramOptions, options, "Program options") + code_type = code_type.lower() if code_type not in self._supported_code_type: raise NotImplementedError - if code_type.lower() == "c++": + if code_type == "c++": if not isinstance(code, str): - raise TypeError + raise TypeError("c++ Program expects code argument to be a string") # TODO: support pre-loaded headers & include names # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved self._mnff.handle = handle_return(nvrtc.nvrtcCreateProgram(code.encode(), b"", 0, [], [])) self._backend = "nvrtc" + self._linker = None + + elif code_type == "ptx": + if not isinstance(code, str): + raise TypeError("ptx Program expects code argument to be a string") + self._linker = Linker( + ObjectCode(code.encode(), code_type), options=self._translate_program_options(options) + ) + self._backend = "linker" else: raise NotImplementedError + def _translate_program_options(self, options: ProgramOptions) -> LinkerOptions: + return LinkerOptions( + arch=options.arch, + max_register_count=options.max_register_count, + time=options.time, + debug=options.debug, + lineinfo=options.lineinfo, + ftz=options.ftz, + prec_div=options.prec_div, + prec_sqrt=options.prec_sqrt, + fma=options.fma, + link_time_optimization=options.link_time_optimization, + split_compile=options.split_compile, + ptxas_options=options.ptxas_options, + ) + def close(self): """Destroy this program.""" + if self._linker: + self._linker.close() self._mnff.close() def compile(self, target_type, name_expressions=(), logs=None): @@ -450,7 +442,10 @@ def compile(self, target_type, name_expressions=(), logs=None): if self._backend == "nvrtc": if name_expressions: for n in name_expressions: - handle_return(nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()), handle=self._mnff.handle) + handle_return( + nvrtc.nvrtcAddNameExpression(self._mnff.handle, n.encode()), + handle=self._mnff.handle, + ) options = self._options._as_bytes() handle_return( nvrtc.nvrtcCompileProgram(self._mnff.handle, len(options), options), @@ -477,10 +472,11 @@ def compile(self, target_type, name_expressions=(), logs=None): handle_return(nvrtc.nvrtcGetProgramLog(self._mnff.handle, log), handle=self._mnff.handle) logs.write(log.decode()) - # TODO: handle jit_options for ptx? - return ObjectCode(data, target_type, symbol_mapping=symbol_mapping) + if self._backend == "linker": + return self._linker.link(target_type) + @property def backend(self): """Return the backend type string associated with this program.""" diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index 288e2ce4..556fe9f7 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -57,6 +57,11 @@ def compile_ltoir_functions(init_cuda): if not culink_backend: options += [ LinkerOptions(arch=ARCH, time=True), + LinkerOptions(arch=ARCH, optimize_unused_variables=True), + LinkerOptions(arch=ARCH, ptxas_options=["-v"]), + LinkerOptions(arch=ARCH, split_compile=0), + LinkerOptions(arch=ARCH, split_compile_extended=1), + # The following options are supported by nvjitlink and deprecated by culink LinkerOptions(arch=ARCH, ftz=True), LinkerOptions(arch=ARCH, prec_div=True), LinkerOptions(arch=ARCH, prec_sqrt=True), @@ -65,14 +70,10 @@ def compile_ltoir_functions(init_cuda): LinkerOptions(arch=ARCH, kernels_used=["C", "B"]), LinkerOptions(arch=ARCH, variables_used=["var1"]), LinkerOptions(arch=ARCH, variables_used=["var1", "var2"]), - LinkerOptions(arch=ARCH, optimize_unused_variables=True), - LinkerOptions(arch=ARCH, xptxas=["-v"]), - LinkerOptions(arch=ARCH, split_compile=0), - LinkerOptions(arch=ARCH, split_compile_extended=1), ] version = nvjitlink.version() if version >= (12, 5): - options += [LinkerOptions(arch=ARCH, no_cache=True)] + options.append(LinkerOptions(arch=ARCH, no_cache=True)) @pytest.mark.parametrize("options", options) diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 0f9b8e3b..d9c5cbde 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -9,16 +9,22 @@ import pytest from conftest import can_load_generated_ptx +from cuda.core.experimental import _linker from cuda.core.experimental._module import Kernel, ObjectCode from cuda.core.experimental._program import Program, ProgramOptions -# TODO handle and test options whcih are only supported on more modern CUDA versions -# tracked in #337 -def test_program_with_various_options(init_cuda): +@pytest.fixture(scope="module") +def ptx_code_object(): code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++") + ptx_object_code = program.compile("ptx") + return ptx_object_code + - options_list = [ +@pytest.mark.parametrize( + "options", + [ ProgramOptions(device_code_optimize=True, debug=True), ProgramOptions(relocatable_device_code=True, max_register_count=32), ProgramOptions(ftz=True, prec_sqrt=False, prec_div=False), @@ -38,14 +44,41 @@ def test_program_with_various_options(init_cuda): ProgramOptions(diag_error=1234, diag_suppress=1234), ProgramOptions(diag_error=[1234, 1223], diag_suppress=(1234, 1223)), ProgramOptions(diag_warn=1000), + ], +) +def test_cpp_program_with_various_options(init_cuda, options): + code = 'extern "C" __global__ void my_kernel() {}' + program = Program(code, "c++", options) + assert program.backend == "nvrtc" + program.compile("ptx") + program.close() + assert program.handle is None + + +options = [ + ProgramOptions(max_register_count=32), + ProgramOptions(debug=True), + ProgramOptions(lineinfo=True), + ProgramOptions(ftz=True), + ProgramOptions(prec_div=True), + ProgramOptions(prec_sqrt=True), + ProgramOptions(fma=True), +] +if not _linker._decide_nvjitlink_or_driver(): + print("Using nvjitlink as the backend because decide() returned false") + options += [ + ProgramOptions(time=True), + ProgramOptions(split_compile=True), ] - for options in options_list: - program = Program(code, "c++", options) - assert program.backend == "nvrtc" - program.compile("ptx") - program.close() - assert program.handle is None + +@pytest.mark.parametrize("options", options) +def test_ptx_program_with_various_options(init_cuda, ptx_code_object, options): + program = Program(ptx_code_object._module.decode(), "ptx", options=options) + assert program.backend == "linker" + program.compile("cubin") + program.close() + assert program.handle is None def test_program_init_valid_code_type(): @@ -70,13 +103,18 @@ def test_program_init_invalid_code_format(): # TODO: incorporate this check in Program # This is tested against the current device's arch @pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new") -def test_program_compile_valid_target_type(): +def test_program_compile_valid_target_type(init_cuda): code = 'extern "C" __global__ void my_kernel() {}' program = Program(code, "c++") - object_code = program.compile("ptx") - kernel = object_code.get_kernel("my_kernel") - assert isinstance(object_code, ObjectCode) - assert isinstance(kernel, Kernel) + ptx_object_code = program.compile("ptx") + program = Program(ptx_object_code._module.decode(), "ptx") + cubin_object_code = program.compile("cubin") + ptx_kernel = ptx_object_code.get_kernel("my_kernel") + cubin_kernel = cubin_object_code.get_kernel("my_kernel") + assert isinstance(ptx_object_code, ObjectCode) + assert isinstance(cubin_object_code, ObjectCode) + assert isinstance(ptx_kernel, Kernel) + assert isinstance(cubin_kernel, Kernel) def test_program_compile_invalid_target_type():