From 21e0f0ead374c7718974019a3ef9d9cdc2eb03d9 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Fri, 24 Jan 2025 01:22:24 -0500
Subject: [PATCH] Add cuda 12.8 support (#605)

CUDA 12.8 introduces sm_120, which requires a reduced number of threads per SM.

We also need to pass -static-global-template-stub=false when building with CUDA 12.8 or newer, because the code currently violates the CUDA ODR rules for kernels.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - https://github.com/jakirkham

URL: https://github.com/rapidsai/cuvs/pull/605
---
 cpp/cmake/modules/ConfigureCUDA.cmake   | 7 ++++++-
 cpp/src/neighbors/detail/nn_descent.cuh | 5 +++--
 2 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake
index 3e91d9995..e99a53049 100644
--- a/cpp/cmake/modules/ConfigureCUDA.cmake
+++ b/cpp/cmake/modules/ConfigureCUDA.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -35,6 +35,11 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   endif()
 endif()
 
+# Short-term workaround: with CUDA >= 12.8, tolerate kernels that violate the CUDA ODR rules
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8.0)
+  list(APPEND CUVS_CUDA_FLAGS -static-global-template-stub=false)
+endif()
+
 if(CUDA_LOG_COMPILE_TIME)
   list(APPEND CUVS_CUDA_FLAGS "--time=nvcc_compile_log.csv")
 endif()
diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh
index 4c577a4d2..310d4e7a6 100644
--- a/cpp/src/neighbors/detail/nn_descent.cuh
+++ b/cpp/src/neighbors/detail/nn_descent.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -706,7 +706,8 @@ __device__ __forceinline__ void remove_duplicates(
 template <typename Index_t, typename ID_t = InternalID_t<Index_t>>
 RAFT_KERNEL
 #ifdef __CUDA_ARCH__
-#if (__CUDA_ARCH__) == 750 || ((__CUDA_ARCH__) >= 860 && (__CUDA_ARCH__) <= 890)
+#if (__CUDA_ARCH__) == 750 || ((__CUDA_ARCH__) >= 860 && (__CUDA_ARCH__) <= 890) || \
+  (__CUDA_ARCH__) == 1200
 __launch_bounds__(BLOCK_SIZE)
 #else
 __launch_bounds__(BLOCK_SIZE, 4)