diff --git a/3rd-party/cccl b/3rd-party/cccl
index b7d4228a..d27b5896 160000
--- a/3rd-party/cccl
+++ b/3rd-party/cccl
@@ -1 +1 @@
-Subproject commit b7d4228ab7268ed928984cd61096079bd671d25d
+Subproject commit d27b58963128f17a6c2f3f867301d54e9f4b48cd
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 49ddcda6..2d76dd81 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,12 +12,20 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
+# Download with:
+#
+# mkdir -p cmake
+# wget -O cmake/CPM.cmake https://github.com/cpm-cmake/CPM.cmake/releases/latest/download/get_cpm.cmake
+include(cmake/CPM.cmake)
+
 if(USE_CUDA)
+    CPMAddPackage(NAME CCCL SOURCE_DIR ${CMAKE_SOURCE_DIR}/3rd-party/cccl)
+
     add_compile_definitions(USE_CUDA)
     enable_language(CUDA)
     set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
     if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        set(CMAKE_CUDA_ARCHITECTURES 80)
+        set(CMAKE_CUDA_ARCHITECTURES native)
     endif()
     if(NOT DEFINED CMAKE_CUDA_STANDARD)
         set(CMAKE_CUDA_STANDARD 17)
@@ -45,7 +53,7 @@ endif()
 if (USE_BANG)
     add_compile_definitions(USE_BANG)
     include_directories(src/kernels/mlu/include)
-    
+
     # Neuware Evironment
     if ((NOT DEFINED NEUWARE_HOME) AND (NOT DEFINED ENV{NEUWARE_HOME}))
         message(FATAL_ERROR "NEUWARE_HOME is not defined from cmake or env")
@@ -55,14 +63,14 @@ if (USE_BANG)
         set(NEUWARE_HOME $ENV{NEUWARE_HOME} CACHE STRING "NEUWARE_HOME directory for Cambricon Neuware development")
     endif()
     message(STATUS "NEUWARE_HOME: ${NEUWARE_HOME}")
-    
+
     # cnrt cndrv cnnl
     include_directories("${NEUWARE_HOME}/include")
     find_library(CAMBRICON_CNNL libcnnl.so "${NEUWARE_HOME}/lib64")
     find_library(CAMBRICON_CNRT libcnrt.so "${NEUWARE_HOME}/lib64")
     find_library(CAMBRICON_CNDRV libcndrv.so "${NEUWARE_HOME}/lib64")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall")
-  
+
     if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
         execute_process(COMMAND uname -m OUTPUT_VARIABLE _uname_m OUTPUT_STRIP_TRAILING_WHITESPACE)
         set(TARGET_CPU_ARCH "${_uname_m}" CACHE STRING "Target CPU ARCH")
diff --git a/cmake/CPM.cmake b/cmake/CPM.cmake
new file mode 100644
index 00000000..cc25ec28
--- /dev/null
+++ b/cmake/CPM.cmake
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: MIT
+#
+# SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors
+
+set(CPM_DOWNLOAD_VERSION 0.38.7)
+set(CPM_HASH_SUM "83e5eb71b2bbb8b1f2ad38f1950287a057624e385c238f6087f94cdfc44af9c5")
+# Download target: the CPM_SOURCE_CACHE variable, else the env var, else the build tree.
+if(CPM_SOURCE_CACHE)
+  set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+elseif(DEFINED ENV{CPM_SOURCE_CACHE})
+  set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+else()
+  set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+endif()
+
+# Expand relative path. This is important if the provided path contains a tilde (~)
+get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE)
+# Fetch the pinned release; EXPECTED_HASH rejects a file whose SHA-256 differs.
+file(DOWNLOAD
+     https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
+     ${CPM_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${CPM_HASH_SUM}
+)
+# Load the downloaded script into the including CMake scope.
+include(${CPM_DOWNLOAD_LOCATION})
diff --git a/src/04kernel/CMakeLists.txt b/src/04kernel/CMakeLists.txt
index b75fff17..1a867c02 100644
--- a/src/04kernel/CMakeLists.txt
+++ b/src/04kernel/CMakeLists.txt
@@ -26,7 +26,8 @@ if(USE_CUDA)
     # nvrtc  for cuda kernel compile
     # cublas for matmul
     # cudnn  for conv and others
-    target_link_libraries(kernel PUBLIC cuda nvrtc cublas cublasLt cudnn kernel_cuda)
+    target_link_libraries(kernel PUBLIC cuda kernel_cuda
+                                 PRIVATE nvrtc cublas cublasLt cudnn)
     target_include_directories(kernel PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
 endif()
 if(USE_KUNLUN)
diff --git a/src/04kernel/cuda/CMakeLists.txt b/src/04kernel/cuda/CMakeLists.txt
index 4c976e33..07223090 100644
--- a/src/04kernel/cuda/CMakeLists.txt
+++ b/src/04kernel/cuda/CMakeLists.txt
@@ -4,7 +4,7 @@ project(kernel_cuda)
 file(GLOB_RECURSE KERNEL_CUDA_SUB_SRC src/*.cu)
 
 add_library(kernel_cuda STATIC ${KERNEL_CUDA_SUB_SRC})
-target_link_libraries(kernel_cuda PUBLIC common)
+target_link_libraries(kernel_cuda PUBLIC common CCCL::CCCL)
 target_include_directories(kernel_cuda PUBLIC include)
 
 file(GLOB_RECURSE KERNEL_CUDA_TEST test/*.cu)
diff --git a/src/04kernel/cuda/include/kernel/cuda/reduce.cuh b/src/04kernel/cuda/include/kernel/cuda/reduce.cuh
new file mode 100644
index 00000000..42739534
--- /dev/null
+++ b/src/04kernel/cuda/include/kernel/cuda/reduce.cuh
@@ -0,0 +1,9 @@
+#ifndef KERNEL_CUDA_REDUCE_CUH
+#define KERNEL_CUDA_REDUCE_CUH
+// Makes CUB's warp-level reduction header available to includers of this file.
+#include <cub/warp/warp_reduce.cuh>
+// No declarations yet: the namespace below is currently empty.
+namespace refactor::kernel::cuda {
+}
+
+#endif// KERNEL_CUDA_REDUCE_CUH
diff --git a/src/04kernel/src/kernels/attention/cuda_kernel.cu b/src/04kernel/src/kernels/attention/cuda_kernel.cu
index a0f3f56a..79aa6f2b 100644
--- a/src/04kernel/src/kernels/attention/cuda_kernel.cu
+++ b/src/04kernel/src/kernels/attention/cuda_kernel.cu
@@ -1,6 +1,7 @@
 #include "../../utilities/cuda/cublaslt_utils.cuh"
 #include "cuda_kernel.hh"
 #include "hardware/functions.h"
+#include "kernel/cuda/reduce.cuh"
 
 namespace refactor::kernel {
     using K = AttentionCuda;