CUTLASS 3.0.0 (#786)

* CUTLASS 3.0.0
NVIDIA · Jan 24, 2023 · 277bd6e · 277bd6e
1 parent 66d9cdd
commit 277bd6e
Show file tree

Hide file tree

Showing 377 changed files with 76,395 additions and 1,185 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,18 @@
 # NVIDIA CUTLASS Changelog
 
+
+## [3.0.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.0.0) (2023-01-23)
+* [CuTe](/media/docs/cute/00_quickstart.md), a [new core library and backend](/include/cute) for CUTLASS 3.0 that defines a single Layout vocabulary type and an associated algebra of layouts for a much more expressive and composable abstraction for tensors, sets of parallel agents, and operations by said agents on tensors.
+* [A new conceptual operation hierarchy](media/docs/cutlass_3x_design.md) that replaces the architecture-centric hierarchy of CUTLASS 2.x and [documentation for CUTLASS 3.0's GEMM API changes](/media/docs/gemm_api_3x.md).
+* Strict API backwards compatibility that exposes both 2.x and 3.x API kernels through the same [`device::GemmUniversalAdapter`](include/cutlass/gemm/device/gemm_universal_adapter.h) and [`kernel::GemmUniversal`](include/cutlass/gemm/kernel/gemm_universal.hpp) types, allowing users to include both APIs in the same translation units. More information can be found in the [3.x backwards compatibility section](media/docs/cutlass_3x_backwards_compatibility.md).
+* Updates to [Functionality](media/docs/functionality.md), which directs users to the kernels supported via CUTLASS-2 and CUTLASS-3.
+* Updates to [Compatibility](/README.md#compatibility) Section regarding supported compilers, operating systems, CUDA Toolkits, Hardware Architectures and [Target Architecture](/README.md#Target-Architecture).
+* New warp-specialized GEMM [kernel schedules](include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp) and [mainloops](include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp) targeting Hopper architecture that achieve great performance with TMA, WGMMA, and threadblock clusters.
+* Extensions to CUTLASS profiler to support threadblock cluster shapes in library and profiler tile configurations.
+* [CUTLASS library integration](/tools/library/src/gemm_operation_3x.hpp) for 3.x API kernels built through the new `CollectiveBuilder` API, enabling CUTLASS profiler.
+* Support for [Hopper GEMMs](examples/48_hopper_warp_specialized_gemm) through the new 3.0 API with CuTe-based exposure of the Hopper [Tensor Memory Accelerator](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor) and [WGMMA Tensor Core](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions) features.
+* Set of examples that demonstrate the usage of the new 3.0 API to easily build GEMM kernels targeting Hopper: examples [48](examples/48_hopper_warp_specialized_gemm), [49](examples/49_hopper_gemm_schedules_with_collective_builder), and [50](examples/50_hopper_gemm_with_epilogue_swizzle).
+
 ## [2.11.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.11.0) (2022-11-19)
 * [Stream-K](/examples/47_ampere_gemm_universal_streamk), which is a new general way to do split-K.  It can not only improve performance, but can also significantly reduce the number of tile sizes that need to be profiled to find the best one.
 * [Fused multi-head attention Kernel](/examples/41_fused_multi_head_attention).  It has two variants: one uses batched GEMM for the fixed sequence length, and the other one uses group GEMM for the variable sequence length.  Both versions just need one kernel.

diff --git a/CITATION.cff b/CITATION.cff
@@ -5,33 +5,61 @@ message: >-
   following metadata.
 type: software
 authors:
-  - given-names: Andrew
-    email: [email protected]
-    family-names: Kerr
+  - given-names: Vijay
+    family-names: Thakkar
+    email: [email protected]
+    affiliation: NVIDIA
+  - given-names: Pradeep
+    family-names: Ramani
+    email: [email protected]
+    affiliation: NVIDIA
+  - given-names: Cris
+    family-names: Cecka
+    email: [email protected]
+    affiliation: NVIDIA
+  - given-names: Aniket
+    family-names: Shivam
+    email: [email protected]
+    affiliation: NVIDIA
+  - given-names: Honghao
+    family-names: Lu
+    email: [email protected]
+    affiliation: NVIDIA
+  - given-names: Ethan
+    family-names: Yan
+    email: [email protected]
+    affiliation: NVIDIA
+  - given-names: Jack
+    family-names: Kosaian
+    email: [email protected]
+    affiliation: NVIDIA
+  - given-names: Mark
+    family-names: Hoemmen
+    email: [email protected]
     affiliation: NVIDIA
   - given-names: Haicheng
     family-names: Wu
-    affiliation: NVIDIA
     email: [email protected]
-  - given-names: Manish
-    family-names: Gupta
-    affiliation: Google
-    email: [email protected]
-  - given-names: Dustyn
-    family-names: Blasig
-    email: [email protected]
     affiliation: NVIDIA
-  - given-names: Pradeep
-    family-names: Ramini
-    email: [email protected]
+  - given-names: Andrew
+    family-names: Kerr
+    email: [email protected]
+    affiliation: NVIDIA
+  - given-names: Matt
+    family-names: Nicely
+    email: [email protected]
     affiliation: NVIDIA
   - given-names: Duane
     family-names: Merrill
     email: [email protected]
     affiliation: NVIDIA
-  - given-names: Aniket
-    family-names: Shivam
-    email: [email protected]
+  - given-names: Dustyn
+    family-names: Blasig
+    email: [email protected]
+    affiliation: NVIDIA
+  - given-names: Fengqi
+    family-names: Qiao
+    email: [email protected]
     affiliation: NVIDIA
   - given-names: Piotr
     family-names: Majcher
@@ -49,10 +77,12 @@ authors:
     family-names: Wang
     email: [email protected]
     affiliation: NVIDIA
-  - given-names: Matt
-    family-names: Nicely
-    email: [email protected]
-    affiliation: NVIDIA
+  - given-names: Manish
+    family-names: Gupta
+    affiliation: Google
+    email: [email protected]
+
+
 repository-code: 'https://github.com/NVIDIA/cutlass'
 abstract: >-
   CUTLASS is a collection of CUDA C++ template
@@ -71,12 +101,12 @@ abstract: >-
   flexibility simplifies their use as building blocks
   within custom kernels and applications.
 keywords:
-  - 'cutlass, tensor cores, cuda'
+  - 'cutlass, tensor cores, cuda, cute, nvidia, gpu, linear algebra, matrix computations'
 license: BSD-3-Clause
-license-url: https://github.com/NVIDIA/cutlass/blob/v2.11.0/LICENSE.txt
-version: '2.11.0'
-date-released: '2022-11-19'
+license-url: https://github.com/NVIDIA/cutlass/blob/v3.0.0/LICENSE.txt
+version: '3.0.0'
+date-released: '2023-01-23'
 identifiers:
   - type: url
-    value: "https://github.com/NVIDIA/cutlass/tree/v2.11.0"
-    description: The GitHub release URL of tag 2.11.0
+    value: "https://github.com/NVIDIA/cutlass/tree/v3.0.0"
+    description: The GitHub release URL of tag 3.0.0
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -26,7 +26,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-cmake_minimum_required(VERSION 3.12.4 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 
 if(cutlass_LOADED)
   # If CUTLASS has been previously fetched and loaded, don't do it again.
@@ -39,35 +39,40 @@ endif()
 message(STATUS "CMake Version: ${CMAKE_VERSION}")
 set(IMPLICIT_CMAKE_CXX_STANDARD OFF CACHE BOOL "Do not explicitly specify -std=c++11 if set")
 
-project(CUTLASS VERSION 2.11.0 LANGUAGES CXX)
+project(CUTLASS VERSION 3.0.0 LANGUAGES CXX)
 include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake)
 
-if (CUDA_VERSION VERSION_LESS 10.2)
-  message(WARNING "CUTLASS ${CUTLASS_VERSION} requires CUDA 10.2 or higher, and strongly recommends CUDA 11.0 or higher.")
-elseif (CUDA_VERSION VERSION_LESS 11.0)
-  message(WARNING "CUTLASS ${CUTLASS_VERSION} support for CUDA ${CUDA_VERSION} is deprecated, please use CUDA 11.0 or higher.")
+if (CUDA_VERSION VERSION_LESS 11.3)
+  message(WARNING "CUTLASS ${CUTLASS_VERSION} requires CUDA 11.4 or higher, and strongly recommends CUDA 11.8 or higher.")
+elseif (CUDA_VERSION VERSION_LESS 11.4)
+  message(WARNING "CUTLASS ${CUTLASS_VERSION} support for CUDA ${CUDA_VERSION} is deprecated, please use CUDA 11.8 or higher.")
+endif()
+
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.5)
+  message(FATAL_ERROR "GCC version must be at least 7.5!")
+endif()
+
+if (CUDA_COMPILER MATCHES "[Cc]lang" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0)
+  message(FATAL_ERROR "Clang 7.0+ required for GPU compilation")
 endif()
 
 find_package(Doxygen QUIET)
 
 #
-# CUTLASS 2.x requires C++11
+# CUTLASS 3.x requires C++17
 #
-if (NOT IMPLICIT_CMAKE_CXX_STANDARD)
-  set(CMAKE_CXX_STANDARD 11)
-  set(CMAKE_CXX_STANDARD_REQUIRED ON)
-  set(CMAKE_CXX_EXTENSIONS OFF)
-endif()
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
 
 if(CUTLASS_NATIVE_CUDA)
-  set(CMAKE_CUDA_STANDARD 11)
+  set(CMAKE_CUDA_STANDARD 17)
   set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+  list(APPEND CUTLASS_CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
 else()
-  if (NOT IMPLICIT_CMAKE_CXX_STANDARD)
-    list(APPEND CUTLASS_CUDA_NVCC_FLAGS --std=c++11)
-  endif()
+  list(APPEND CUTLASS_CUDA_NVCC_FLAGS --std=c++17)
 endif()
-
+  
 if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
   set(CMAKE_INSTALL_PREFIX install CACHE PATH "Default installation location." FORCE)
 endif()
@@ -107,29 +112,14 @@ if (CUTLASS_ENABLE_TESTS)
 endif()
 
 set(CUTLASS_NVCC_ARCHS_SUPPORTED "")
-if (NOT CUDA_VERSION VERSION_LESS 7.5)
-  list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 53)
-endif()
-if (NOT CUDA_VERSION VERSION_LESS 8.0)
-  list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 60 61)
-endif()
-if (NOT CUDA_VERSION VERSION_LESS 9.0)
-  list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 70)
+if (CUDA_VERSION VERSION_GREATER_EQUAL 11.4 AND NOT CUDA_COMPILER MATCHES "[Cc]lang")
+  list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 70 72 75 80 86 87)
 endif()
-if (NOT CUDA_VERSION VERSION_LESS 9.2)
-  list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 72)
+if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8 AND NOT CUDA_COMPILER MATCHES "[Cc]lang")
+  list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 89 90)
 endif()
-if (NOT CUDA_VERSION VERSION_LESS 10.0)
-  list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 75)
-endif()
-if (NOT CUDA_VERSION VERSION_LESS 11.0)
-  list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 80)
-endif()
-if (NOT CUDA_VERSION VERSION_LESS 11.1 AND NOT CUDA_COMPILER MATCHES "[Cc]lang")
-  list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 86)
-endif()
-if (NOT CUDA_VERSION VERSION_LESS 11.8 AND NOT CUDA_COMPILER MATCHES "[Cc]lang")
-  list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 90)
+if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0 AND NOT CUDA_COMPILER MATCHES "[Cc]lang")
+  list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 90a)
 endif()
 set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_SUPPORTED} CACHE STRING "The SM architectures requested.")
 set(CUTLASS_NVCC_ARCHS_ENABLED ${CUTLASS_NVCC_ARCHS} CACHE STRING "The SM architectures to build code for.")
@@ -271,6 +261,7 @@ if (CUTLASS_ENABLE_TENSOR_CORE_MMA)
   list(APPEND CUTLASS_CUDA_FLAGS -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1)
 endif()
 
+
 if (NOT MSVC AND CUTLASS_NVCC_KEEP)
   # MSVC flow handles caching already, but for other generators we handle it here.
   set(CUTLASS_NVCC_KEEP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tmp CACHE PATH "Location to store NVCC scratch files")
@@ -288,6 +279,15 @@ if (CUTLASS_ENABLE_F16C AND NOT CMAKE_CROSSCOMPILING)
   endif()
 endif()
 
+if (CUTLASS_ENABLE_OPENMP_TESTS)
+  find_package(OpenMP)
+  if(OpenMP_CXX_FOUND)
+    list(APPEND CUTLASS_CUDA_NVCC_FLAGS -Xcompiler=${OpenMP_CXX_FLAGS})
+  else()
+    message(WARNING "CUTLASS_ENABLE_OPENMP_TESTS set but OpenMP not found.")
+  endif()
+endif()
+
 list(APPEND CUTLASS_CUDA_NVCC_FLAGS $<$<BOOL:${UNIX}>:-Xcompiler=-Wconversion>)
 list(APPEND CUTLASS_CUDA_NVCC_FLAGS $<$<BOOL:${UNIX}>:-Xcompiler=-fno-strict-aliasing>)
 
@@ -313,10 +313,6 @@ if(CUDA_COMPILER MATCHES "[Cc]lang")
     message(FATAL_ERROR "Clang CUDA compilation requires Clang CXX compilation. Currently CMAKE_CXX_COMPILER is ${CMAKE_CXX_COMPILER_ID}" )
   endif()
 
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0)
-    message(FATAL_ERROR "Clang 7.0+ required for GPU compilation")
-  endif()
-
  # There are numerous Clang versions that can work with each CUDA toolkit and
  # the checks are not very useful so we are turning them off and using testing to
   # ensure the various combinations work properly.
@@ -341,6 +337,7 @@ if(CUDA_COMPILER MATCHES "[Cc]lang")
   list(APPEND CUTLASS_CUDA_CLANG_FLAGS -Wl,--disable-new-dtags)
 
   link_libraries(nvidia::cudart)
+  link_libraries(nvidia::cuda_driver)
 endif()
 
 # Support for 128-bit integers if using NVIDIA C++ compiler 
@@ -530,6 +527,8 @@ target_include_directories(
   $<BUILD_INTERFACE:${CUTLASS_INCLUDE_DIR}>
   $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>
   $<BUILD_INTERFACE:${CUDA_TOOLKIT_ROOT_DIR}/include>
+  $<BUILD_INTERFACE:${cute_SOURCE_DIR}/include>
+  $<BUILD_INTERFACE:${cute_SOURCE_DIR}/examples>
   )
 
 install(