feat(dist): Add NCCL communicator and allreduce kernel
PanZezhong1725 committed Nov 13, 2023
1 parent 58adc3f commit d5e73fe
Showing 20 changed files with 665 additions and 13 deletions.
1 change: 1 addition & 0 deletions src/03runtime/include/runtime/stream.h
@@ -46,6 +46,7 @@ namespace refactor::runtime {
void setInput(count_t, void const *, size_t);
void setInput(count_t, mem_manager::SharedForeignBlob);
void getOutput(count_t, void *, size_t) const;
+Resources &getResources() { return _resources; };
auto prepare() -> std::vector<count_t>;
void run();
auto bench(void (*sync)()) -> std::vector<std::chrono::nanoseconds>;
3 changes: 3 additions & 0 deletions src/04kernel/CMakeLists.txt
@@ -22,6 +22,9 @@ endif()

if(USE_CUDA)
target_link_libraries(kernel PUBLIC cublas cudnn kernel_cuda)
+list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
+find_package(NCCL REQUIRED)
+target_link_libraries(kernel PUBLIC nccl)
endif()

file(GLOB_RECURSE KERNEL_TEST test/*.cpp)
165 changes: 165 additions & 0 deletions src/04kernel/cmake/FindNCCL.cmake
@@ -0,0 +1,165 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# From PyTorch:
#
# Copyright (c) 2016- Facebook, Inc (Adam Paszke)
# Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
# Copyright (c) 2011-2013 NYU (Clement Farabet)
# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
# Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
#
# From Caffe2:
#
# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
#
# All contributions by Facebook:
# Copyright (c) 2016 Facebook Inc.
#
# All contributions by Google:
# Copyright (c) 2015 Google Inc.
# All rights reserved.
#
# All contributions by Yangqing Jia:
# Copyright (c) 2015 Yangqing Jia
# All rights reserved.
#
# All contributions by Kakao Brain:
# Copyright 2019-2020 Kakao Brain
#
# All contributions from Caffe:
# Copyright(c) 2013, 2014, 2015, the respective contributors
# All rights reserved.
#
# All other contributions:
# Copyright(c) 2015, 2016 the respective contributors
# All rights reserved.
#
# Caffe2 uses a copyright model similar to Caffe: each contributor holds
# copyright over their contributions to Caffe2. The project versioning records
# all such contribution and copyright details. If a contributor wants to further
# mark their specific copyright on a particular contribution, they should
# indicate their copyright solely in the commit message of the change when it is
# committed.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
# and IDIAP Research Institute nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Find the nccl libraries
#
# The following variables are optionally searched for defaults
# NCCL_ROOT: Base directory where all NCCL components are found
# NCCL_INCLUDE_DIR: Directory where NCCL header is found
# NCCL_LIB_DIR: Directory where NCCL library is found
#
# The following are set after configuration is done:
# NCCL_FOUND
# NCCL_INCLUDE_DIRS
# NCCL_LIBRARIES
#
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
# install NCCL in the same location as the CUDA toolkit.
# See https://github.com/caffe2/caffe2/issues/1601

set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers")
set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries")
set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")

if (DEFINED ENV{NCCL_ROOT_DIR})
message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.")
endif()
list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
# Compatibility layer for CMake <3.12. For CMake >=3.12, NCCL_ROOT is taken into account when searching for paths and libraries.
list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})

find_path(NCCL_INCLUDE_DIRS
NAMES nccl.h
HINTS ${NCCL_INCLUDE_DIR})

if (USE_STATIC_NCCL)
MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
SET(NCCL_LIBNAME "nccl_static")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
else()
SET(NCCL_LIBNAME "nccl")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
endif()

find_library(NCCL_LIBRARIES
NAMES ${NCCL_LIBNAME}
HINTS ${NCCL_LIB_DIR})

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)

if(NCCL_FOUND) # obtaining NCCL version and some sanity checks
set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS})
include(CheckCXXSymbolExists)
check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)

if (NCCL_VERSION_DEFINED)
set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
file(WRITE ${file} "
#include <iostream>
#include <nccl.h>
int main()
{
std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl;
int x;
ncclGetVersion(&x);
return x == NCCL_VERSION_CODE;
}
")
try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}"
LINK_LIBRARIES ${NCCL_LIBRARIES})
if (NOT NCCL_VERSION_MATCHED)
message(FATAL_ERROR "Found NCCL header version and library version do not match! \
(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
endif()
message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}")
else()
# message(STATUS "NCCL version < 2.3.5-5")
endif ()
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})

message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
endif()
14 changes: 14 additions & 0 deletions src/04kernel/include/kernel/attributes/communication.h
@@ -0,0 +1,14 @@
#ifndef KERNEL_COMMUNICATION_ATTRIBUTES_H
#define KERNEL_COMMUNICATION_ATTRIBUTES_H

namespace refactor::kernel {
enum class AllReduceType {
Sum,
Avg,
Min,
Max,
Prod
};
}

#endif
22 changes: 22 additions & 0 deletions src/04kernel/include/kernel/collectors/all_reduce.h
@@ -0,0 +1,22 @@
#ifndef KERNEL_COLLECTOR_ALL_REDUCE_H
#define KERNEL_COLLECTOR_ALL_REDUCE_H

#include "../collector.h"
#include "../target.h"
#include "kernel/attributes/communication.h"

namespace refactor::kernel {

struct AllReduceCollector final : public InfoCollector {
Target target;
AllReduceType type;

constexpr AllReduceCollector(Target target_, AllReduceType type_) noexcept
: InfoCollector(), target(target_), type(type_) {}

std::vector<KernelBox>
filter(TensorRefs inputs, TensorRefs outputs) const final;
};
}// namespace refactor::kernel

#endif
20 changes: 20 additions & 0 deletions src/04kernel/src/collectors/all_reduce.cc
@@ -0,0 +1,20 @@
#include "kernel/collectors/all_reduce.h"
#include "../kernels/all_reduce/nccl_kernel.hh"
namespace refactor::kernel {
std::vector<KernelBox>
AllReduceCollector::filter(TensorRefs inputs, TensorRefs outputs) const {
std::vector<KernelBox> ans;
switch (target) {
case Target::Cpu:
break;
case Target::NvidiaGpu:
if (auto ptr = AllReduceNccl::build(type, inputs[0], outputs[0]); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
default:
UNREACHABLEX(void, "Unknown target");
}
return ans;
}
}// namespace refactor::kernel
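
A hedged sketch of how a lowering pass might drive this collector follows. The TensorRefs values come from the surrounding graph runtime and their construction is elided here, so this illustrates the dispatch flow above rather than code from the commit:

#include "kernel/collectors/all_reduce.h"

namespace refactor::kernel {

    // Sketch: pick an AllReduce kernel for the target and lower it to a
    // callable Routine. On Target::Cpu, filter() returns no candidates.
    Routine lowerAllReduce(TensorRefs inputs, TensorRefs outputs, Resources &res) {
        AllReduceCollector collector(Target::NvidiaGpu, AllReduceType::Sum);
        auto candidates = collector.filter(inputs, outputs);
        // AllReduceNccl::build returns nullptr (and filter drops it) when
        // input and output disagree in element count or data type.
        return candidates.front()->lower(res);
    }

}// namespace refactor::kernel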
32 changes: 32 additions & 0 deletions src/04kernel/src/kernels/all_reduce/nccl_kernel.cc
@@ -0,0 +1,32 @@
#include "nccl_kernel.hh"

namespace refactor::kernel {
using K = AllReduceNccl;
using DT = DataType;

K::AllReduceNccl(AllReduceType opType_, DT dataType_, size_t size_) noexcept
: opType(opType_), dataType(dataType_), size(size_) {}

auto K::build(AllReduceType opType_, Tensor const &input, Tensor const &output) noexcept -> KernelBox {
#ifndef USE_CUDA
return nullptr;
#endif
if (input.elementsSize() != output.elementsSize() ||
input.dataType != output.dataType) {
return nullptr;
}

return std::make_unique<K>(opType_, input.dataType, input.elementsSize());
}

auto K::typeId() noexcept -> size_t {
static uint8_t ID = 1;
return reinterpret_cast<size_t>(&ID);
}

auto K::kernelTypeId() const noexcept -> size_t { return typeId(); }
auto K::description() const noexcept -> std::string_view {
return "Performing AllReduce using NCCL";
}

}// namespace refactor::kernel
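
typeId() above uses a registry-free identification idiom: the address of a function-local static is unique per enclosing class and stable for the life of the process, so casting it to size_t yields a collision-free kernel type ID with no central enum to maintain. A standalone illustration of the idiom:

#include <cassert>
#include <cstddef>
#include <cstdint>

// Each typeId() returns the address of its own local static; distinct
// classes therefore get distinct, stable identifiers.
struct KernelA {
    static size_t typeId() noexcept {
        static uint8_t ID = 1;
        return reinterpret_cast<size_t>(&ID);
    }
};

struct KernelB {
    static size_t typeId() noexcept {
        static uint8_t ID = 1;
        return reinterpret_cast<size_t>(&ID);
    }
};

int main() {
    assert(KernelA::typeId() == KernelA::typeId());// stable across calls
    assert(KernelA::typeId() != KernelB::typeId());// unique per class
    return 0;
}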
20 changes: 20 additions & 0 deletions src/04kernel/src/kernels/all_reduce/nccl_kernel.cu
@@ -0,0 +1,20 @@
#include "nccl_kernel.hh"
#include "../../utilities/cuda/nccl_communicator.hh"
#include <nccl.h>
namespace refactor::kernel {
using K = AllReduceNccl;
using DT = DataType;
using namespace nccl;

Routine K::lower(Resources &res) const noexcept {
return [count = size,
redOp = getRedOp(opType),
ncclDataType = getNcclDataType(dataType)](Resources &res, void const **inputs, void **outputs) {
auto communicator = res.fetch<NcclCommunicator>();
auto input = inputs[0];
auto output = outputs[0];
checkNcclError(ncclAllReduce(input, output, count, ncclDataType,
redOp, communicator->get(), 0));// TODO: use default stream for now
};
}
}// namespace refactor::kernel
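
The routine above calls two helpers, getRedOp and getNcclDataType, that live in utilities/cuda/nccl_communicator.hh and are not part of the rendered diff. A plausible sketch of the first, mapping this commit's AllReduceType onto NCCL's reduction ops (the real definition may differ):

#include "kernel/attributes/communication.h"
#include <nccl.h>

namespace refactor::kernel::nccl {

    // Sketch: translate the framework-level reduction to ncclRedOp_t.
    inline ncclRedOp_t getRedOp(AllReduceType type) noexcept {
        switch (type) {
            case AllReduceType::Sum:
                return ncclSum;
            case AllReduceType::Avg:
                return ncclAvg;// requires NCCL >= 2.10
            case AllReduceType::Min:
                return ncclMin;
            case AllReduceType::Max:
                return ncclMax;
            case AllReduceType::Prod:
                return ncclProd;
        }
        return ncclSum;// unreachable
    }

    // getNcclDataType would similarly map DataType onto ncclDataType_t
    // (e.g. 32-bit float -> ncclFloat); omitted here because DataType's
    // members are not shown in this diff.

}// namespace refactor::kernel::nccl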
28 changes: 28 additions & 0 deletions src/04kernel/src/kernels/all_reduce/nccl_kernel.hh
@@ -0,0 +1,28 @@
#ifndef KERNEL_ALLREDUCE_NCCL_KERNEL_HH
#define KERNEL_ALLREDUCE_NCCL_KERNEL_HH

#include "kernel/collectors/all_reduce.h"
#include "kernel/tensor.h"

namespace refactor::kernel {

struct AllReduceNccl final : public Kernel {
DataType dataType;
AllReduceType opType;
size_t size;

AllReduceNccl(AllReduceType, DataType, size_t) noexcept;

static KernelBox build(AllReduceType, Tensor const &, Tensor const &) noexcept;
static size_t typeId() noexcept;

size_t kernelTypeId() const noexcept final;
std::string_view description() const noexcept final;
#ifdef USE_CUDA
Routine lower(Resources &) const noexcept final;
#endif
};

}// namespace refactor::kernel

#endif// KERNEL_ALLREDUCE_NCCL_KERNEL_HH
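
The NcclCommunicator resource fetched in nccl_kernel.cu is defined in src/04kernel/src/utilities/cuda/nccl_communicator.hh, which is not among the rendered files. A minimal RAII sketch of such a wrapper, assuming it owns one ncclComm_t per rank (class shape and member names are guesses, not the committed code):

#include <nccl.h>
#include <stdexcept>

class NcclCommunicator {
    ncclComm_t _comm;

public:
    // worldSize/rank identify this process in the job; the unique id must
    // be created once with ncclGetUniqueId and shared with every rank
    // (e.g. over MPI or a TCP store) before construction.
    NcclCommunicator(int worldSize, int rank, ncclUniqueId id) {
        if (ncclCommInitRank(&_comm, worldSize, id, rank) != ncclSuccess) {
            throw std::runtime_error("ncclCommInitRank failed");
        }
    }
    ~NcclCommunicator() { ncclCommDestroy(_comm); }

    NcclCommunicator(NcclCommunicator const &) = delete;
    NcclCommunicator &operator=(NcclCommunicator const &) = delete;

    ncclComm_t get() const noexcept { return _comm; }
};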
6 changes: 3 additions & 3 deletions src/04kernel/src/target.cc
@@ -13,7 +13,7 @@ namespace refactor::kernel {
class BasicCpuMemManager final : public mem_manager::MemManager {
public:
static Arc<mem_manager::MemManager> instance() {
-static auto I = std::make_shared<BasicCpuMemManager>();
+auto I = std::make_shared<BasicCpuMemManager>();
return I;
}
void *malloc(size_t bytes) noexcept final {
@@ -32,12 +32,12 @@ namespace refactor::kernel {
return std::memcpy(dst, src, bytes);
}
};
-static Arc<mem_manager::MemManager> memPool = std::make_shared<mem_manager::MemPool>(4ul << 30, sizeof(uint64_t), BasicCpuMemManager::instance());
+static thread_local Arc<mem_manager::MemManager> memPool = std::make_shared<mem_manager::MemPool>(10ul << 30, sizeof(uint64_t), BasicCpuMemManager::instance());
return memPool;
}
#ifdef USE_CUDA
case NvidiaGpu: {
-static Arc<mem_manager::MemManager> memPool = std::make_shared<mem_manager::MemPool>(4ul << 30, 256, cuda::BasicCudaMemManager::instance());
+static thread_local Arc<mem_manager::MemManager> memPool = std::make_shared<mem_manager::MemPool>(10ul << 30, 256, cuda::BasicCudaMemManager::instance());
return memPool;
}
#endif
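
The memory pools in target.cc become static thread_local (and grow from 4 GiB to 10 GiB), which fits a runtime that drives each device, and hence each NCCL rank, from its own worker thread. As a standalone sanity check of the call the new kernel issues, the following single-process demo all-reduces one float across every visible GPU on the default stream, matching the TODO in nccl_kernel.cu; since one thread drives several communicators, the NCCL calls are grouped:

// Build sketch: nvcc allreduce_demo.cu -lnccl -o allreduce_demo
#include <cuda_runtime.h>
#include <nccl.h>
#include <cstdio>
#include <vector>

int main() {
    int nDev = 0;
    cudaGetDeviceCount(&nDev);

    // One communicator per visible device, all owned by this process.
    std::vector<ncclComm_t> comms(nDev);
    ncclCommInitAll(comms.data(), nDev, nullptr);

    std::vector<float *> bufs(nDev);
    for (int i = 0; i < nDev; ++i) {
        cudaSetDevice(i);
        cudaMalloc(&bufs[i], sizeof(float));
        float v = float(i + 1);
        cudaMemcpy(bufs[i], &v, sizeof(float), cudaMemcpyHostToDevice);
    }

    // In-place sum; group the calls because a single thread drives
    // multiple communicators.
    ncclGroupStart();
    for (int i = 0; i < nDev; ++i) {
        ncclAllReduce(bufs[i], bufs[i], 1, ncclFloat, ncclSum, comms[i],
                      /*stream=*/0);// default stream, as in the kernel above
    }
    ncclGroupEnd();

    for (int i = 0; i < nDev; ++i) {
        cudaSetDevice(i);
        cudaStreamSynchronize(0);
        float out = 0;
        cudaMemcpy(&out, bufs[i], sizeof(float), cudaMemcpyDeviceToHost);
        std::printf("device %d: %g\n", i, out);// expect 1 + 2 + ... + nDev
        cudaFree(bufs[i]);
        ncclCommDestroy(comms[i]);
    }
    return 0;
}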
