feat (dist): integrate the NCCL communication library; add the AllReduce operator #92

Merged
merged 1 commit on Feb 19, 2024
3 changes: 3 additions & 0 deletions src/04kernel/CMakeLists.txt
@@ -28,6 +28,9 @@ if(USE_CUDA)
# cudnn for conv and others
target_link_libraries(kernel PUBLIC cuda nvrtc cublas cublasLt cudnn kernel_cuda)
target_include_directories(kernel PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
find_package(NCCL REQUIRED)
target_link_libraries(kernel PUBLIC nccl)
endif()
if(USE_KUNLUN)
include_directories(${KUNLUN_HOME}/XTDK/include/)
165 changes: 165 additions & 0 deletions src/04kernel/cmake/FindNCCL.cmake
@@ -0,0 +1,165 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# From PyTorch:
#
# Copyright (c) 2016- Facebook, Inc (Adam Paszke)
# Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
# Copyright (c) 2011-2013 NYU (Clement Farabet)
# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
# Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
#
# From Caffe2:
#
# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
#
# All contributions by Facebook:
# Copyright (c) 2016 Facebook Inc.
#
# All contributions by Google:
# Copyright (c) 2015 Google Inc.
# All rights reserved.
#
# All contributions by Yangqing Jia:
# Copyright (c) 2015 Yangqing Jia
# All rights reserved.
#
# All contributions by Kakao Brain:
# Copyright 2019-2020 Kakao Brain
#
# All contributions from Caffe:
# Copyright(c) 2013, 2014, 2015, the respective contributors
# All rights reserved.
#
# All other contributions:
# Copyright(c) 2015, 2016 the respective contributors
# All rights reserved.
#
# Caffe2 uses a copyright model similar to Caffe: each contributor holds
# copyright over their contributions to Caffe2. The project versioning records
# all such contribution and copyright details. If a contributor wants to further
# mark their specific copyright on a particular contribution, they should
# indicate their copyright solely in the commit message of the change when it is
# committed.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
# and IDIAP Research Institute nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Find the nccl libraries
#
# The following variables are optionally searched for defaults
# NCCL_ROOT: Base directory where all NCCL components are found
# NCCL_INCLUDE_DIR: Directory where NCCL header is found
# NCCL_LIB_DIR: Directory where NCCL library is found
#
# The following are set after configuration is done:
# NCCL_FOUND
# NCCL_INCLUDE_DIRS
# NCCL_LIBRARIES
#
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
# install NCCL in the same location as the CUDA toolkit.
# See https://github.com/caffe2/caffe2/issues/1601

set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers")
set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries")
set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")

if ($ENV{NCCL_ROOT_DIR})
message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.")
endif()
list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})

find_path(NCCL_INCLUDE_DIRS
NAMES nccl.h
HINTS ${NCCL_INCLUDE_DIR})

if (USE_STATIC_NCCL)
MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
SET(NCCL_LIBNAME "nccl_static")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
else()
SET(NCCL_LIBNAME "nccl")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
endif()

find_library(NCCL_LIBRARIES
NAMES ${NCCL_LIBNAME}
HINTS ${NCCL_LIB_DIR})

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)

if(NCCL_FOUND) # obtaining NCCL version and some sanity checks
set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS})
include(CheckCXXSymbolExists)
check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)

if (NCCL_VERSION_DEFINED)
set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
file(WRITE ${file} "
#include <iostream>
#include <nccl.h>
int main()
{
std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl;
int x;
ncclGetVersion(&x);
return x == NCCL_VERSION_CODE;
}
")
try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}"
LINK_LIBRARIES ${NCCL_LIBRARIES})
if (NOT NCCL_VERSION_MATCHED)
message(FATAL_ERROR "Found NCCL header version and library version do not match! \
(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
endif()
message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}")
else()
# message(STATUS "NCCL version < 2.3.5-5")
endif ()
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})

message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
endif()
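
The embedded detect_nccl_version.cc probe above compiles against the discovered header and links the discovered library, so a header/library mismatch fails at configure time rather than at the first collective call. Below is a minimal standalone sketch of the same runtime check, assuming the NCCL >= 2.9 version encoding (MAJOR * 10000 + MINOR * 100 + PATCH); it is not part of the merged diff.

// Sketch of the version check performed by the configure-time probe above.
#include <nccl.h>
#include <cstdio>

int main() {
    int runtime = 0;
    if (ncclGetVersion(&runtime) != ncclSuccess) {
        std::fprintf(stderr, "ncclGetVersion failed\n");
        return 1;
    }
    // NCCL_VERSION_CODE is what the header was built with; a mismatch is
    // exactly what FindNCCL.cmake reports as a FATAL_ERROR above.
    std::printf("header %d, runtime %d\n", NCCL_VERSION_CODE, runtime);
    return runtime == NCCL_VERSION_CODE ? 0 : 1;
}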
2 changes: 2 additions & 0 deletions src/04kernel/cuda/include/kernel/cuda/functions.cuh
@@ -6,6 +6,8 @@ namespace refactor::kernel::cuda {
int currentDevice();

void sync();

void setCudaDevice(int);

void copyOut(void *dst, const void *src, size_t size);

4 changes: 4 additions & 0 deletions src/04kernel/cuda/src/functions.cu
@@ -19,4 +19,8 @@ namespace refactor::kernel::cuda {
CUDA_ASSERT(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost));
}

void setCudaDevice(int id) {
CUDA_ASSERT(cudaSetDevice(id));// check the call, as copyOut does above
}

}// namespace refactor::kernel::cuda
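
The new setCudaDevice wrapper exists so that each rank can bind its own GPU before any NCCL call. A short usage sketch follows; rank and deviceCount are assumed to come from the caller's launcher and are not part of this diff.

// Sketch: pin one GPU per rank before creating the NCCL communicator.
#include "kernel/cuda/functions.cuh"

void bindDeviceForRank(int rank, int deviceCount) {
    // Round-robin mapping is an assumption; any 1:1 rank-to-GPU map works.
    refactor::kernel::cuda::setCudaDevice(rank % deviceCount);
}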
14 changes: 14 additions & 0 deletions src/04kernel/include/kernel/attributes/communication.h
@@ -0,0 +1,14 @@
#ifndef KERNEL_COMMUNICATION_ATTRIBUTES_H
#define KERNEL_COMMUNICATION_ATTRIBUTES_H

namespace refactor::kernel {
enum class AllReduceType {
Sum,
Avg,
Min,
Max,
Prod
};
}

#endif
21 changes: 21 additions & 0 deletions src/04kernel/include/kernel/collectors/all_reduce.h
@@ -0,0 +1,21 @@
#ifndef KERNEL_COLLECTOR_ALL_REDUCE_H
#define KERNEL_COLLECTOR_ALL_REDUCE_H

#include "../collector.h"
#include "kernel/attributes/communication.h"

namespace refactor::kernel {

struct AllReduceCollector final : public InfoCollector {

AllReduceType type;

constexpr AllReduceCollector(decltype(_target) target, AllReduceType type_) noexcept
: InfoCollector(target), type(type_) {}

std::vector<KernelBox>
filter(TensorRefs inputs, TensorRefs outputs) const final;
};
}// namespace refactor::kernel

#endif
20 changes: 20 additions & 0 deletions src/04kernel/src/collectors/all_reduce.cc
@@ -0,0 +1,20 @@
#include "kernel/collectors/all_reduce.h"
#include "../kernels/all_reduce/nccl_kernel.hh"
namespace refactor::kernel {
std::vector<KernelBox>
AllReduceCollector::filter(TensorRefs inputs, TensorRefs outputs) const {
std::vector<KernelBox> ans;
switch (_target) {
case decltype(_target)::Cpu:
break;
case decltype(_target)::Nvidia:
if (auto ptr = AllReduceNccl::build(type, inputs[0], outputs[0]); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
default:
UNREACHABLEX(void, "Unknown target");
}
return ans;
}
}// namespace refactor::kernel
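
A hedged sketch of the caller side of this filter; the Target enum spelling and the tensor plumbing are assumptions inferred from decltype(_target)::Nvidia above, not part of this diff.

// Hypothetical caller: ask the collector for candidate AllReduce kernels.
using namespace refactor::kernel;

std::vector<KernelBox> pickAllReduce(TensorRefs inputs, TensorRefs outputs) {
    AllReduceCollector collector(Target::Nvidia, AllReduceType::Sum);
    // Empty on CPU targets; on NVIDIA, contains the NCCL kernel whenever
    // input/output dtypes and element counts match (see build() below).
    return collector.filter(inputs, outputs);
}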
32 changes: 32 additions & 0 deletions src/04kernel/src/kernels/all_reduce/nccl_kernel.cc
@@ -0,0 +1,32 @@
#include "nccl_kernel.hh"

namespace refactor::kernel {
using K = AllReduceNccl;
using DT = DataType;

K::AllReduceNccl(AllReduceType opType_, DT dataType_, size_t size_) noexcept
: opType(opType_), dataType(dataType_), size(size_) {}

auto K::build(AllReduceType opType_, Tensor const &input, Tensor const &output) noexcept -> KernelBox {
#ifndef USE_CUDA
return nullptr;
#endif
if (input.elementsSize() != output.elementsSize() ||
input.dataType != output.dataType) {
return nullptr;
}

return std::make_unique<K>(opType_, input.dataType, input.elementsSize());
}

auto K::typeId() noexcept -> size_t {
static uint8_t ID = 1;
return reinterpret_cast<size_t>(&ID);
}

auto K::kernelTypeId() const noexcept -> size_t { return typeId(); }
auto K::description() const noexcept -> std::string_view {
return "Performing AllReduce using NCCL";
}

}// namespace refactor::kernel
20 changes: 20 additions & 0 deletions src/04kernel/src/kernels/all_reduce/nccl_kernel.cu
@@ -0,0 +1,20 @@
#include "nccl_kernel.hh"
#include "../../utilities/cuda/nccl_communicator.hh"
#include <nccl.h>
namespace refactor::kernel {
using K = AllReduceNccl;
using DT = DataType;
using namespace nccl;

auto K::lower(Resources &res) const noexcept -> RoutineWorkspace {
return [count = size,
redOp = getRedOp(opType),
ncclDataType = getNcclDataType(dataType)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
auto communicator = res.fetch<NcclCommunicator>();
auto input = inputs[0];
auto output = outputs[0];
checkNcclError(ncclAllReduce(input, output, count, ncclDataType,
redOp, communicator->get(), 0));// TODO: use default stream for now
};
}
}// namespace refactor::kernel
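
getRedOp and getNcclDataType are presumably declared alongside the communicator helpers in nccl_communicator.hh (included above); their bodies are not shown in this diff. A plausible sketch of the reduction-op mapping follows; the Avg case assumes NCCL >= 2.10, which introduced ncclAvg.

// Hypothetical sketch of getRedOp; the merged implementation is not shown.
#include "kernel/attributes/communication.h"
#include <nccl.h>

namespace refactor::kernel::nccl {

    inline ncclRedOp_t getRedOp(AllReduceType type) {
        switch (type) {
            case AllReduceType::Sum: return ncclSum;
            case AllReduceType::Avg: return ncclAvg;// NCCL >= 2.10 only
            case AllReduceType::Min: return ncclMin;
            case AllReduceType::Max: return ncclMax;
            case AllReduceType::Prod: return ncclProd;
        }
        return ncclSum;// unreachable for valid enum values
    }

}// namespace refactor::kernel::nccl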
28 changes: 28 additions & 0 deletions src/04kernel/src/kernels/all_reduce/nccl_kernel.hh
@@ -0,0 +1,28 @@
#ifndef KERNEL_ALLREDUCE_NCCL_KERNEL_HH
#define KERNEL_ALLREDUCE_NCCL_KERNEL_HH

#include "kernel/collectors/all_reduce.h"
#include "kernel/tensor.h"

namespace refactor::kernel {

struct AllReduceNccl final : public Kernel {
AllReduceType opType;
DataType dataType;
size_t size;

AllReduceNccl(AllReduceType, DataType, size_t) noexcept;

static KernelBox build(AllReduceType, Tensor const &, Tensor const &) noexcept;
static size_t typeId() noexcept;

size_t kernelTypeId() const noexcept final;
std::string_view description() const noexcept final;
#ifdef USE_CUDA
RoutineWorkspace lower(Resources &) const noexcept final;
#endif
};

}// namespace refactor::kernel

#endif// KERNEL_ALLREDUCE_NCCL_KERNEL_HH
61 changes: 61 additions & 0 deletions src/04kernel/src/utilities/cuda/nccl_communicator.cu
@@ -0,0 +1,61 @@
#include "common.h"
#include "nccl_communicator.hh"
#include <chrono>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <thread>


namespace refactor::kernel::nccl {
NcclCommunicator::NcclCommunicator(int worldSize, int rank) : worldSize_(worldSize), rank_(rank) {
const std::string filePath("./nccl_id.bin");

ncclUniqueId commId;

if (rank == 0) {
checkNcclError(ncclGetUniqueId(&commId));
std::ofstream ofs(filePath, std::ios::binary);
ofs.write((char *) &commId, sizeof(ncclUniqueId));

} else {
auto begin = std::chrono::steady_clock::now();
while (!std::filesystem::exists(filePath)) {
auto now = std::chrono::steady_clock::now();
ASSERT(now < begin + std::chrono::seconds(10),
"time limit (10s) exceeded.");
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
std::ifstream ifs(filePath, std::ios::binary);
ifs.read((char *) &commId, sizeof(ncclUniqueId));
}
checkNcclError(ncclCommInitRank(&comm, worldSize, commId, rank));

if (rank == 0) {
std::filesystem::remove(filePath);
}

printf("Rank %d established NCCL communicator.\n", rank);
}

NcclCommunicator::~NcclCommunicator() {
checkNcclError(ncclCommFinalize(comm));
checkNcclError(ncclCommDestroy(comm));
}

auto NcclCommunicator::typeId() noexcept -> size_t {
static uint8_t ID = 1;
return reinterpret_cast<size_t>(&ID);
}
auto NcclCommunicator::build(int worldSize, int rank) noexcept -> runtime::ResourceBox {
return std::make_unique<NcclCommunicator>(worldSize, rank);
}

auto NcclCommunicator::resourceTypeId() const noexcept -> size_t {
return typeId();
}
auto NcclCommunicator::description() const noexcept -> std::string_view {
return "NcclCommunicator";
}

}// namespace refactor::kernel::nccl
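
Rendezvous here is file-based: rank 0 writes the ncclUniqueId to ./nccl_id.bin, the other ranks poll for the file for up to ten seconds before reading it, and every rank then calls ncclCommInitRank, so all processes must share a working directory. A hypothetical per-process driver sketch follows; the CLI shape and the one-GPU-per-rank mapping are assumptions, not part of this diff.

// Hypothetical driver: run once per rank in the same directory, e.g.
//   ./driver 0 2    and    ./driver 1 2
#include "kernel/cuda/functions.cuh"
#include "nccl_communicator.hh"
#include <cstdlib>

int main(int argc, char **argv) {
    if (argc < 3) { return 1; }
    int rank = std::atoi(argv[1]);
    int worldSize = std::atoi(argv[2]);
    refactor::kernel::cuda::setCudaDevice(rank);// one GPU per rank
    auto comm = refactor::kernel::nccl::NcclCommunicator::build(worldSize, rank);
    // comm is a runtime::ResourceBox; kernels retrieve it later via
    // res.fetch<NcclCommunicator>() as in nccl_kernel.cu above.
    return 0;
}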