feat(dist): Add NCCL communicator and allreduce kernel
PanZezhong1725 committed Nov 13, 2023
1 parent 58adc3f commit d5e73fe
Showing 20 changed files with 665 additions and 13 deletions.
1 change: 1 addition & 0 deletions src/03runtime/include/runtime/stream.h
@@ -46,6 +46,7 @@ namespace refactor::runtime {
void setInput(count_t, void const *, size_t);
void setInput(count_t, mem_manager::SharedForeignBlob);
void getOutput(count_t, void *, size_t) const;
+Resources &getResources() { return _resources; };
auto prepare() -> std::vector<count_t>;
void run();
auto bench(void (*sync)()) -> std::vector<std::chrono::nanoseconds>;
3 changes: 3 additions & 0 deletions src/04kernel/CMakeLists.txt
@@ -22,6 +22,9 @@ endif()

if(USE_CUDA)
target_link_libraries(kernel PUBLIC cublas cudnn kernel_cuda)
+list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
+find_package(NCCL REQUIRED)
+target_link_libraries(kernel PUBLIC nccl)
endif()

file(GLOB_RECURSE KERNEL_TEST test/*.cpp)
165 changes: 165 additions & 0 deletions src/04kernel/cmake/FindNCCL.cmake
@@ -0,0 +1,165 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# From PyTorch:
#
# Copyright (c) 2016- Facebook, Inc (Adam Paszke)
# Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
# Copyright (c) 2011-2013 NYU (Clement Farabet)
# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
# Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
#
# From Caffe2:
#
# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
#
# All contributions by Facebook:
# Copyright (c) 2016 Facebook Inc.
#
# All contributions by Google:
# Copyright (c) 2015 Google Inc.
# All rights reserved.
#
# All contributions by Yangqing Jia:
# Copyright (c) 2015 Yangqing Jia
# All rights reserved.
#
# All contributions by Kakao Brain:
# Copyright 2019-2020 Kakao Brain
#
# All contributions from Caffe:
# Copyright(c) 2013, 2014, 2015, the respective contributors
# All rights reserved.
#
# All other contributions:
# Copyright(c) 2015, 2016 the respective contributors
# All rights reserved.
#
# Caffe2 uses a copyright model similar to Caffe: each contributor holds
# copyright over their contributions to Caffe2. The project versioning records
# all such contribution and copyright details. If a contributor wants to further
# mark their specific copyright on a particular contribution, they should
# indicate their copyright solely in the commit message of the change when it is
# committed.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
# and IDIAP Research Institute nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Find the nccl libraries
#
# The following variables are optionally searched for defaults
# NCCL_ROOT: Base directory where all NCCL components are found
# NCCL_INCLUDE_DIR: Directory where NCCL header is found
# NCCL_LIB_DIR: Directory where NCCL library is found
#
# The following are set after configuration is done:
# NCCL_FOUND
# NCCL_INCLUDE_DIRS
# NCCL_LIBRARIES
#
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
# install NCCL in the same location as the CUDA toolkit.
# See https://github.com/caffe2/caffe2/issues/1601

set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers")
set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries")
set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")

if (DEFINED ENV{NCCL_ROOT_DIR})
message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.")
endif()
list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
# Compatibility layer for CMake <3.12. For CMake >=3.12, NCCL_ROOT is taken into account when searching for paths and libraries.
list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})

find_path(NCCL_INCLUDE_DIRS
NAMES nccl.h
HINTS ${NCCL_INCLUDE_DIR})

if (USE_STATIC_NCCL)
MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
SET(NCCL_LIBNAME "nccl_static")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
else()
SET(NCCL_LIBNAME "nccl")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
endif()

find_library(NCCL_LIBRARIES
NAMES ${NCCL_LIBNAME}
HINTS ${NCCL_LIB_DIR})

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)

if(NCCL_FOUND) # obtaining NCCL version and some sanity checks
set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS})
include(CheckCXXSymbolExists)
check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)

if (NCCL_VERSION_DEFINED)
set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
file(WRITE ${file} "
#include <iostream>
#include <nccl.h>
int main()
{
std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl;
int x;
ncclGetVersion(&x);
return x == NCCL_VERSION_CODE;
}
")
try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}"
LINK_LIBRARIES ${NCCL_LIBRARIES})
if (NOT NCCL_VERSION_MATCHED)
message(FATAL_ERROR "Found NCCL header version and library version do not match! \
(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
endif()
message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}")
else()
# message(STATUS "NCCL version < 2.3.5-5")
endif ()
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})

message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
endif()
14 changes: 14 additions & 0 deletions src/04kernel/include/kernel/attributes/communication.h
@@ -0,0 +1,14 @@
#ifndef KERNEL_COMMUNICATION_ATTRIBUTES_H
#define KERNEL_COMMUNICATION_ATTRIBUTES_H

namespace refactor::kernel {
enum class AllReduceType {
Sum,
Avg,
Min,
Max,
Prod
};
}

#endif
22 changes: 22 additions & 0 deletions src/04kernel/include/kernel/collectors/all_reduce.h
@@ -0,0 +1,22 @@
#ifndef KERNEL_COLLECTOR_ALL_REDUCE_H
#define KERNEL_COLLECTOR_ALL_REDUCE_H

#include "../collector.h"
#include "../target.h"
#include "kernel/attributes/communication.h"

namespace refactor::kernel {

struct AllReduceCollector final : public InfoCollector {
Target target;
AllReduceType type;

constexpr AllReduceCollector(Target target_, AllReduceType type_) noexcept
: InfoCollector(), target(target_), type(type_) {}

std::vector<KernelBox>
filter(TensorRefs inputs, TensorRefs outputs) const final;
};
}// namespace refactor::kernel

#endif
20 changes: 20 additions & 0 deletions src/04kernel/src/collectors/all_reduce.cc
@@ -0,0 +1,20 @@
#include "kernel/collectors/all_reduce.h"
#include "../kernels/all_reduce/nccl_kernel.hh"
namespace refactor::kernel {
std::vector<KernelBox>
AllReduceCollector::filter(TensorRefs inputs, TensorRefs outputs) const {
std::vector<KernelBox> ans;
switch (target) {
case Target::Cpu:
break;
case Target::NvidiaGpu:
if (auto ptr = AllReduceNccl::build(type, inputs[0], outputs[0]); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
default:
UNREACHABLEX(void, "Unknown target");
}
return ans;
}
}// namespace refactor::kernel
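
A hedged sketch of how a lowering pass might drive this collector follows. The TensorRefs values come from the surrounding graph runtime and their construction is elided here, so this illustrates the dispatch flow above rather than code from the commit:

#include "kernel/collectors/all_reduce.h"

namespace refactor::kernel {

    // Sketch: pick an AllReduce kernel for the target and lower it to a
    // callable Routine. On Target::Cpu, filter() returns no candidates.
    Routine lowerAllReduce(TensorRefs inputs, TensorRefs outputs, Resources &res) {
        AllReduceCollector collector(Target::NvidiaGpu, AllReduceType::Sum);
        auto candidates = collector.filter(inputs, outputs);
        // AllReduceNccl::build returns nullptr (and filter drops it) when
        // input and output disagree in element count or data type.
        return candidates.front()->lower(res);
    }

}// namespace refactor::kernel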
32 changes: 32 additions & 0 deletions src/04kernel/src/kernels/all_reduce/nccl_kernel.cc
@@ -0,0 +1,32 @@
#include "nccl_kernel.hh"

namespace refactor::kernel {
using K = AllReduceNccl;
using DT = DataType;

K::AllReduceNccl(AllReduceType opType_, DT dataType_, size_t size_) noexcept
: opType(opType_), dataType(dataType_), size(size_) {}

auto K::build(AllReduceType opType_, Tensor const &input, Tensor const &output) noexcept -> KernelBox {
#ifndef USE_CUDA
return nullptr;
#endif
if (input.elementsSize() != output.elementsSize() ||
input.dataType != output.dataType) {
return nullptr;
}

return std::make_unique<K>(opType_, input.dataType, input.elementsSize());
}

auto K::typeId() noexcept -> size_t {
static uint8_t ID = 1;
return reinterpret_cast<size_t>(&ID);
}

auto K::kernelTypeId() const noexcept -> size_t { return typeId(); }
auto K::description() const noexcept -> std::string_view {
return "Performing AllReduce using NCCL";
}

}// namespace refactor::kernel
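
typeId() above uses a registry-free identification idiom: the address of a function-local static is unique per enclosing class and stable for the life of the process, so casting it to size_t yields a collision-free kernel type ID with no central enum to maintain. A standalone illustration of the idiom:

#include <cassert>
#include <cstddef>
#include <cstdint>

// Each typeId() returns the address of its own local static; distinct
// classes therefore get distinct, stable identifiers.
struct KernelA {
    static size_t typeId() noexcept {
        static uint8_t ID = 1;
        return reinterpret_cast<size_t>(&ID);
    }
};

struct KernelB {
    static size_t typeId() noexcept {
        static uint8_t ID = 1;
        return reinterpret_cast<size_t>(&ID);
    }
};

int main() {
    assert(KernelA::typeId() == KernelA::typeId());// stable across calls
    assert(KernelA::typeId() != KernelB::typeId());// unique per class
    return 0;
}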
20 changes: 20 additions & 0 deletions src/04kernel/src/kernels/all_reduce/nccl_kernel.cu
@@ -0,0 +1,20 @@
#include "nccl_kernel.hh"
#include "../../utilities/cuda/nccl_communicator.hh"
#include <nccl.h>
namespace refactor::kernel {
using K = AllReduceNccl;
using DT = DataType;
using namespace nccl;

Routine K::lower(Resources &res) const noexcept {
return [count = size,
redOp = getRedOp(opType),
ncclDataType = getNcclDataType(dataType)](Resources &res, void const **inputs, void **outputs) {
auto communicator = res.fetch<NcclCommunicator>();
auto input = inputs[0];
auto output = outputs[0];
checkNcclError(ncclAllReduce(input, output, count, ncclDataType,
redOp, communicator->get(), 0));// TODO: use default stream for now
};
}
}// namespace refactor::kernel
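
The routine above calls two helpers, getRedOp and getNcclDataType, that live in utilities/cuda/nccl_communicator.hh and are not part of the rendered diff. A plausible sketch of the first, mapping this commit's AllReduceType onto NCCL's reduction ops (the real definition may differ):

#include "kernel/attributes/communication.h"
#include <nccl.h>

namespace refactor::kernel::nccl {

    // Sketch: translate the framework-level reduction to ncclRedOp_t.
    inline ncclRedOp_t getRedOp(AllReduceType type) noexcept {
        switch (type) {
            case AllReduceType::Sum:
                return ncclSum;
            case AllReduceType::Avg:
                return ncclAvg;// requires NCCL >= 2.10
            case AllReduceType::Min:
                return ncclMin;
            case AllReduceType::Max:
                return ncclMax;
            case AllReduceType::Prod:
                return ncclProd;
        }
        return ncclSum;// unreachable
    }

    // getNcclDataType would similarly map DataType onto ncclDataType_t
    // (e.g. 32-bit float -> ncclFloat); omitted here because DataType's
    // members are not shown in this diff.

}// namespace refactor::kernel::nccl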
28 changes: 28 additions & 0 deletions src/04kernel/src/kernels/all_reduce/nccl_kernel.hh
@@ -0,0 +1,28 @@
#ifndef KERNEL_ALLREDUCE_NCCL_KERNEL_HH
#define KERNEL_ALLREDUCE_NCCL_KERNEL_HH

#include "kernel/collectors/all_reduce.h"
#include "kernel/tensor.h"

namespace refactor::kernel {

struct AllReduceNccl final : public Kernel {
DataType dataType;
AllReduceType opType;
size_t size;

AllReduceNccl(AllReduceType, DataType, size_t) noexcept;

static KernelBox build(AllReduceType, Tensor const &, Tensor const &) noexcept;
static size_t typeId() noexcept;

size_t kernelTypeId() const noexcept final;
std::string_view description() const noexcept final;
#ifdef USE_CUDA
Routine lower(Resources &) const noexcept final;
#endif
};

}// namespace refactor::kernel

#endif// KERNEL_ALLREDUCE_NCCL_KERNEL_HH
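
The NcclCommunicator resource fetched in nccl_kernel.cu is defined in src/04kernel/src/utilities/cuda/nccl_communicator.hh, which is not among the rendered files. A minimal RAII sketch of such a wrapper, assuming it owns one ncclComm_t per rank (class shape and member names are guesses, not the committed code):

#include <nccl.h>
#include <stdexcept>

class NcclCommunicator {
    ncclComm_t _comm;

public:
    // worldSize/rank identify this process in the job; the unique id must
    // be created once with ncclGetUniqueId and shared with every rank
    // (e.g. over MPI or a TCP store) before construction.
    NcclCommunicator(int worldSize, int rank, ncclUniqueId id) {
        if (ncclCommInitRank(&_comm, worldSize, id, rank) != ncclSuccess) {
            throw std::runtime_error("ncclCommInitRank failed");
        }
    }
    ~NcclCommunicator() { ncclCommDestroy(_comm); }

    NcclCommunicator(NcclCommunicator const &) = delete;
    NcclCommunicator &operator=(NcclCommunicator const &) = delete;

    ncclComm_t get() const noexcept { return _comm; }
};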
6 changes: 3 additions & 3 deletions src/04kernel/src/target.cc
@@ -13,7 +13,7 @@ namespace refactor::kernel {
class BasicCpuMemManager final : public mem_manager::MemManager {
public:
static Arc<mem_manager::MemManager> instance() {
-static auto I = std::make_shared<BasicCpuMemManager>();
+auto I = std::make_shared<BasicCpuMemManager>();
return I;
}
void *malloc(size_t bytes) noexcept final {
@@ -32,12 +32,12 @@ namespace refactor::kernel {
return std::memcpy(dst, src, bytes);
}
};
-static Arc<mem_manager::MemManager> memPool = std::make_shared<mem_manager::MemPool>(4ul << 30, sizeof(uint64_t), BasicCpuMemManager::instance());
+static thread_local Arc<mem_manager::MemManager> memPool = std::make_shared<mem_manager::MemPool>(10ul << 30, sizeof(uint64_t), BasicCpuMemManager::instance());
return memPool;
}
#ifdef USE_CUDA
case NvidiaGpu: {
-static Arc<mem_manager::MemManager> memPool = std::make_shared<mem_manager::MemPool>(4ul << 30, 256, cuda::BasicCudaMemManager::instance());
+static thread_local Arc<mem_manager::MemManager> memPool = std::make_shared<mem_manager::MemPool>(10ul << 30, 256, cuda::BasicCudaMemManager::instance());
return memPool;
}
#endif
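
The memory pools in target.cc become static thread_local (and grow from 4 GiB to 10 GiB), which fits a runtime that drives each device, and hence each NCCL rank, from its own worker thread. As a standalone sanity check of the call the new kernel issues, the following single-process demo all-reduces one float across every visible GPU on the default stream, matching the TODO in nccl_kernel.cu; since one thread drives several communicators, the NCCL calls are grouped:

// Build sketch: nvcc allreduce_demo.cu -lnccl -o allreduce_demo
#include <cuda_runtime.h>
#include <nccl.h>
#include <cstdio>
#include <vector>

int main() {
    int nDev = 0;
    cudaGetDeviceCount(&nDev);

    // One communicator per visible device, all owned by this process.
    std::vector<ncclComm_t> comms(nDev);
    ncclCommInitAll(comms.data(), nDev, nullptr);

    std::vector<float *> bufs(nDev);
    for (int i = 0; i < nDev; ++i) {
        cudaSetDevice(i);
        cudaMalloc(&bufs[i], sizeof(float));
        float v = float(i + 1);
        cudaMemcpy(bufs[i], &v, sizeof(float), cudaMemcpyHostToDevice);
    }

    // In-place sum; group the calls because a single thread drives
    // multiple communicators.
    ncclGroupStart();
    for (int i = 0; i < nDev; ++i) {
        ncclAllReduce(bufs[i], bufs[i], 1, ncclFloat, ncclSum, comms[i],
                      /*stream=*/0);// default stream, as in the kernel above
    }
    ncclGroupEnd();

    for (int i = 0; i < nDev; ++i) {
        cudaSetDevice(i);
        cudaStreamSynchronize(0);
        float out = 0;
        cudaMemcpy(&out, bufs[i], sizeof(float), cudaMemcpyDeviceToHost);
        std::printf("device %d: %g\n", i, out);// expect 1 + 2 + ... + nDev
        cudaFree(bufs[i]);
        ncclCommDestroy(comms[i]);
    }
    return 0;
}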
