Commit
Merge pull request #92 from InfiniTensor/dist-merge
feat (dist): integrate the NCCL communication library; add an allreduce operator
YdrMaster authored Feb 19, 2024
2 parents e3febd9 + 16870d6 commit 34ed834
Showing 17 changed files with 666 additions and 9 deletions.
3 changes: 3 additions & 0 deletions src/04kernel/CMakeLists.txt
@@ -28,6 +28,9 @@ if(USE_CUDA)
# cudnn for conv and others
target_link_libraries(kernel PUBLIC cuda nvrtc cublas cublasLt cudnn kernel_cuda)
target_include_directories(kernel PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
find_package(NCCL REQUIRED)
target_link_libraries(kernel PUBLIC nccl)
endif()
if(USE_KUNLUN)
include_directories(${KUNLUN_HOME}/XTDK/include/)
165 changes: 165 additions & 0 deletions src/04kernel/cmake/FindNCCL.cmake
@@ -0,0 +1,165 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# From PyTorch:
#
# Copyright (c) 2016- Facebook, Inc (Adam Paszke)
# Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
# Copyright (c) 2011-2013 NYU (Clement Farabet)
# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
# Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
#
# From Caffe2:
#
# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
#
# All contributions by Facebook:
# Copyright (c) 2016 Facebook Inc.
#
# All contributions by Google:
# Copyright (c) 2015 Google Inc.
# All rights reserved.
#
# All contributions by Yangqing Jia:
# Copyright (c) 2015 Yangqing Jia
# All rights reserved.
#
# All contributions by Kakao Brain:
# Copyright 2019-2020 Kakao Brain
#
# All contributions from Caffe:
# Copyright(c) 2013, 2014, 2015, the respective contributors
# All rights reserved.
#
# All other contributions:
# Copyright(c) 2015, 2016 the respective contributors
# All rights reserved.
#
# Caffe2 uses a copyright model similar to Caffe: each contributor holds
# copyright over their contributions to Caffe2. The project versioning records
# all such contribution and copyright details. If a contributor wants to further
# mark their specific copyright on a particular contribution, they should
# indicate their copyright solely in the commit message of the change when it is
# committed.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
# and IDIAP Research Institute nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Find the nccl libraries
#
# The following variables are optionally searched for defaults
# NCCL_ROOT: Base directory where all NCCL components are found
# NCCL_INCLUDE_DIR: Directory where NCCL header is found
# NCCL_LIB_DIR: Directory where NCCL library is found
#
# The following are set after configuration is done:
# NCCL_FOUND
# NCCL_INCLUDE_DIRS
# NCCL_LIBRARIES
#
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
# install NCCL in the same location as the CUDA toolkit.
# See https://github.com/caffe2/caffe2/issues/1601

set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers")
set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries")
set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")

if ($ENV{NCCL_ROOT_DIR})
message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.")
endif()
list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})

find_path(NCCL_INCLUDE_DIRS
NAMES nccl.h
HINTS ${NCCL_INCLUDE_DIR})

if (USE_STATIC_NCCL)
MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
SET(NCCL_LIBNAME "nccl_static")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
else()
SET(NCCL_LIBNAME "nccl")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
endif()

find_library(NCCL_LIBRARIES
NAMES ${NCCL_LIBNAME}
HINTS ${NCCL_LIB_DIR})

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)

if(NCCL_FOUND) # obtaining NCCL version and some sanity checks
set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS})
include(CheckCXXSymbolExists)
check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)

if (NCCL_VERSION_DEFINED)
set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
file(WRITE ${file} "
#include <iostream>
#include <nccl.h>
int main()
{
std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl;
int x;
ncclGetVersion(&x);
return x == NCCL_VERSION_CODE;
}
")
try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}"
LINK_LIBRARIES ${NCCL_LIBRARIES})
if (NOT NCCL_VERSION_MATCHED)
message(FATAL_ERROR "Found NCCL header version and library version do not match! \
(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
endif()
message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}")
else()
# message(STATUS "NCCL version < 2.3.5-5")
endif ()
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})

message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
endif()
2 changes: 2 additions & 0 deletions src/04kernel/cuda/include/kernel/cuda/functions.cuh
@@ -6,6 +6,8 @@ namespace refactor::kernel::cuda {
int currentDevice();

void sync();

void setCudaDevice(int);

void copyOut(void *dst, const void *src, size_t size);

4 changes: 4 additions & 0 deletions src/04kernel/cuda/src/functions.cu
@@ -19,4 +19,8 @@ namespace refactor::kernel::cuda {
CUDA_ASSERT(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost));
}

void setCudaDevice(int id) {
cudaSetDevice(id);
}

}// namespace refactor::kernel::cuda
14 changes: 14 additions & 0 deletions src/04kernel/include/kernel/attributes/communication.h
@@ -0,0 +1,14 @@
#ifndef KERNEL_COMMUNICATION_ATTRIBUTES_H
#define KERNEL_COMMUNICATION_ATTRIBUTES_H

namespace refactor::kernel {
enum class AllReduceType {
Sum,
Avg,
Min,
Max,
Prod
};
}

#endif
21 changes: 21 additions & 0 deletions src/04kernel/include/kernel/collectors/all_reduce.h
@@ -0,0 +1,21 @@
#ifndef KERNEL_COLLECTOR_ALL_REDUCE_H
#define KERNEL_COLLECTOR_ALL_REDUCE_H

#include "../collector.h"
#include "kernel/attributes/communication.h"

namespace refactor::kernel {

struct AllReduceCollector final : public InfoCollector {

AllReduceType type;

constexpr AllReduceCollector(decltype(_target) target, AllReduceType type_) noexcept
: InfoCollector(target), type(type_) {}

std::vector<KernelBox>
filter(TensorRefs inputs, TensorRefs outputs) const final;
};
}// namespace refactor::kernel

#endif
20 changes: 20 additions & 0 deletions src/04kernel/src/collectors/all_reduce.cc
@@ -0,0 +1,20 @@
#include "kernel/collectors/all_reduce.h"
#include "../kernels/all_reduce/nccl_kernel.hh"
namespace refactor::kernel {
std::vector<KernelBox>
AllReduceCollector::filter(TensorRefs inputs, TensorRefs outputs) const {
std::vector<KernelBox> ans;
switch (_target) {
case decltype(_target)::Cpu:
break;
case decltype(_target)::Nvidia:
if (auto ptr = AllReduceNccl::build(type, inputs[0], outputs[0]); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
default:
UNREACHABLEX(void, "Unknown target");
}
return ans;
}
}// namespace refactor::kernel
32 changes: 32 additions & 0 deletions src/04kernel/src/kernels/all_reduce/nccl_kernel.cc
@@ -0,0 +1,32 @@
#include "nccl_kernel.hh"

namespace refactor::kernel {
using K = AllReduceNccl;
using DT = DataType;

K::AllReduceNccl(AllReduceType opType_, DT dataType_, size_t size_) noexcept
: opType(opType_), dataType(dataType_), size(size_) {}

auto K::build(AllReduceType opType_, Tensor const &input, Tensor const &output) noexcept -> KernelBox {
#ifndef USE_CUDA
return nullptr;
#endif
if (input.elementsSize() != output.elementsSize() ||
input.dataType != output.dataType) {
return nullptr;
}

return std::make_unique<K>(opType_, input.dataType, input.elementsSize());
}

auto K::typeId() noexcept -> size_t {
static uint8_t ID = 1;
return reinterpret_cast<size_t>(&ID);
}

auto K::kernelTypeId() const noexcept -> size_t { return typeId(); }
auto K::description() const noexcept -> std::string_view {
return "Performing AllReduce using NCCL";
}

}// namespace refactor::kernel
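
A note on typeId(): the address of a function-local static byte serves as a process-unique identifier, so each kernel class gets a distinct id without any central registry. A standalone illustration of the idiom (not part of the commit):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Each function owns its own static byte; its address is stable and unique
// for the lifetime of the process, which makes it a cheap type identifier.
static size_t idA() { static uint8_t ID = 1; return reinterpret_cast<size_t>(&ID); }
static size_t idB() { static uint8_t ID = 1; return reinterpret_cast<size_t>(&ID); }

int main() {
    std::printf("idA=%zu idB=%zu equal=%d\n", idA(), idB(), int(idA() == idB()));// prints equal=0
    return 0;
}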
20 changes: 20 additions & 0 deletions src/04kernel/src/kernels/all_reduce/nccl_kernel.cu
@@ -0,0 +1,20 @@
#include "nccl_kernel.hh"
#include "../../utilities/cuda/nccl_communicator.hh"
#include <nccl.h>
namespace refactor::kernel {
using K = AllReduceNccl;
using DT = DataType;
using namespace nccl;

auto K::lower(Resources &res) const noexcept -> RoutineWorkspace {
return [count = size,
redOp = getRedOp(opType),
ncclDataType = getNcclDataType(dataType)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
auto communicator = res.fetch<NcclCommunicator>();
auto input = inputs[0];
auto output = outputs[0];
checkNcclError(ncclAllReduce(input, output, count, ncclDataType,
redOp, communicator->get(), 0));// TODO: use default stream for now
};
}
}// namespace refactor::kernel
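
The routine above captures getRedOp(opType) and getNcclDataType(dataType), helpers from the nccl namespace defined in files not shown in this view. A minimal sketch of what the reduction-op mapping could look like, with the function name and namespace assumed from the call site (the data-type mapping to ncclDataType_t would follow the same switch pattern):

#include "kernel/attributes/communication.h"
#include <nccl.h>

namespace refactor::kernel::nccl {

    // Hypothetical mapping from AllReduceType to NCCL's reduction op.
    // ncclAvg requires NCCL >= 2.10; older versions would need Sum plus a
    // separate scale-by-1/worldSize step.
    inline ncclRedOp_t getRedOp(AllReduceType type) {
        switch (type) {
            case AllReduceType::Sum:  return ncclSum;
            case AllReduceType::Avg:  return ncclAvg;
            case AllReduceType::Min:  return ncclMin;
            case AllReduceType::Max:  return ncclMax;
            case AllReduceType::Prod: return ncclProd;
        }
        return ncclSum;// unreachable; silences missing-return warnings
    }

}// namespace refactor::kernel::nccl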
28 changes: 28 additions & 0 deletions src/04kernel/src/kernels/all_reduce/nccl_kernel.hh
@@ -0,0 +1,28 @@
#ifndef KERNEL_ALLREDUCE_NCCL_KERNEL_HH
#define KERNEL_ALLREDUCE_NCCL_KERNEL_HH

#include "kernel/collectors/all_reduce.h"
#include "kernel/tensor.h"

namespace refactor::kernel {

struct AllReduceNccl final : public Kernel {
AllReduceType opType;
DataType dataType;
size_t size;

AllReduceNccl(AllReduceType, DataType, size_t) noexcept;

static KernelBox build(AllReduceType, Tensor const &, Tensor const &) noexcept;
static size_t typeId() noexcept;

size_t kernelTypeId() const noexcept final;
std::string_view description() const noexcept final;
#ifdef USE_CUDA
RoutineWorkspace lower(Resources &) const noexcept final;
#endif
};

}// namespace refactor::kernel

#endif// KERNEL_ALLREDUCE_NCCL_KERNEL_HH
61 changes: 61 additions & 0 deletions src/04kernel/src/utilities/cuda/nccl_communicator.cu
@@ -0,0 +1,61 @@
#include "common.h"
#include "nccl_communicator.hh"
#include <chrono>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <thread>


namespace refactor::kernel::nccl {
NcclCommunicator::NcclCommunicator(int worldSize, int rank) : worldSize_(worldSize), rank_(rank) {
const std::string filePath("./nccl_id.bin");

ncclUniqueId commId;

if (rank == 0) {
checkNcclError(ncclGetUniqueId(&commId));
std::ofstream ofs(filePath, std::ios::binary);
ofs.write((char *) &commId, sizeof(ncclUniqueId));

} else {
auto begin = std::chrono::steady_clock::now();
while (!std::filesystem::exists(filePath)) {
auto now = std::chrono::steady_clock::now();
ASSERT(now < begin + std::chrono::seconds(10),
"time limit (10s) exceeded.");
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
std::ifstream ifs(filePath, std::ios::binary);
ifs.read((char *) &commId, sizeof(ncclUniqueId));
}
checkNcclError(ncclCommInitRank(&comm, worldSize, commId, rank));

if (rank == 0) {
std::filesystem::remove(filePath);
}

printf("Rank %d established NCCL communicator.\n", rank);
}

NcclCommunicator::~NcclCommunicator() {
checkNcclError(ncclCommFinalize(comm));
checkNcclError(ncclCommDestroy(comm));
}

auto NcclCommunicator::typeId() noexcept -> size_t {
static uint8_t ID = 1;
return reinterpret_cast<size_t>(&ID);
}
auto NcclCommunicator::build(int worldSize, int rank) noexcept -> runtime::ResourceBox {
return std::make_unique<NcclCommunicator>(worldSize, rank);
}

auto NcclCommunicator::resourceTypeId() const noexcept -> size_t {
return typeId();
}
auto NcclCommunicator::description() const noexcept -> std::string_view {
return "NcclCommunicator";
}

}// namespace refactor::kernel::nccl
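
A minimal per-rank sketch of how the communicator and ncclAllReduce fit together. Assumptions: NcclCommunicator::get() returns the underlying ncclComm_t (as the kernel's communicator->get() call implies), WORLD_SIZE/RANK are set by whatever launches the processes, and all ranks share a working directory so the nccl_id.bin rendezvous above can work:

#include "nccl_communicator.hh"// assumed include path, relative to src/04kernel/src/utilities/cuda/
#include <cuda_runtime.h>
#include <nccl.h>
#include <cstdio>
#include <cstdlib>

int main() {
    const char *ws = std::getenv("WORLD_SIZE"), *rk = std::getenv("RANK");
    int worldSize = ws ? std::atoi(ws) : 1;
    int rank = rk ? std::atoi(rk) : 0;

    cudaSetDevice(rank);// select this rank's GPU before creating the communicator

    // Rank 0 writes ./nccl_id.bin; the other ranks poll for it (see the constructor above).
    refactor::kernel::nccl::NcclCommunicator comm(worldSize, rank);

    // Each rank contributes rank + 1; after the allreduce every rank holds the sum.
    float host = float(rank + 1);
    float *dev = nullptr;
    cudaMalloc(&dev, sizeof(float));
    cudaMemcpy(dev, &host, sizeof(float), cudaMemcpyHostToDevice);

    ncclAllReduce(dev, dev, 1, ncclFloat, ncclSum, comm.get(), 0);// default stream, as in the kernel
    cudaStreamSynchronize(0);

    cudaMemcpy(&host, dev, sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("rank %d: sum = %f\n", rank, host);

    cudaFree(dev);
    return 0;
}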
