feat (dist): integrate the NCCL communication library; add the AllReduce operator #92

Merged
merged 1 commit on Feb 19, 2024
3 changes: 3 additions & 0 deletions src/04kernel/CMakeLists.txt
@@ -28,6 +28,9 @@ if(USE_CUDA)
# cudnn for conv and others
target_link_libraries(kernel PUBLIC cuda nvrtc cublas cublasLt cudnn kernel_cuda)
target_include_directories(kernel PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
find_package(NCCL REQUIRED)
target_link_libraries(kernel PUBLIC nccl)
endif()
if(USE_KUNLUN)
include_directories(${KUNLUN_HOME}/XTDK/include/)
165 changes: 165 additions & 0 deletions src/04kernel/cmake/FindNCCL.cmake
@@ -0,0 +1,165 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# From PyTorch:
#
# Copyright (c) 2016- Facebook, Inc (Adam Paszke)
# Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
# Copyright (c) 2011-2013 NYU (Clement Farabet)
# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
# Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
#
# From Caffe2:
#
# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
#
# All contributions by Facebook:
# Copyright (c) 2016 Facebook Inc.
#
# All contributions by Google:
# Copyright (c) 2015 Google Inc.
# All rights reserved.
#
# All contributions by Yangqing Jia:
# Copyright (c) 2015 Yangqing Jia
# All rights reserved.
#
# All contributions by Kakao Brain:
# Copyright 2019-2020 Kakao Brain
#
# All contributions from Caffe:
# Copyright(c) 2013, 2014, 2015, the respective contributors
# All rights reserved.
#
# All other contributions:
# Copyright(c) 2015, 2016 the respective contributors
# All rights reserved.
#
# Caffe2 uses a copyright model similar to Caffe: each contributor holds
# copyright over their contributions to Caffe2. The project versioning records
# all such contribution and copyright details. If a contributor wants to further
# mark their specific copyright on a particular contribution, they should
# indicate their copyright solely in the commit message of the change when it is
# committed.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
# and IDIAP Research Institute nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Find the nccl libraries
#
# The following variables are optionally searched for defaults
# NCCL_ROOT: Base directory where all NCCL components are found
# NCCL_INCLUDE_DIR: Directory where NCCL header is found
# NCCL_LIB_DIR: Directory where NCCL library is found
#
# The following are set after configuration is done:
# NCCL_FOUND
# NCCL_INCLUDE_DIRS
# NCCL_LIBRARIES
#
# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
# install NCCL in the same location as the CUDA toolkit.
# See https://github.com/caffe2/caffe2/issues/1601

set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers")
set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries")
set(NCCL_VERSION $ENV{NCCL_VERSION} CACHE STRING "Version of NCCL to build with")

if ($ENV{NCCL_ROOT_DIR})
message(WARNING "NCCL_ROOT_DIR is deprecated. Please set NCCL_ROOT instead.")
endif()
list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR})
# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12.
list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT})

find_path(NCCL_INCLUDE_DIRS
NAMES nccl.h
HINTS ${NCCL_INCLUDE_DIR})

if (USE_STATIC_NCCL)
MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.")
SET(NCCL_LIBNAME "nccl_static")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
else()
SET(NCCL_LIBNAME "nccl")
if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified
set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
endif()

find_library(NCCL_LIBRARIES
NAMES ${NCCL_LIBNAME}
HINTS ${NCCL_LIB_DIR})

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES)

if(NCCL_FOUND) # obtaining NCCL version and some sanity checks
set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...")
set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS})
include(CheckCXXSymbolExists)
check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED)

if (NCCL_VERSION_DEFINED)
set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc")
file(WRITE ${file} "
#include <iostream>
#include <nccl.h>
int main()
{
std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl;
int x;
ncclGetVersion(&x);
return x == NCCL_VERSION_CODE;
}
")
try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file}
RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER
CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}"
LINK_LIBRARIES ${NCCL_LIBRARIES})
if (NOT NCCL_VERSION_MATCHED)
message(FATAL_ERROR "Found NCCL header version and library version do not match! \
(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.")
endif()
message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}")
else()
# message(STATUS "NCCL version < 2.3.5-5")
endif ()
set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES})

message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
endif()
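
The embedded detect_nccl_version.cc probe above compiles against the discovered header and links the discovered library, so a header/library mismatch fails at configure time rather than at the first collective call. Below is a minimal standalone sketch of the same runtime check, assuming the NCCL >= 2.9 version encoding (MAJOR * 10000 + MINOR * 100 + PATCH); it is not part of the merged diff.

// Sketch of the version check performed by the configure-time probe above.
#include <nccl.h>
#include <cstdio>

int main() {
    int runtime = 0;
    if (ncclGetVersion(&runtime) != ncclSuccess) {
        std::fprintf(stderr, "ncclGetVersion failed\n");
        return 1;
    }
    // NCCL_VERSION_CODE is what the header was built with; a mismatch is
    // exactly what FindNCCL.cmake reports as a FATAL_ERROR above.
    std::printf("header %d, runtime %d\n", NCCL_VERSION_CODE, runtime);
    return runtime == NCCL_VERSION_CODE ? 0 : 1;
}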
2 changes: 2 additions & 0 deletions src/04kernel/cuda/include/kernel/cuda/functions.cuh
@@ -6,6 +6,8 @@ namespace refactor::kernel::cuda {
int currentDevice();

void sync();

void setCudaDevice(int);

void copyOut(void *dst, const void *src, size_t size);

4 changes: 4 additions & 0 deletions src/04kernel/cuda/src/functions.cu
@@ -19,4 +19,8 @@ namespace refactor::kernel::cuda {
CUDA_ASSERT(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost));
}

void setCudaDevice(int id) {
CUDA_ASSERT(cudaSetDevice(id));// check the call, as copyOut does above
}

}// namespace refactor::kernel::cuda
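
The new setCudaDevice wrapper exists so that each rank can bind its own GPU before any NCCL call. A short usage sketch follows; rank and deviceCount are assumed to come from the caller's launcher and are not part of this diff.

// Sketch: pin one GPU per rank before creating the NCCL communicator.
#include "kernel/cuda/functions.cuh"

void bindDeviceForRank(int rank, int deviceCount) {
    // Round-robin mapping is an assumption; any 1:1 rank-to-GPU map works.
    refactor::kernel::cuda::setCudaDevice(rank % deviceCount);
}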
14 changes: 14 additions & 0 deletions src/04kernel/include/kernel/attributes/communication.h
@@ -0,0 +1,14 @@
#ifndef KERNEL_COMMUNICATION_ATTRIBUTES_H
#define KERNEL_COMMUNICATION_ATTRIBUTES_H

namespace refactor::kernel {
enum class AllReduceType {
Sum,
Avg,
Min,
Max,
Prod
};
}

#endif
21 changes: 21 additions & 0 deletions src/04kernel/include/kernel/collectors/all_reduce.h
@@ -0,0 +1,21 @@
#ifndef KERNEL_COLLECTOR_ALL_REDUCE_H
#define KERNEL_COLLECTOR_ALL_REDUCE_H

#include "../collector.h"
#include "kernel/attributes/communication.h"

namespace refactor::kernel {

struct AllReduceCollector final : public InfoCollector {

AllReduceType type;

constexpr AllReduceCollector(decltype(_target) target, AllReduceType type_) noexcept
: InfoCollector(target), type(type_) {}

std::vector<KernelBox>
filter(TensorRefs inputs, TensorRefs outputs) const final;
};
}// namespace refactor::kernel

#endif
20 changes: 20 additions & 0 deletions src/04kernel/src/collectors/all_reduce.cc
@@ -0,0 +1,20 @@
#include "kernel/collectors/all_reduce.h"
#include "../kernels/all_reduce/nccl_kernel.hh"
namespace refactor::kernel {
std::vector<KernelBox>
AllReduceCollector::filter(TensorRefs inputs, TensorRefs outputs) const {
std::vector<KernelBox> ans;
switch (_target) {
case decltype(_target)::Cpu:
break;
case decltype(_target)::Nvidia:
if (auto ptr = AllReduceNccl::build(type, inputs[0], outputs[0]); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
default:
UNREACHABLEX(void, "Unknown target");
}
return ans;
}
}// namespace refactor::kernel
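
A hedged sketch of the caller side of this filter; the Target enum spelling and the tensor plumbing are assumptions inferred from decltype(_target)::Nvidia above, not part of this diff.

// Hypothetical caller: ask the collector for candidate AllReduce kernels.
using namespace refactor::kernel;

std::vector<KernelBox> pickAllReduce(TensorRefs inputs, TensorRefs outputs) {
    AllReduceCollector collector(Target::Nvidia, AllReduceType::Sum);
    // Empty on CPU targets; on NVIDIA, contains the NCCL kernel whenever
    // input/output dtypes and element counts match (see build() below).
    return collector.filter(inputs, outputs);
}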
32 changes: 32 additions & 0 deletions src/04kernel/src/kernels/all_reduce/nccl_kernel.cc
@@ -0,0 +1,32 @@
#include "nccl_kernel.hh"

namespace refactor::kernel {
using K = AllReduceNccl;
using DT = DataType;

K::AllReduceNccl(AllReduceType opType_, DT dataType_, size_t size_) noexcept
: opType(opType_), dataType(dataType_), size(size_) {}

auto K::build(AllReduceType opType_, Tensor const &input, Tensor const &output) noexcept -> KernelBox {
#ifndef USE_CUDA
return nullptr;
#endif
if (input.elementsSize() != output.elementsSize() ||
input.dataType != output.dataType) {
return nullptr;
}

return std::make_unique<K>(opType_, input.dataType, input.elementsSize());
}

auto K::typeId() noexcept -> size_t {
static uint8_t ID = 1;
return reinterpret_cast<size_t>(&ID);
}

auto K::kernelTypeId() const noexcept -> size_t { return typeId(); }
auto K::description() const noexcept -> std::string_view {
return "Performing AllReduce using NCCL";
}

}// namespace refactor::kernel
20 changes: 20 additions & 0 deletions src/04kernel/src/kernels/all_reduce/nccl_kernel.cu
@@ -0,0 +1,20 @@
#include "nccl_kernel.hh"
#include "../../utilities/cuda/nccl_communicator.hh"
#include <nccl.h>
namespace refactor::kernel {
using K = AllReduceNccl;
using DT = DataType;
using namespace nccl;

auto K::lower(Resources &res) const noexcept -> RoutineWorkspace {
return [count = size,
redOp = getRedOp(opType),
ncclDataType = getNcclDataType(dataType)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
auto communicator = res.fetch<NcclCommunicator>();
auto input = inputs[0];
auto output = outputs[0];
checkNcclError(ncclAllReduce(input, output, count, ncclDataType,
redOp, communicator->get(), 0));// TODO: use default stream for now
};
}
}// namespace refactor::kernel
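
getRedOp and getNcclDataType are presumably declared alongside the communicator helpers in nccl_communicator.hh (included above); their bodies are not shown in this diff. A plausible sketch of the reduction-op mapping follows; the Avg case assumes NCCL >= 2.10, which introduced ncclAvg.

// Hypothetical sketch of getRedOp; the merged implementation is not shown.
#include "kernel/attributes/communication.h"
#include <nccl.h>

namespace refactor::kernel::nccl {

    inline ncclRedOp_t getRedOp(AllReduceType type) {
        switch (type) {
            case AllReduceType::Sum: return ncclSum;
            case AllReduceType::Avg: return ncclAvg;// NCCL >= 2.10 only
            case AllReduceType::Min: return ncclMin;
            case AllReduceType::Max: return ncclMax;
            case AllReduceType::Prod: return ncclProd;
        }
        return ncclSum;// unreachable for valid enum values
    }

}// namespace refactor::kernel::nccl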
28 changes: 28 additions & 0 deletions src/04kernel/src/kernels/all_reduce/nccl_kernel.hh
@@ -0,0 +1,28 @@
#ifndef KERNEL_ALLREDUCE_NCCL_KERNEL_HH
#define KERNEL_ALLREDUCE_NCCL_KERNEL_HH

#include "kernel/collectors/all_reduce.h"
#include "kernel/tensor.h"

namespace refactor::kernel {

struct AllReduceNccl final : public Kernel {
AllReduceType opType;
DataType dataType;
size_t size;

AllReduceNccl(AllReduceType, DataType, size_t) noexcept;

static KernelBox build(AllReduceType, Tensor const &, Tensor const &) noexcept;
static size_t typeId() noexcept;

size_t kernelTypeId() const noexcept final;
std::string_view description() const noexcept final;
#ifdef USE_CUDA
RoutineWorkspace lower(Resources &) const noexcept final;
#endif
};

}// namespace refactor::kernel

#endif// KERNEL_ALLREDUCE_NCCL_KERNEL_HH
61 changes: 61 additions & 0 deletions src/04kernel/src/utilities/cuda/nccl_communicator.cu
@@ -0,0 +1,61 @@
#include "common.h"
#include "nccl_communicator.hh"
#include <chrono>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <thread>


namespace refactor::kernel::nccl {
NcclCommunicator::NcclCommunicator(int worldSize, int rank) : worldSize_(worldSize), rank_(rank) {
const std::string filePath("./nccl_id.bin");

ncclUniqueId commId;

if (rank == 0) {
checkNcclError(ncclGetUniqueId(&commId));
std::ofstream ofs(filePath, std::ios::binary);
ofs.write((char *) &commId, sizeof(ncclUniqueId));

} else {
auto begin = std::chrono::steady_clock::now();
while (!std::filesystem::exists(filePath)) {
auto now = std::chrono::steady_clock::now();
ASSERT(now < begin + std::chrono::seconds(10),
"time limit (10s) exceeded.");
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
std::ifstream ifs(filePath, std::ios::binary);
ifs.read((char *) &commId, sizeof(ncclUniqueId));
}
checkNcclError(ncclCommInitRank(&comm, worldSize, commId, rank));

if (rank == 0) {
std::filesystem::remove(filePath);
}

printf("Rank %d established NCCL communicator.\n", rank);
}

NcclCommunicator::~NcclCommunicator() {
checkNcclError(ncclCommFinalize(comm));
checkNcclError(ncclCommDestroy(comm));
}

auto NcclCommunicator::typeId() noexcept -> size_t {
static uint8_t ID = 1;
return reinterpret_cast<size_t>(&ID);
}
auto NcclCommunicator::build(int worldSize, int rank) noexcept -> runtime::ResourceBox {
return std::make_unique<NcclCommunicator>(worldSize, rank);
}

auto NcclCommunicator::resourceTypeId() const noexcept -> size_t {
return typeId();
}
auto NcclCommunicator::description() const noexcept -> std::string_view {
return "NcclCommunicator";
}

}// namespace refactor::kernel::nccl
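
Rendezvous here is file-based: rank 0 writes the ncclUniqueId to ./nccl_id.bin, the other ranks poll for the file for up to ten seconds before reading it, and every rank then calls ncclCommInitRank, so all processes must share a working directory. A hypothetical per-process driver sketch follows; the CLI shape and the one-GPU-per-rank mapping are assumptions, not part of this diff.

// Hypothetical driver: run once per rank in the same directory, e.g.
//   ./driver 0 2    and    ./driver 1 2
#include "kernel/cuda/functions.cuh"
#include "nccl_communicator.hh"
#include <cstdlib>

int main(int argc, char **argv) {
    if (argc < 3) { return 1; }
    int rank = std::atoi(argv[1]);
    int worldSize = std::atoi(argv[2]);
    refactor::kernel::cuda::setCudaDevice(rank);// one GPU per rank
    auto comm = refactor::kernel::nccl::NcclCommunicator::build(worldSize, rank);
    // comm is a runtime::ResourceBox; kernels retrieve it later via
    // res.fetch<NcclCommunicator>() as in nccl_kernel.cu above.
    return 0;
}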