Skip to content

Commit

Permalink
UCC PG build in CI (pytorch#81583)
Browse files Browse the repository at this point in the history
- Modifies the existing CMake build definitions to use `find_package` to locate the UCX and UCC installations on the system
- Installs UCX and UCC in the CUDA Docker images
- Builds PyTorch with `USE_UCC=1` in the CI pipelines
- Unit tests for the UCC process group (PG) are not run yet; they will be added in future PRs.
Pull Request resolved: pytorch#81583
Approved by: https://github.com/vtlam, https://github.com/malfet
  • Loading branch information
zasdfgbnm authored and pytorchmergebot committed Aug 10, 2022
1 parent b4f7e22 commit cda210e
Show file tree
Hide file tree
Showing 10 changed files with 89 additions and 22 deletions.
12 changes: 12 additions & 0 deletions .circleci/docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ if [[ "$image" == *xenial* ]] || [[ "$image" == *bionic* ]]; then
fi

TRAVIS_DL_URL_PREFIX="https://s3.amazonaws.com/travis-python-archives/binaries/ubuntu/14.04/x86_64"
# Git refs of UCX and UCC that are forwarded to `docker build` as --build-arg
# values and checked out by install_ucc.sh.
# NOTE(review): v1.13.x looks like a branch name rather than a pinned commit, so
# image rebuilds may pick up different UCX sources — TODO confirm this is intended.
# NOTE(review): both values are assigned at script scope, so the per-image
# `UCX_COMMIT=${UCX_COMMIT}` / `UCC_COMMIT=${UCC_COMMIT}` lines in the case
# statement appear to be self-assignments — verify whether images without those
# lines are really meant to receive these values too.
UCX_COMMIT=v1.13.x
UCC_COMMIT=a7bda274b10f8adf5bb729f01da064f4e735fb23

# It's annoying to rename jobs every time you want to rewrite a
# configuration, so we hardcode everything here rather than do it
Expand Down Expand Up @@ -147,6 +149,8 @@ case "$image" in
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${UCX_COMMIT}
UCC_COMMIT=${UCC_COMMIT}
;;
pytorch-linux-bionic-cuda11.7-cudnn8-py3-gcc7)
CUDA_VERSION=11.7.0
Expand All @@ -157,6 +161,8 @@ case "$image" in
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${UCX_COMMIT}
UCC_COMMIT=${UCC_COMMIT}
;;
pytorch-linux-xenial-py3-clang5-asan)
ANACONDA_PYTHON_VERSION=3.7
Expand Down Expand Up @@ -277,6 +283,8 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
UCX_COMMIT=${UCX_COMMIT}
UCC_COMMIT=${UCC_COMMIT}
;;
pytorch-linux-jammy-cuda11.7-cudnn8-py3.8-clang12)
ANACONDA_PYTHON_VERSION=3.8
Expand All @@ -286,6 +294,8 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
UCX_COMMIT=${UCX_COMMIT}
UCC_COMMIT=${UCC_COMMIT}
;;
*)
# Catch-all for builds that are not hardcoded.
Expand Down Expand Up @@ -375,6 +385,8 @@ docker build \
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx900;gfx906}" \
--build-arg "IMAGE_NAME=${IMAGE_NAME}" \
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \
-f $(dirname ${DOCKERFILE})/Dockerfile \
-t "$tmp_tag" \
"$@" \
Expand Down
3 changes: 2 additions & 1 deletion .circleci/docker/common/install_base.sh
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ install_ubuntu() {
wget \
sudo \
vim \
jq
jq \
libtool

# Should resolve issues related to various apt package repository cert issues
# see: https://github.com/pytorch/pytorch/issues/65931
Expand Down
41 changes: 41 additions & 0 deletions .circleci/docker/common/install_ucc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash

set -ex

# Build UCX from source at ${UCX_COMMIT} and install it under ${UCX_HOME}.
#
# Reads:
#   UCX_COMMIT - git ref to check out; must be non-empty.
#   UCX_HOME   - install prefix handed to ./configure; must be non-empty.
#
# The clone is created in the current directory and removed after installation.
function install_ucx() {
  set -ex
  # Fail early: with an empty ref, `git checkout` would be a silent no-op and the
  # build would use whatever the default branch currently points at.
  : "${UCX_COMMIT:?UCX_COMMIT must be set}"
  : "${UCX_HOME:?UCX_HOME must be set}"

  git clone --recursive https://github.com/openucx/ucx.git
  pushd ucx
  git checkout "${UCX_COMMIT}"
  # Re-sync submodules to the revisions recorded by the checked-out ref.
  git submodule update --init --recursive

  ./autogen.sh
  ./configure --prefix="${UCX_HOME}" \
    --enable-mt \
    --enable-profiling \
    --enable-stats
  # A bare `make -j` puts no limit on concurrent jobs; cap it at the CPU count.
  time make -j"$(nproc)"
  sudo make install

  popd
  rm -rf ucx
}

# Build UCC from source at ${UCC_COMMIT} and install it under ${UCC_HOME}.
#
# Reads:
#   UCC_COMMIT - git ref to check out; must be non-empty.
#   UCC_HOME   - install prefix handed to ./configure; must be non-empty.
#   UCX_HOME   - prefix of the UCX installation to build against; must be non-empty.
#
# NCCL support is explicitly disabled (--with-nccl=no). The clone is created in
# the current directory and removed after installation.
function install_ucc() {
  set -ex
  # Fail early: with an empty ref, `git checkout` would be a silent no-op and the
  # build would use whatever the default branch currently points at.
  : "${UCC_COMMIT:?UCC_COMMIT must be set}"
  : "${UCC_HOME:?UCC_HOME must be set}"
  : "${UCX_HOME:?UCX_HOME must be set}"

  git clone --recursive https://github.com/openucx/ucc.git
  pushd ucc
  git checkout "${UCC_COMMIT}"
  # Re-sync submodules to the revisions recorded by the checked-out ref.
  git submodule update --init --recursive

  ./autogen.sh
  ./configure --prefix="${UCC_HOME}" --with-ucx="${UCX_HOME}" --with-nccl=no
  # A bare `make -j` puts no limit on concurrent jobs; cap it at the CPU count.
  time make -j"$(nproc)"
  sudo make install

  popd
  rm -rf ucc
}

# Order matters: UCC is configured with --with-ucx=$UCX_HOME, so UCX has to be
# installed before UCC is built.
install_ucx
install_ucc
11 changes: 11 additions & 0 deletions .circleci/docker/ubuntu-cuda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,17 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
RUN rm install_vision.sh
ENV INSTALLED_VISION ${VISION}

# (optional) Install UCC
# UCX and UCC are built from source only when both commit build args are non-empty.
ARG UCX_COMMIT
ARG UCC_COMMIT
ENV UCX_COMMIT $UCX_COMMIT
ENV UCC_COMMIT $UCC_COMMIT
# Install prefixes read by install_ucc.sh.
ENV UCX_HOME /usr
ENV UCC_HOME /usr
# COPY rather than ADD: this is a plain local file, so ADD's extra archive/URL
# handling is not needed, and COPY matches the install_openssl.sh step below.
COPY ./common/install_ucc.sh install_ucc.sh
RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi
RUN rm install_ucc.sh

COPY ./common/install_openssl.sh install_openssl.sh
ENV OPENSSL_ROOT_DIR /opt/openssl
RUN bash ./install_openssl.sh
Expand Down
11 changes: 11 additions & 0 deletions .circleci/docker/ubuntu/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,17 @@ RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
ENV DESIRED_CUDA ${CUDA_VERSION}
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH

# (optional) Install UCC
# UCX and UCC are built from source only when both commit build args are non-empty.
ARG UCX_COMMIT
ARG UCC_COMMIT
ENV UCX_COMMIT $UCX_COMMIT
ENV UCC_COMMIT $UCC_COMMIT
# Install prefixes read by install_ucc.sh.
ENV UCX_HOME /usr
ENV UCC_HOME /usr
# COPY rather than ADD: this is a plain local file, so ADD's extra archive/URL
# handling is not needed, and COPY matches the install_protobuf.sh step below.
COPY ./common/install_ucc.sh install_ucc.sh
RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi
RUN rm install_ucc.sh

# (optional) Install protobuf for ONNX
ARG PROTOBUF
COPY ./common/install_protobuf.sh install_protobuf.sh
Expand Down
4 changes: 4 additions & 0 deletions .jenkins/pytorch/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ fi
# CUDA 11 builds always enable the split torch_cuda build option in CMake.
# Every CUDA 11 environment except cuda11.3 additionally builds with the
# system-installed UCC.
# NOTE(review): presumably the cuda11.3 images do not ship UCX/UCC — confirm.
case "$BUILD_ENVIRONMENT" in
  *cuda11.3*)
    export BUILD_SPLIT_CUDA=ON
    ;;
  *cuda11*)
    export BUILD_SPLIT_CUDA=ON
    export USE_UCC=1
    export USE_SYSTEM_UCC=1
    ;;
esac

if [[ ${BUILD_ENVIRONMENT} == *"caffe2"* || ${BUILD_ENVIRONMENT} == *"onnx"* ]]; then
Expand Down
5 changes: 0 additions & 5 deletions caffe2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -918,11 +918,6 @@ if(HAVE_SOVERSION)
VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
endif()

if(USE_UCC)
target_link_libraries(torch_cpu PRIVATE __caffe2_ucc)
target_compile_definitions(torch_cpu PRIVATE USE_UCC)
endif()

if(USE_ROCM)
filter_list(__caffe2_hip_srcs_cpp Caffe2_HIP_SRCS "\\.(cu|hip)$")
set_source_files_properties(${__caffe2_hip_srcs_cpp} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
Expand Down
19 changes: 7 additions & 12 deletions cmake/External/ucc.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,14 @@ if(NOT __UCC_INCLUDED)
set(__UCC_INCLUDED TRUE)

if(USE_SYSTEM_UCC)
set(UCX_HOME $ENV{UCX_HOME} CACHE PATH "UCX install directory")
set(UCC_HOME $ENV{UCC_HOME} CACHE PATH "UCC install directory")

add_library(__caffe2_ucc INTERFACE)

target_include_directories(__caffe2_ucc INTERFACE ${UCX_HOME}/include/)
target_include_directories(__caffe2_ucc INTERFACE ${UCC_HOME}/include/)

target_link_libraries(__caffe2_ucc INTERFACE ${UCX_HOME}/lib/libucp.so)
target_link_libraries(__caffe2_ucc INTERFACE ${UCX_HOME}/lib/libucs.so)
target_link_libraries(__caffe2_ucc INTERFACE ${UCC_HOME}/lib/libucc.so)
# Locate the system-installed UCC and UCX packages; REQUIRED makes configuration
# fail when either one cannot be found.
find_package(UCC REQUIRED)
find_package(UCX REQUIRED)
if(UCX_FOUND AND UCC_FOUND)
  # __caffe2_ucc is an INTERFACE target: it builds nothing itself and only
  # forwards the UCC include directories and the imported UCX/UCC targets to
  # whatever links against it.
  add_library(__caffe2_ucc INTERFACE)
  target_include_directories(__caffe2_ucc INTERFACE ${UCC_INCLUDE_DIRS})
  target_link_libraries(__caffe2_ucc INTERFACE ucx::ucs)
  target_link_libraries(__caffe2_ucc INTERFACE ucx::ucp)
  target_link_libraries(__caffe2_ucc INTERFACE ucc::ucc)
endif()
else()
message(FATAL_ERROR "USE_SYSTEM_UCC=OFF is not supported yet when using UCC")
endif()

endif()
3 changes: 0 additions & 3 deletions torch/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -259,9 +259,6 @@ if(USE_DISTRIBUTED)
if(USE_NCCL)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
endif()
if(USE_UCC)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_ucc)
endif()
# Same for MPI.
if(USE_MPI)
list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${MPI_CXX_LIBRARIES})
Expand Down
2 changes: 1 addition & 1 deletion torch/distributed/distributed_c10d.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,10 @@

# Optional UCC backend: try to import ProcessGroupUCC from the C extension.
try:
from torch._C._distributed_c10d import ProcessGroupUCC
# Present the class as belonging to torch.distributed.distributed_c10d.
ProcessGroupUCC.__module__ = "torch.distributed.distributed_c10d"
except ImportError:
# The import failed, so record that the UCC backend is unavailable.
_UCC_AVAILABLE = False


logger = logging.getLogger(__name__)

PG_WRAPPER_STORE_PREFIX = "pg_wrapper"
Expand Down

0 comments on commit cda210e

Please sign in to comment.