From 7e1a96fb383c0d092a9837b972c699bb4a6e3603 Mon Sep 17 00:00:00 2001
From: Mark Dokter
Date: Mon, 14 Oct 2024 18:09:54 +0200
Subject: [PATCH] [DAPHNE-#844] HPC Containers for multiple CUDA platforms

- docker container that converts seamlessly to singularity
  The docker dev container contains an entrypoint script for ssh access
  that is not working in the singularity converted container without
  super user privileges. A separate daphne-dev-hpc container avoids this
  convenience functionality
- compile cuda for all hardware generations
---
 containers/build-containers.sh           | 28 ++++++-------
 containers/daphne-deps.Dockerfile        |  4 +-
 containers/daphne-dev-hpc.Dockerfile     | 51 ++++++++++++++++++++++++
 containers/publish.sh                    |  8 ++--
 software-package-versions.txt            |  2 +-
 src/runtime/local/kernels/CMakeLists.txt |  2 +
 6 files changed, 75 insertions(+), 20 deletions(-)
 create mode 100644 containers/daphne-dev-hpc.Dockerfile

diff --git a/containers/build-containers.sh b/containers/build-containers.sh
index 10e65020e..42d842c38 100755
--- a/containers/build-containers.sh
+++ b/containers/build-containers.sh
@@ -85,6 +85,7 @@ DAPHNE_TARGET=daphne-deps
 BASE_IMAGE=ubuntu:${ubuntuVersion}
 DAPHNE_TAG=$TIMESTAMP_DATE_${ARCH}
 IMAGE_REPO=daphneeu/$DAPHNE_TARGET
+DAPHNE_BUILD_FLAGS="--hdfs --mpi"
 #bulid deps stage
 build_daphne -deps
 
@@ -106,7 +107,6 @@ BASE_IMAGE=ubuntu:${ubuntuVersion}
 DAPHNE_TAG=${TIMESTAMP_DATE}_${ARCH}_BASE_ubuntu${ubuntuVersion}
 IMAGE_REPO=daphneeu/$DAPHNE_TARGET
 build_daphne -dev
-
 $USE_SUDO docker tag $IMAGE_REPO:$DAPHNE_TAG daphneeu/daphne-dev:latest_${ARCH}_BASE
 
 #------------------------------------------------------------------------------
@@ -118,19 +118,8 @@ BASE_IMAGE=nvidia/cuda:$CUDA_TAG
 DAPHNE_TAG=${TIMESTAMP_DATE}_${ARCH}_CUDA_${CUDA_TAG}
 IMAGE_REPO=daphneeu/$DAPHNE_TARGET
 build_daphne -dev
-
 $USE_SUDO docker tag $IMAGE_REPO:$DAPHNE_TAG daphneeu/daphne-dev:latest_${ARCH}_CUDA
 
-#-----------------------------------------------------------------------------
-# Images for DAPHNE development (OneAPI)
-#------------------------------------------------------------------------------
-#DAPHNE_TARGET=daphne-dev
-#ONEAPI_TAG=2023.1.0-devel-ubuntu${ubuntuVersion}
-#BASE_IMAGE=intel/oneapi:$ONEAPI_TAG
-#DAPHNE_TAG=${TIMESTAMP_DATE}_${ONEAPI_TAG}
-#IMAGE_REPO=daphneeu/$DAPHNE_TARGET
-#build_daphne -dev
-
 #------------------------------------------------------------------------------
 # Images for running DAPHNE
 #------------------------------------------------------------------------------
@@ -139,7 +128,7 @@ BASE_IMAGE=daphneeu/daphne-deps
 FINAL_BASE_IMAGE=ubuntu:${ubuntuVersion}
 DAPHNE_TAG=${TIMESTAMP_DATE}_${ARCH}_BASE_ubuntu${ubuntuVersion}
 IMAGE_REPO=daphneeu/$DAPHNE_TARGET
-DAPHNE_BUILD_FLAGS="--mpi"
+DAPHNE_BUILD_FLAGS="--hdfs --mpi"
 build_daphne
 $USE_SUDO docker tag $IMAGE_REPO:$DAPHNE_TAG daphneeu/daphne:latest_${ARCH}_BASE
 
@@ -152,8 +141,19 @@ DAPHNE_TAG=${TIMESTAMP_DATE}_${ARCH}_CUDA_${CUDA_TAG}
 IMAGE_REPO=daphneeu/$DAPHNE_TARGET
 BASE_IMAGE=daphneeu/daphne-dev
 FINAL_BASE_IMAGE=nvidia/cuda:$CUDA_TAG
-DAPHNE_BUILD_FLAGS="--mpi --cuda"
+DAPHNE_BUILD_FLAGS="--hdfs --mpi --cuda"
 build_daphne
 $USE_SUDO docker tag $IMAGE_REPO:$DAPHNE_TAG daphneeu/daphne:latest_${ARCH}_CUDA
 
+#-----------------------------------------------------------------------------
+# Images for conversion to singularity for DAPHNE compilation
+#------------------------------------------------------------------------------
+DAPHNE_TARGET=daphne-dev-hpc
+CUDA_TAG=${cudaVersion}-cudnn-devel-ubuntu${ubuntuVersion}
+BASE_IMAGE=nvidia/cuda:$CUDA_TAG
+DAPHNE_TAG=${TIMESTAMP_DATE}_${ARCH}_CUDA_${CUDA_TAG}
+IMAGE_REPO=daphneeu/$DAPHNE_TARGET
+build_daphne -dev-hpc
+$USE_SUDO docker tag $IMAGE_REPO:$DAPHNE_TAG daphneeu/daphne-dev:latest_${ARCH}_HPC
+
 set +e
diff --git a/containers/daphne-deps.Dockerfile b/containers/daphne-deps.Dockerfile
index b5b93de55..9a9131068 100644
--- a/containers/daphne-deps.Dockerfile
+++ b/containers/daphne-deps.Dockerfile
@@ -62,9 +62,11 @@ FROM build-cmake AS build
 ARG DAPHNE_DIR=/daphne
 ARG DAPHNE_REPO=https://github.com/daphne-eu/daphne.git
 ARG DAPHNE_BRANCH=main
+ARG DAPHNE_BUILD_FLAGS="--mpi --hdfs"
 RUN git clone --depth=1 --single-branch --branch=$DAPHNE_BRANCH $DAPHNE_REPO $DAPHNE_DIR
 WORKDIR $DAPHNE_DIR
-RUN ./build.sh --no-fancy --no-submodule-update --installPrefix /usr/local
+# Only append the previous LD_LIBRARY_PATH if it is non-empty: a trailing ':' would add the current directory to the library search path.
+RUN PATH=/usr/local/bin:$PATH LD_LIBRARY_PATH=/usr/local/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} ./build.sh --no-fancy --no-submodule-update --installPrefix /usr/local $DAPHNE_BUILD_FLAGS
 RUN find /usr/local -exec file {} \; | grep -e "not stripped" | cut -d ":" -f 1 | xargs strip --strip-unneeded
 RUN rm -rf $DAPHNE_DIR
 RUN ldconfig
diff --git a/containers/daphne-dev-hpc.Dockerfile b/containers/daphne-dev-hpc.Dockerfile
new file mode 100644
index 000000000..5b4516d8e
--- /dev/null
+++ b/containers/daphne-dev-hpc.Dockerfile
@@ -0,0 +1,51 @@
+# syntax=docker/dockerfile:1
+
+# Copyright 2023 The DAPHNE Consortium
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This Dockerfile provides a basic DAPHNE compilation environment with all
+# third party dependencies precompiled (use "./build.sh --no-deps --installPrefix /usr/local" to compile DAPHNE).
+# The resulting image is intended for conversion to a Singularity container.
+
+ARG BASE_IMAGE=ubuntu:20.04
+ARG CMAKE_VERSION=3.29.3
+ARG TIMESTAMP=0
+ARG TZ=Etc/UTC
+
+FROM ${BASE_IMAGE} AS daphne-dev-hpc
+ARG DEBIAN_FRONTEND="noninteractive"
+ARG TZ
+RUN apt-get -qq -y update && apt-get -y upgrade && apt-get -y --no-install-recommends install \
+    ca-certificates file git openssh-client unzip wget tar \
+    libomp-dev libpfm4-dev libssl-dev libxml2-dev uuid-dev zlib1g-dev libgsasl-dev libkrb5-dev \
+    build-essential clang gfortran lld llvm llvm-18-tools ninja-build openjdk-11-jdk-headless pkg-config python3-numpy python3-pandas \
+    vim nano rsync sudo iputils-ping virtualenv openssh-server iproute2 htop gdb lldb gpg-agent net-tools \
+    software-properties-common zstd \
+    ccache python3-pip python3-networkx python3-dev graphviz-dev clang-format \
+    && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+COPY --from=daphneeu/daphne-deps /usr/local/bin/ /usr/local/bin/
+COPY --from=daphneeu/daphne-deps /usr/local/include/ /usr/local/include/
+COPY --from=daphneeu/daphne-deps /usr/local/lib/ /usr/local/lib/
+COPY --from=daphneeu/daphne-deps /usr/local/share/ /usr/local/share/
+RUN ldconfig
+# this is a temporary workaround to make the lit code (from the llvm-*-tools package) available to some pre-Ubuntu24
+# test cases when run locally in the dev container
+RUN ln -s /usr/lib/llvm-18 /usr/lib/llvm-10
+RUN ln -fs /usr/share/zoneinfo/$TZ /etc/localtime
+# NOTE: this image intentionally neither copies entrypoint-interactive.sh nor
+# configures sshd (/var/run/sshd, EXPOSE 22) or an ENTRYPOINT: the ssh
+# entrypoint script does not work in the Singularity-converted container
+# without super user privileges.
diff --git a/containers/publish.sh b/containers/publish.sh
index 0de241a25..cd6d32e06 100755
--- a/containers/publish.sh
+++ b/containers/publish.sh
@@ -44,8 +44,8 @@ fi
 $USE_SUDO docker push -a daphneeu/github-action
 
 # cuda dev image
-$USE_SUDO docker tag daphneeu/daphne-dev:${TIMESTAMP_DATE}_${ARCH}_CUDA_${cudaVersion}-cudnn8-devel-ubuntu${ubuntuVersion} daphneeu/daphne-dev:${VERSION}_${ARCH}_CUDA_${cudaVersion}-cudnn8-devel-ubuntu${ubuntuVersion}
-$USE_SUDO docker push daphneeu/daphne-dev:${VERSION}_${ARCH}_CUDA_${cudaVersion}-cudnn8-devel-ubuntu${ubuntuVersion}
+$USE_SUDO docker tag daphneeu/daphne-dev:${TIMESTAMP_DATE}_${ARCH}_CUDA_${cudaVersion}-cudnn-devel-ubuntu${ubuntuVersion} daphneeu/daphne-dev:${VERSION}_${ARCH}_CUDA_${cudaVersion}-cudnn-devel-ubuntu${ubuntuVersion}
+$USE_SUDO docker push daphneeu/daphne-dev:${VERSION}_${ARCH}_CUDA_${cudaVersion}-cudnn-devel-ubuntu${ubuntuVersion}
 $USE_SUDO docker push daphneeu/daphne-dev:latest_${ARCH}_CUDA
 
 # base dev image
@@ -54,8 +54,8 @@ $USE_SUDO docker push daphneeu/daphne-dev:${VERSION}_${ARCH}_BASE_ubuntu${ubuntu
 $USE_SUDO docker push daphneeu/daphne-dev:latest_${ARCH}_BASE
 
 # cuda run image
-$USE_SUDO docker tag daphneeu/daphne:${TIMESTAMP_DATE}_${ARCH}_CUDA_${cudaVersion}-cudnn8-runtime-ubuntu${ubuntuVersion} daphneeu/daphne:${VERSION}_${ARCH}_CUDA_${cudaVersion}-cudnn8-runtime-ubuntu${ubuntuVersion}
-$USE_SUDO docker push daphneeu/daphne:${VERSION}_${ARCH}_CUDA_${cudaVersion}-cudnn8-runtime-ubuntu${ubuntuVersion}
+$USE_SUDO docker tag daphneeu/daphne:${TIMESTAMP_DATE}_${ARCH}_CUDA_${cudaVersion}-cudnn-runtime-ubuntu${ubuntuVersion} daphneeu/daphne:${VERSION}_${ARCH}_CUDA_${cudaVersion}-cudnn-runtime-ubuntu${ubuntuVersion}
+$USE_SUDO docker push daphneeu/daphne:${VERSION}_${ARCH}_CUDA_${cudaVersion}-cudnn-runtime-ubuntu${ubuntuVersion}
 $USE_SUDO docker push daphneeu/daphne:latest_${ARCH}_CUDA
 
 # base run image
diff --git a/software-package-versions.txt b/software-package-versions.txt
index 9a4244fff..cda7b2841 100644
--- a/software-package-versions.txt
+++ b/software-package-versions.txt
@@ -19,7 +19,7 @@ abslVersion=20230802.1
 antlrVersion=4.9.2
 arrowVersion=13.0.0
 catch2Version=2.13.8
-cmakeVersion=3.30.3
+cmakeVersion=3.30.5
 cudaVersion=12.6.1
 eigenVersion=3.4.0
 grpcVersion=1.38.0
diff --git a/src/runtime/local/kernels/CMakeLists.txt b/src/runtime/local/kernels/CMakeLists.txt
index 6d7d0cb08..22ce7a5c6 100644
--- a/src/runtime/local/kernels/CMakeLists.txt
+++ b/src/runtime/local/kernels/CMakeLists.txt
@@ -76,6 +76,8 @@ if(USE_CUDA AND CMAKE_CUDA_COMPILER)
     target_link_libraries(CUDAKernels PUBLIC DataStructures LLVMSupport MLIRDaphne MLIRDaphneTransforms CUDA::cudart CUDA::cublasLt
             CUDA::cublas CUDA::cusparse ${CUDA_cudnn_LIBRARY} CUDA::cusolver Util MLIRDaphneInference fmt::fmt)
     set_target_properties(CUDAKernels PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/lib)
+    # Compile the CUDA kernels for all GPU architectures supported by the toolkit (the value "all" needs CMake >= 3.23).
+    set_property(TARGET CUDAKernels PROPERTY CUDA_ARCHITECTURES all)
 endif()
 
 execute_process(