From 81bd2f433c9a2c143497976908435255c1e2cda2 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 9 Jan 2025 12:18:45 -0800 Subject: [PATCH] [gpu] Exercise new template-generated GPU driver installer --- cloudbuild/Dockerfile | 3 +- gpu/install_gpu_driver.sh | 993 ++++++++++++------------- gpu/test_gpu.py | 6 - templates/gpu/install_gpu_driver.sh.in | 1 + 4 files changed, 482 insertions(+), 521 deletions(-) diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile index aebaffd84..644219305 100644 --- a/cloudbuild/Dockerfile +++ b/cloudbuild/Dockerfile @@ -22,7 +22,8 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg | \ dd of="${bazel_repo_file}" status=none && \ apt-get update -qq RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \ - apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \ + apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} \ + libtemplate-perl > /dev/null 2>&1 && \ apt-get clean # Set bazel-${bazel_version} as the default bazel alternative in this container diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 91ad4ede0..07929ca59 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -190,45 +190,6 @@ function set_hadoop_property() { --clobber } -function configure_yarn_resources() { - if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts - if [[ ! 
-f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then - printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" - fi - set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' - - set_hadoop_property 'capacity-scheduler.xml' \ - 'yarn.scheduler.capacity.resource-calculator' \ - 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' - - set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' -} - -# This configuration should be applied only if GPU is attached to the node -function configure_yarn_nodemanager() { - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.container-executor.class' \ - 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' - - # Fix local dirs access permissions - local yarn_local_dirs=() - - readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ - --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ - --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') - - if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then - chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" - fi -} - function clean_up_sources_lists() { # # bigtop (primary) @@ -364,7 +325,7 @@ function is_ramdisk() { function mount_ramdisk(){ local free_mem free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" - if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi + if [[ ${free_mem} -lt 20500000 ]]; then return 0 ; fi # Write to a ramdisk instead of churning the persistent disk @@ -396,79 +357,6 @@ function check_os() { exit 1 fi - 
SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" - readonly SPARK_VERSION - if version_lt "${SPARK_VERSION}" "3.1" || \ - version_ge "${SPARK_VERSION}" "4.0" ; then - echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." - exit 1 - fi - - # Detect dataproc image version - if (! test -v DATAPROC_IMAGE_VERSION) ; then - if test -v DATAPROC_VERSION ; then - DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" - else - if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" - elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" - elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" - else echo "Unknown dataproc image version" ; exit 1 ; fi - fi - fi -} - -# -# Generate repo file under /etc/apt/sources.list.d/ -# -function apt_add_repo() { - local -r repo_name="$1" - local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. 
argumentN" - local -r include_src="${4:-yes}" - local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" - local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}" - - echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" - if [[ "${include_src}" == "yes" ]] ; then - echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" - fi - - apt-get update -qq -} - -# -# Generate repo file under /etc/yum.repos.d/ -# -function dnf_add_repo() { - local -r repo_name="$1" - local -r repo_url="$3" # "http(s)://host/path/filename.repo" - local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" - local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" - - curl -s -L "${repo_url}" \ - | dd of="${repo_path}" status=progress -# | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ -} - -# -# Keyrings default to -# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or -# /etc/pki/rpm-gpg/${repo_name}.gpg (rocky/RHEL) -# -function os_add_repo() { - local -r repo_name="$1" - local -r signing_key_url="$2" - local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. 
argumentN" - local kr_path - if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" - else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi - - mkdir -p "$(dirname "${kr_path}")" - - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \ - | gpg --import --no-default-keyring --keyring "${kr_path}" - - if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" - else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi } function configure_dkms_certs() { @@ -581,19 +469,35 @@ function restart_knox() { systemctl start knox } +function is_complete() { + phase="$1" + test -f "${workdir}/complete/${phase}" +} + +function mark_complete() { + phase="$1" + touch "${workdir}/complete/${phase}" +} + +function mark_incomplete() { + phase="$1" + rm -f "${workdir}/complete/${phase}" +} + function install_dependencies() { - test -f "${workdir}/complete/install-dependencies" && return 0 + is_complete install-dependencies && return 0 + pkg_list="screen" if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi - touch "${workdir}/complete/install-dependencies" + mark_complete install-dependencies } function prepare_pip_env() { # Clear pip cache # TODO: make this conditional on which OSs have pip without cache purge - test -d "${tmpdir}/python-venv" || python3 -m venv "${tmpdir}/python-venv" - source "${tmpdir}/python-venv/bin/activate" + test -d "${workdir}/python-venv" || python3 -m venv "${workdir}/python-venv" + source "${workdir}/python-venv/bin/activate" pip cache purge || echo "unable to purge pip cache" if is_ramdisk ; then @@ -603,44 +507,42 @@ function prepare_pip_env() { fi } +function prepare_conda_env() { + CONDA=/opt/conda/miniconda3/bin/conda + touch ~/.condarc + cp ~/.condarc ~/.condarc.default + if is_ramdisk ; 
then
+    # Download conda packages to tmpfs
+    mkdir -p "${tmpdir}/conda_cache"
+    ${CONDA} config --add pkgs_dirs "${tmpdir}/conda_cache"
+  fi
+}
 
 function prepare_common_env() {
-  define_os_comparison_functions
-
   # Verify OS compatability and Secure boot state
   check_os
   check_secure_boot
 
-  readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
-
-  # Dataproc configurations
-  readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
-  readonly HIVE_CONF_DIR='/etc/hive/conf'
-  readonly SPARK_CONF_DIR='/etc/spark/conf'
-
+  # read-only configuration variables
+  _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
+  HADOOP_CONF_DIR='/etc/hadoop/conf'
+  HIVE_CONF_DIR='/etc/hive/conf'
   OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
-  readonly OS_NAME
-
-  # node role
   ROLE="$(get_metadata_attribute dataproc-role)"
-  readonly ROLE
-
-  # master node
   MASTER="$(get_metadata_attribute dataproc-master)"
-  readonly MASTER
-
   workdir=/opt/install-dpgce
-  tmpdir=/tmp/
   temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
-  readonly temp_bucket
-  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
+  pkg_bucket="gs://${temp_bucket}/dpgce-packages"
   uname_r=$(uname -r)
-  readonly uname_r
-  readonly bdcfg="/usr/local/bin/bdconfig"
-  export DEBIAN_FRONTEND=noninteractive
+  bdcfg="/usr/local/bin/bdconfig"
+  KNOX_HOME=/usr/lib/knox
 
-  # Knox config
-  readonly KNOX_HOME=/usr/lib/knox
+  readonly HADOOP_CONF_DIR HIVE_CONF_DIR OS_NAME ROLE MASTER workdir
+  readonly temp_bucket pkg_bucket uname_r bdcfg KNOX_HOME
+
+  tmpdir=/tmp/
+
+  export DEBIAN_FRONTEND=noninteractive
 
   mkdir -p "${workdir}/complete"
   set_proxy
@@ -648,7 +550,7 @@
 
   readonly install_log="${tmpdir}/install.log"
 
-  if test -f "${workdir}/complete/prepare.common" ; then return ; fi
+  is_complete prepare.common && return
 
   repair_old_backports
 
@@ -666,23 +568,24 @@
     dnf clean all
   fi
 
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute 
creating-image)" ]]; then + # When creating a disk image: + if [[ -n "$(get_metadata_attribute creating-image "")" ]]; then + df / > "/run/disk-usage.log" - ( set +e + # zero free disk space + ( set +e time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero ) install_dependencies # Monitor disk usage in a screen session - df / > "/run/disk-usage.log" touch "/run/keep-running-df" screen -d -m -LUS keep-running-df \ bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" fi - touch "${workdir}/complete/prepare.common" + mark_complete prepare.common } function pip_exit_handler() { @@ -692,29 +595,22 @@ function pip_exit_handler() { fi } +function conda_exit_handler() { + mv ~/.condarc.default ~/.condarc +} + function common_exit_handler() { set +ex echo "Exit handler invoked" - # Restart YARN services if they are running already - for svc in resourcemanager nodemanager; do - if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then - systemctl stop "hadoop-yarn-${svc}.service" - systemctl start "hadoop-yarn-${svc}.service" - fi - done - # If system memory was sufficient to mount memory-backed filesystems - if [[ "${tmpdir}" == "/mnt/shm" ]] ; then + if is_ramdisk ; then # Clean up shared memory mounts for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then umount -f ${shmdir} fi done - - # restart services stopped during preparation stage - # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? 
((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' fi if is_debuntu ; then @@ -787,6 +683,63 @@ print( " samples-taken: ", scalar @siz, $/, echo "exit_handler has completed" } +define_os_comparison_functions + + +# +# Generate repo file under /etc/apt/sources.list.d/ +# +function apt_add_repo() { + local -r repo_name="$1" + local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" + local -r include_src="${4:-yes}" + local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" + local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}" + + echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" + if [[ "${include_src}" == "yes" ]] ; then + echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" + fi + + apt-get update -qq +} + +# +# Generate repo file under /etc/yum.repos.d/ +# +function dnf_add_repo() { + local -r repo_name="$1" + local -r repo_url="$3" # "http(s)://host/path/filename.repo" + local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" + local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" + + curl -s -L "${repo_url}" \ + | dd of="${repo_path}" status=progress +# | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ +} + +# +# Keyrings default to +# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or +# /etc/pki/rpm-gpg/${repo_name}.gpg (rocky/RHEL) +# +function os_add_repo() { + local -r repo_name="$1" + local -r signing_key_url="$2" + local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. 
argumentN" + local kr_path + if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" + else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi + + mkdir -p "$(dirname "${kr_path}")" + + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \ + | gpg --import --no-default-keyring --keyring "${kr_path}" + + if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" + else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi +} + function set_support_matrix() { # CUDA version and Driver version @@ -839,8 +792,6 @@ function set_support_matrix() { ) } -set_support_matrix - function set_cuda_version() { case "${DATAPROC_IMAGE_VERSION}" in "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) @@ -930,50 +881,133 @@ function set_driver_version() { fi } -function set_cudnn_version() { - readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" - readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" - - # Parameters for NVIDIA-provided cuDNN library - readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} - CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") - # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} - if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then - CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}" - elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then - # cuDNN v8 is not distribution for ubuntu20+, debian12 - CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" - elif (le_ubuntu18 || le_debian11) && [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; then - # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 - CUDNN_VERSION="8.8.0.121" - fi - readonly CUDNN_VERSION -} +function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == 
"NVIDIA" ]] ; ) +function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) +function nvsmi() { + local nvsmi="/usr/bin/nvidia-smi" + if [[ "${nvsmi_works}" == "1" ]] ; then echo -n '' + elif [[ ! -f "${nvsmi}" ]] ; then echo "nvidia-smi not installed" >&2 ; return 0 + elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0 + else nvsmi_works="1" ; fi -function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) -function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) + if test -v 1 && [[ "$1" == "-L" ]] ; then + local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt" + if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}" + else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi -function set_cuda_repo_shortname() { -# Short name for urls -# https://developer.download.nvidia.com/compute/cuda/repos/${shortname} - if is_rocky ; then - shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" - else - shortname="$(os_id)$(os_vercat)" + return 0 fi -} - -function set_nv_urls() { - # Parameters for NVIDIA-provided package repositories - readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' - readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" - # Parameter for NVIDIA-provided Rocky Linux GPU driver - readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" + "${nvsmi}" $* } -function set_cuda_runfile_url() { +function clear_nvsmi_cache() { + if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then + rm "${nvsmi_query_xml}" + fi +} + +function query_nvsmi() { + if [[ "${nvsmi_works}" != "1" ]] ; then return ; fi + if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi + nvsmi -q -x --dtd > "${nvsmi_query_xml}" +} + +function prepare_gpu_env(){ + set_support_matrix + + set_cuda_version + set_driver_version + + set +e + gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | 
wc -l)"
+  set -e
+  echo "gpu_count=[${gpu_count}]"
+  nvsmi_works="0"
+  nvsmi_query_xml="${tmpdir}/nvsmi.xml"
+  xmllint="/opt/conda/miniconda3/bin/xmllint"
+  NVIDIA_SMI_PATH='/usr/bin'
+  MIG_MAJOR_CAPS=0
+  IS_MIG_ENABLED=0
+  CUDNN_PKG_NAME=""
+  CUDNN8_PKG_NAME=""
+  CUDA_LOCAL_REPO_INSTALLED="0"
+
+  if ! test -v DEFAULT_RAPIDS_RUNTIME ; then
+    readonly DEFAULT_RAPIDS_RUNTIME='SPARK'
+  fi
+
+  # Verify SPARK compatibility
+  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}")
+  readonly RAPIDS_RUNTIME
+
+  # determine whether we have nvidia-smi installed and working
+  nvsmi
+}
+
+# Hold all NVIDIA-related packages from upgrading unintentionally or services like unattended-upgrades
+# Users should run apt-mark unhold before they wish to upgrade these packages
+function hold_nvidia_packages() {
+  if ! is_debuntu ; then return ; fi
+
+  apt-mark hold nvidia-*
+  apt-mark hold libnvidia-*
+  if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
+    apt-mark hold xserver-xorg-video-nvidia*
+  fi
+}
+
+function gpu_exit_handler() {
+  echo "no operations in gpu exit handler"
+}
+
+
+function set_cudnn_version() {
+  readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
+  readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
+
+  # Parameters for NVIDIA-provided cuDNN library
+  DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
+  readonly DEFAULT_CUDNN_VERSION
+  CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
+  # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
+  if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
+    CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
+  elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then
+    # cuDNN v8 is not distributed for ubuntu20+, debian12
+    CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
+  elif (le_ubuntu18 || le_debian11) && [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; then
+    # cuDNN v9 is not distributed for ubuntu18, debian10, 
debian11 ; fall back to 8 + CUDNN_VERSION="8.8.0.121" + fi + readonly CUDNN_VERSION +} + + +function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) +function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) + +function set_cuda_repo_shortname() { +# Short name for urls +# https://developer.download.nvidia.com/compute/cuda/repos/${shortname} + if is_rocky ; then + shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" + else + shortname="$(os_id)$(os_vercat)" + fi +} + +function set_nv_urls() { + # Parameters for NVIDIA-provided package repositories + readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' + readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" + + # Parameter for NVIDIA-provided Rocky Linux GPU driver + readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" +} + +function set_cuda_runfile_url() { local MAX_DRIVER_VERSION local MAX_CUDA_VERSION @@ -1095,7 +1129,7 @@ function uninstall_cuda_keyring_pkg() { } function install_local_cuda_repo() { - if test -f "${workdir}/complete/install-local-cuda-repo" ; then return ; fi + is_complete install-local-cuda-repo && return if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi CUDA_LOCAL_REPO_INSTALLED="1" @@ -1118,7 +1152,7 @@ function install_local_cuda_repo() { -o /etc/apt/preferences.d/cuda-repository-pin-600 fi - touch "${workdir}/complete/install-local-cuda-repo" + mark_complete install-local-cuda-repo } function uninstall_local_cuda_repo(){ apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" @@ -1126,7 +1160,8 @@ function uninstall_local_cuda_repo(){ } function install_local_cudnn_repo() { - if test -f "${workdir}/complete/install-local-cudnn-repo" ; then return ; fi + is_complete install-local-cudnn-repo && return + pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" CUDNN_PKG_NAME="${pkgname}" local_deb_fn="${pkgname}_1.0-1_amd64.deb" @@ -1142,7 +1177,7 @@ function 
install_local_cudnn_repo() { cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings - touch "${workdir}/complete/install-local-cudnn-repo" + mark_complete install-local-cudnn-repo } function uninstall_local_cudnn_repo() { @@ -1151,7 +1186,7 @@ function uninstall_local_cudnn_repo() { } function install_local_cudnn8_repo() { - if test -f "${workdir}/complete/install-local-cudnn8-repo" ; then return ; fi + is_complete install-local-cudnn8-repo && return if is_ubuntu ; then cudnn8_shortname="ubuntu2004" elif is_debian ; then cudnn8_shortname="debian11" @@ -1185,19 +1220,19 @@ function install_local_cudnn8_repo() { rm -f "${local_deb_fn}" cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings - touch "${workdir}/complete/install-local-cudnn8-repo" + mark_complete install-local-cudnn8-repo } function uninstall_local_cudnn8_repo() { apt-get purge -yq "${CUDNN8_PKG_NAME}" - rm -f "${workdir}/complete/install-local-cudnn8-repo" + mark_incomplete install-local-cudnn8-repo } function install_nvidia_nccl() { readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) - if test -f "${workdir}/complete/nccl" ; then return ; fi + is_complete nccl && return if is_cuda11 && is_debian12 ; then echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}" @@ -1288,14 +1323,12 @@ function install_nvidia_nccl() { fi popd - touch "${workdir}/complete/nccl" + mark_complete nccl } -function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; ) -function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) - function install_nvidia_cudnn() { - if test -f "${workdir}/complete/cudnn" ; then return ; fi + is_complete cudnn && return + local major_version major_version="${CUDNN_VERSION%%.*}" local cudnn_pkg_version @@ -1354,7 +1387,7 @@ function install_nvidia_cudnn() { ldconfig echo "NVIDIA cuDNN successfully installed for 
${_shortname}." - touch "${workdir}/complete/cudnn" + mark_complete cudnn } function add_nonfree_components() { @@ -1511,7 +1544,8 @@ function install_nvidia_userspace_runfile() { # # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it. - if test -f "${workdir}/complete/userspace" ; then return ; fi + is_complete userspace && return + local local_fn="${tmpdir}/userspace.run" cache_fetched_package "${USERSPACE_URL}" \ @@ -1569,7 +1603,7 @@ function install_nvidia_userspace_runfile() { depmod -a else clear_dkms_key - tar czvf "${local_tarball}" \ + tar czf "${local_tarball}" \ /var/log/nvidia-installer.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') gcloud storage cp "${local_tarball}" "${gcs_tarball}" @@ -1577,12 +1611,13 @@ function install_nvidia_userspace_runfile() { fi rm -f "${local_fn}" - touch "${workdir}/complete/userspace" + mark_complete userspace sync } function install_cuda_runfile() { - if test -f "${workdir}/complete/cuda" ; then return ; fi + is_complete cuda && return + local local_fn="${tmpdir}/cuda.run" cache_fetched_package "${NVIDIA_CUDA_URL}" \ @@ -1591,7 +1626,7 @@ function install_cuda_runfile() { execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" rm -f "${local_fn}" - touch "${workdir}/complete/cuda" + mark_complete cuda sync } @@ -1615,9 +1650,12 @@ function install_cuda_toolkit() { } function load_kernel_module() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi # for some use cases, the kernel module needs to be removed before first use of nvidia-smi for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do - rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" + ( set +e + rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" + ) done depmod -a @@ -1629,7 +1667,8 @@ 
function load_kernel_module() {
 }
 
 function install_cuda(){
-  if test -f "${workdir}/complete/cuda-repo" ; then return ; fi
+  is_complete cuda-repo && return
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
 
   if ( ge_debian12 && is_src_os ) ; then
     echo "installed with the driver on ${_shortname}"
@@ -1642,10 +1681,12 @@ function install_cuda(){
   # Includes CUDA packages
   add_repo_cuda
 
-  touch "${workdir}/complete/cuda-repo"
+  mark_complete cuda-repo
 }
 
 function install_nvidia_container_toolkit() {
+  is_complete install-nvtk && return
+
   local container_runtime_default
   if command -v docker ; then container_runtime_default='docker'
   elif command -v containerd ; then container_runtime_default='containerd'
@@ -1661,11 +1702,14 @@ function install_nvidia_container_toolkit() {
     execute_with_retries dnf install -y -q nvidia-container-toolkit ; fi
   nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
   systemctl restart "${CONTAINER_RUNTIME}"
+
+  mark_complete install-nvtk
 }
 
 # Install NVIDIA GPU driver provided by NVIDIA
 function install_nvidia_gpu_driver() {
-  if test -f "${workdir}/complete/gpu-driver" ; then return ; fi
+  is_complete gpu-driver && return
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
 
   if ( ge_debian12 && is_src_os ) ; then
     add_nonfree_components
@@ -1687,11 +1731,11 @@ function install_nvidia_gpu_driver() {
     build_driver_from_github
 
   echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
-  touch "${workdir}/complete/gpu-driver"
+  mark_complete gpu-driver
 }
 
 function install_ops_agent(){
-  if test -f "${workdir}/complete/ops-agent" ; then return ; fi
+  is_complete ops-agent && return
 
   mkdir -p /opt/google
   cd /opt/google
@@ -1699,11 +1743,12 @@ function install_ops_agent(){
   curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
   execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install
 
-  touch "${workdir}/complete/ops-agent"
+  mark_complete ops-agent
 }
 
 # Collects 'gpu_utilization' and 
'gpu_memory_utilization' metrics function install_gpu_monitoring_agent() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi download_gpu_monitoring_agent install_gpu_monitoring_agent_dependency start_gpu_monitoring_agent_service @@ -1724,9 +1769,12 @@ function download_gpu_monitoring_agent(){ function install_gpu_monitoring_agent_dependency(){ cd /opt/google/compute-gpu-monitoring/linux - python3 -m venv venv - venv/bin/pip install wheel - venv/bin/pip install -Ur requirements.txt + /opt/conda/miniconda3/bin/python3 -m venv venv + ( + source venv/bin/activate + pip install wheel + pip install -Ur requirements.txt + ) } function start_gpu_monitoring_agent_service(){ @@ -1752,7 +1800,7 @@ function install_gpu_agent() { | sed -e 's/-u --format=/--format=/' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" local venv="${install_dir}/venv" - python3 -m venv "${venv}" + /opt/conda/miniconda3/bin/python3 -m venv "${venv}" ( source "${venv}/bin/activate" python3 -m pip install --upgrade pip @@ -1784,6 +1832,7 @@ EOF } function configure_gpu_exclusive_mode() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi # only run this function when spark < 3.0 if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi # include exclusive mode on GPU @@ -1791,52 +1840,139 @@ function configure_gpu_exclusive_mode() { clear_nvsmi_cache } -function fetch_mig_scripts() { - mkdir -p /usr/local/yarn-mig-scripts - chmod 755 /usr/local/yarn-mig-scripts - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh - chmod 755 /usr/local/yarn-mig-scripts/* +function install_build_dependencies() { + is_complete build-dependencies && return + + if is_debuntu ; then + if is_ubuntu22 && is_cuda12 ; then + # 
On ubuntu22, the default compiler does not build some kernel module versions + # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 + execute_with_retries apt-get install -y -qq gcc-12 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 + update-alternatives --set gcc /usr/bin/gcc-12 + fi + + elif is_rocky ; then + execute_with_retries dnf -y -q install gcc + + local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" + set +e + eval "${dnf_cmd}" > "${install_log}" 2>&1 + local retval="$?" + set -e + + if [[ "${retval}" == "0" ]] ; then return ; fi + + if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then + # this kernel-devel may have been migrated to the vault + local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')" + local vault="https://download.rockylinux.org/vault/rocky/${os_ver}" + dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \ + "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" + )" + fi + + execute_with_retries "${dnf_cmd}" + fi + mark_complete build-dependencies } -function install_spark_rapids() { - # Update SPARK RAPIDS config - local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" - local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3 +function install_gpu_driver_and_cuda() { + install_nvidia_gpu_driver + install_cuda + load_kernel_module +} - # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu - local -r scala_ver="2.12" +function prepare_gpu_install_env() { + # Whether to install NVIDIA-provided or OS-provided GPU driver + 
GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') + readonly GPU_DRIVER_PROVIDER - if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then - local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 + # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver + INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') + readonly INSTALL_GPU_AGENT + + set_cuda_repo_shortname + set_nv_urls + set_cuda_runfile_url + set_cudnn_version + set_cudnn_tarball_url + + if is_cuda11 ; then gcc_ver="11" + elif is_cuda12 ; then gcc_ver="12" ; fi +} + +function gpu_install_exit_handler() { + if is_ramdisk ; then + for shmdir in /var/cudnn-local ; do + if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then + umount -f ${shmdir} + fi + done fi + hold_nvidia_packages +} - readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) - readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) - local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' - local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' - local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' +# This configuration should be applied only if GPU is attached to the node +function configure_yarn_nodemanager() { + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.container-executor.class' \ + 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' + set_hadoop_property 'yarn-site.xml' 
'yarn.nodemanager.linux-container-executor.group' 'yarn' - local jar_basename + # Fix local dirs access permissions + local yarn_local_dirs=() - jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" - cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ - "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ - "/usr/lib/spark/jars/${jar_basename}" + readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ + --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ + --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') - jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" - cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ - "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ - "/usr/lib/spark/jars/${jar_basename}" + if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then + chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" + fi +} - jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" - cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ - "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ - "/usr/lib/spark/jars/${jar_basename}" +function yarn_exit_handler() { + # Restart YARN services if they are running already + for svc in resourcemanager nodemanager; do + if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then + systemctl stop "hadoop-yarn-${svc}.service" + systemctl start "hadoop-yarn-${svc}.service" + fi + done + # restart services stopped during preparation stage + # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' +} + + +function configure_yarn_gpu_resources() { + if [[ ! 
-d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts + if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then + printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" + fi + set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' + + set_hadoop_property 'capacity-scheduler.xml' \ + 'yarn.scheduler.capacity.resource-calculator' \ + 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' + + set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' } function configure_gpu_script() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi # Download GPU discovery script local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu' mkdir -p ${spark_gpu_script_dir} @@ -1911,6 +2047,7 @@ EOF } function configure_yarn_nodemanager_gpu() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' set_hadoop_property 'yarn-site.xml' \ 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' @@ -1920,6 +2057,7 @@ function configure_yarn_nodemanager_gpu() { } function configure_gpu_isolation() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi # enable GPU isolation sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg" if [[ $IS_MIG_ENABLED -ne 0 ]]; then @@ -1947,292 +2085,115 @@ EOF systemctl start dataproc-cgroup-device-permissions } -function nvsmi() { - local nvsmi="/usr/bin/nvidia-smi" - if [[ "${nvsmi_works}" == "1" ]] ; then echo -n '' - elif [[ ! -f "${nvsmi}" ]] ; then echo "nvidia-smi not installed" >&2 ; return 0 - elif ! 
eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0 - else nvsmi_works="1" ; fi - - if test -v 1 && [[ "$1" == "-L" ]] ; then - local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt" - if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}" - else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi +function setup_gpu_yarn() { + # This configuration should be run on all nodes + # regardless if they have attached GPUs + configure_yarn_gpu_resources + # When there is no GPU, but the installer is executing on a master node: + if [[ "${gpu_count}" == "0" ]] ; then + if [[ "${ROLE}" == "Master" ]]; then + configure_yarn_nodemanager + fi return 0 fi - "${nvsmi}" $* -} - -function clear_nvsmi_cache() { - if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then - rm "${nvsmi_query_xml}" - fi -} - -function query_nvsmi() { - if [[ "${nvsmi_works}" != "1" ]] ; then return ; fi - if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi - nvsmi -q -x --dtd > "${nvsmi_query_xml}" -} - -function install_build_dependencies() { - if test -f "${workdir}/complete/build-dependencies" ; then return ; fi - - if is_debuntu ; then - if is_ubuntu22 && is_cuda12 ; then - # On ubuntu22, the default compiler does not build some kernel module versions - # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 - execute_with_retries apt-get install -y -qq gcc-12 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 - update-alternatives --set gcc /usr/bin/gcc-12 - fi - - elif is_rocky ; then - execute_with_retries dnf -y -q install gcc - - local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" - set +e - eval "${dnf_cmd}" > "${install_log}" 2>&1 - local retval="$?" 
- set -e - - if [[ "${retval}" == "0" ]] ; then return ; fi - - if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then - # this kernel-devel may have been migrated to the vault - local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')" - local vault="https://download.rockylinux.org/vault/rocky/${os_ver}" - dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \ - "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" - )" - fi - - execute_with_retries "${dnf_cmd}" + install_nvidia_container_toolkit + configure_yarn_nodemanager_gpu + if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then + configure_gpu_script fi - touch "${workdir}/complete/build-dependencies" -} - -function prepare_gpu_env(){ - set +e - gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)" - set -e - echo "gpu_count=[${gpu_count}]" - nvsmi_works="0" - nvsmi_query_xml="${tmpdir}/nvsmi.xml" - xmllint="/opt/conda/miniconda3/bin/xmllint" - NVIDIA_SMI_PATH='/usr/bin' - MIG_MAJOR_CAPS=0 - IS_MIG_ENABLED=0 - CUDNN_PKG_NAME="" - CUDNN8_PKG_NAME="" - CUDA_LOCAL_REPO_INSTALLED="0" - - # Whether to install NVIDIA-provided or OS-provided GPU driver - GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') - readonly GPU_DRIVER_PROVIDER - - # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver - INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') - readonly INSTALL_GPU_AGENT - - # Verify SPARK compatability - RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') - readonly RAPIDS_RUNTIME - - # determine whether we have nvidia-smi installed and working - nvsmi - - set_cuda_version 
- set_driver_version - set_cuda_repo_shortname - set_nv_urls - set_cuda_runfile_url - set_cudnn_version - set_cudnn_tarball_url - - if is_cuda11 ; then gcc_ver="11" - elif is_cuda12 ; then gcc_ver="12" ; fi + configure_gpu_isolation } -# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades -# Users should run apt-mark unhold before they wish to upgrade these packages -function hold_nvidia_packages() { - if ! is_debuntu ; then return ; fi - apt-mark hold nvidia-* - apt-mark hold libnvidia-* - if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then - apt-mark hold xserver-xorg-video-nvidia* - fi +function download_spark_jar() { + local -r url=$1 + local -r jar_name=${url##*/} + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${url}" -o "${SPARK_JARS_DIR}/${jar_name}" } -function delete_mig_instances() ( - # delete all instances - set +e - nvidia-smi mig -dci - - case "${?}" in - "0" ) echo "compute instances deleted" ;; - "2" ) echo "invalid argument" ;; - "6" ) echo "No compute instances found to delete" ;; - * ) echo "unrecognized return code" ;; - esac +function install_spark_rapids() { + # Update SPARK RAPIDS config + local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" + local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3 - nvidia-smi mig -dgi - case "${?}" in - "0" ) echo "compute instances deleted" ;; - "2" ) echo "invalid argument" ;; - "6" ) echo "No GPU instances found to delete" ;; - * ) echo "unrecognized return code" ;; - esac -) + # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu + local -r scala_ver="2.12" -# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles -function configure_mig_cgi() { - delete_mig_instances - META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')" - if test -n "${META_MIG_CGI_VALUE}"; then - nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C - else - # https://pci-ids.ucw.cz/v2.2/pci.ids - local 
pci_id_list="$(grep -iH PCI_ID=10DE /sys/bus/pci/devices/*/uevent)" - if echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:23' ; then - # run the following command to list placement profiles - # nvidia-smi mig -lgipp - # - # This is the result when using H100 instances on 20241220 - # GPU 0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1 - # GPU 0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1 - # GPU 0 Profile ID 15 Placements: {0,2,4,6}:2 - # GPU 0 Profile ID 14 Placements: {0,2,4}:2 - # GPU 0 Profile ID 9 Placements: {0,4}:4 - # GPU 0 Profile ID 5 Placement : {0}:4 - # GPU 0 Profile ID 0 Placement : {0}:8 - - # For H100 3D controllers, consider profile 19, 7x1G instances - nvidia-smi mig -cgi 9,9 -C - elif echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then - # Dataproc only supports H100s right now ; split in 2 if not specified - # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances - nvidia-smi mig -cgi 9,9 -C - else - echo "unrecognized 3D controller" - fi + if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then + local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 fi - clear_nvsmi_cache -} - -function enable_mig() { - if test -f "${workdir}/complete/enable-mig" ; then return ; fi - # Start persistenced if it's not already running - if ! 
( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi - for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do - # Write an ascii zero to the numa node indicator - echo "0" | dd of="${f}" status=none - done - time nvsmi --gpu-reset # 30s - nvsmi -mig 1 - clear_nvsmi_cache - - touch "${workdir}/complete/enable-mig" -} + readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) + readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) -function enable_and_configure_mig() { - # default MIG to on when this script is used - META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1") + local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' + local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' + local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' - if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi + local jar_basename - enable_mig - query_nvsmi - local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' - mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")" + jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" + cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" - if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled on all on GPUs. Failing" ; exit 1 ; fi - if ! (echo "${mig_mode_current}" | grep Enabled) ; then echo "MIG is configured but NOT enabled. 
Failing" ; exit 1 ; fi + jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" + cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" - echo "MIG is fully enabled" - configure_mig_cgi + jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" + cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ + "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" } -function setup_gpu_yarn() { - # This configuration should be run on all nodes - # regardless if they have attached GPUs - configure_yarn_resources +function prepare_spark_env() { + SPARK_NLP_VERSION="3.2.1" # Must include subminor version here + SPARK_JARS_DIR=/usr/lib/spark/jars + SPARK_CONF_DIR='/etc/spark/conf' + SPARK_BIGQUERY_VERSION="$(get_metadata_attribute spark-bigquery-connector-version "${DEFAULT_SPARK_BIGQUERY_VERSION:-0.22.0}")" + SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" - # When there is no GPU, but the installer is executing on a master node: - if [[ "${gpu_count}" == "0" ]] ; then - if [[ "${ROLE}" == "Master" ]]; then - configure_yarn_nodemanager - fi - return 0 - fi + readonly SPARK_VERSION SPARK_BIGQUERY_VERSION SPARK_CONF_DIR SPARK_JARS_DIR SPARK_NLP_VERSION - if [[ "${nvsmi_works}" == "1" ]] ; then - # if this is called without the MIG script then the drivers are not installed - query_nvsmi - local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' - set +e - migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')" - set -e - NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" - - if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then - if [[ "${NUM_MIG_GPUS}" -eq 
"1" ]]; then - if (echo "${migquery_result}" | grep Enabled); then - IS_MIG_ENABLED=1 - NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' - MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` - fetch_mig_scripts - fi - fi - fi + if version_lt "${SPARK_VERSION}" "3.1" || \ + version_ge "${SPARK_VERSION}" "4.0" ; then + echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." + exit 1 fi - # if mig is enabled drivers would have already been installed - if [[ $IS_MIG_ENABLED -eq 0 ]]; then - install_nvidia_gpu_driver - install_cuda - load_kernel_module - - #Install GPU metrics collection in Stackdriver if needed - if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then - install_gpu_agent -# install_gpu_monitoring_agent - echo 'GPU metrics agent successfully deployed.' + # Detect dataproc image version + if (! test -v DATAPROC_IMAGE_VERSION) ; then + if test -v DATAPROC_VERSION ; then + DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" else - echo 'GPU metrics agent has not been installed.' + if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" + elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" + elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" + else echo "Unknown dataproc image version" ; exit 1 ; fi fi - configure_gpu_exclusive_mode fi - install_nvidia_container_toolkit - configure_yarn_nodemanager_gpu - configure_gpu_script - configure_gpu_isolation -} - -function gpu_exit_handler() { - if [[ "${tmpdir}" == "/mnt/shm" ]] ; then - for shmdir in /var/cudnn-local ; do - if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! 
grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then - umount -f ${shmdir} - fi - done - fi - hold_nvidia_packages } function main() { + install_gpu_driver_and_cuda + + #Install GPU metrics collection in Stackdriver if needed + if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then + install_gpu_agent +# install_gpu_monitoring_agent + echo 'GPU metrics agent successfully deployed.' + else + echo 'GPU metrics agent has not been installed.' + fi + configure_gpu_exclusive_mode + setup_gpu_yarn echo "yarn setup complete" @@ -2258,8 +2219,11 @@ function main() { } function exit_handler() { + set +e + gpu_install_exit_handler gpu_exit_handler pip_exit_handler + yarn_exit_handler common_exit_handler return 0 } @@ -2268,6 +2232,7 @@ function prepare_to_install(){ prepare_common_env prepare_pip_env prepare_gpu_env + prepare_gpu_install_env trap exit_handler EXIT } diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 1f3328eaa..412d16ddb 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -107,7 +107,6 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) self.skipTest("known to fail") metadata = None @@ -142,7 +141,6 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) self.skipTest("known to fail") if driver_provider is not None: @@ -173,7 +171,6 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, if configuration == 'KERBEROS' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('KERBEROS 
fails with image version <= 2.1') - unittest.expectedFailure(self) self.skipTest("known to fail") metadata = "install-gpu-agent=true" @@ -224,7 +221,6 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) self.skipTest("known to fail") @@ -301,7 +297,6 @@ def test_gpu_allocation(self, configuration, master_accelerator, and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) self.skipTest("known to fail") metadata = None @@ -344,7 +339,6 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) self.skipTest("known to fail") metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index a7c4d353f..c2960e00e 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -68,6 +68,7 @@ function exit_handler() { } function prepare_to_install(){ + prepare_spark_env prepare_common_env prepare_pip_env prepare_gpu_env