From 81bd2f433c9a2c143497976908435255c1e2cda2 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 9 Jan 2025 12:18:45 -0800 Subject: [PATCH] [gpu] Exercise new template-generated GPU driver installer --- cloudbuild/Dockerfile | 3 +- gpu/install_gpu_driver.sh | 993 ++++++++++++------------- gpu/test_gpu.py | 6 - templates/gpu/install_gpu_driver.sh.in | 1 + 4 files changed, 482 insertions(+), 521 deletions(-) diff --git a/cloudbuild/Dockerfile b/cloudbuild/Dockerfile index aebaffd84..644219305 100644 --- a/cloudbuild/Dockerfile +++ b/cloudbuild/Dockerfile @@ -22,7 +22,8 @@ RUN /usr/bin/curl -s https://bazel.build/bazel-release.pub.gpg | \ dd of="${bazel_repo_file}" status=none && \ apt-get update -qq RUN apt-get autoremove -y -qq > /dev/null 2>&1 && \ - apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} > /dev/null 2>&1 && \ + apt-get install -y -qq default-jdk python3-setuptools bazel-${bazel_version} \ + libtemplate-perl > /dev/null 2>&1 && \ apt-get clean # Set bazel-${bazel_version} as the default bazel alternative in this container diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 91ad4ede0..07929ca59 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -190,45 +190,6 @@ function set_hadoop_property() { --clobber } -function configure_yarn_resources() { - if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts - if [[ ! 
-f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then - printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" - fi - set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' - - set_hadoop_property 'capacity-scheduler.xml' \ - 'yarn.scheduler.capacity.resource-calculator' \ - 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' - - set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' -} - -# This configuration should be applied only if GPU is attached to the node -function configure_yarn_nodemanager() { - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' - set_hadoop_property 'yarn-site.xml' \ - 'yarn.nodemanager.container-executor.class' \ - 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' - set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn' - - # Fix local dirs access permissions - local yarn_local_dirs=() - - readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ - --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ - --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') - - if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then - chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" - fi -} - function clean_up_sources_lists() { # # bigtop (primary) @@ -364,7 +325,7 @@ function is_ramdisk() { function mount_ramdisk(){ local free_mem free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" - if [[ ${free_mem} -lt 10500000 ]]; then return 0 ; fi + if [[ ${free_mem} -lt 20500000 ]]; then return 0 ; fi # Write to a ramdisk instead of churning the persistent disk @@ -396,79 +357,6 @@ function check_os() { exit 1 fi - 
SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" - readonly SPARK_VERSION - if version_lt "${SPARK_VERSION}" "3.1" || \ - version_ge "${SPARK_VERSION}" "4.0" ; then - echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." - exit 1 - fi - - # Detect dataproc image version - if (! test -v DATAPROC_IMAGE_VERSION) ; then - if test -v DATAPROC_VERSION ; then - DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" - else - if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" - elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" - elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" - else echo "Unknown dataproc image version" ; exit 1 ; fi - fi - fi -} - -# -# Generate repo file under /etc/apt/sources.list.d/ -# -function apt_add_repo() { - local -r repo_name="$1" - local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. 
argumentN" - local -r include_src="${4:-yes}" - local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" - local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}" - - echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" - if [[ "${include_src}" == "yes" ]] ; then - echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" - fi - - apt-get update -qq -} - -# -# Generate repo file under /etc/yum.repos.d/ -# -function dnf_add_repo() { - local -r repo_name="$1" - local -r repo_url="$3" # "http(s)://host/path/filename.repo" - local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" - local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" - - curl -s -L "${repo_url}" \ - | dd of="${repo_path}" status=progress -# | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ -} - -# -# Keyrings default to -# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or -# /etc/pki/rpm-gpg/${repo_name}.gpg (rocky/RHEL) -# -function os_add_repo() { - local -r repo_name="$1" - local -r signing_key_url="$2" - local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. 
argumentN" - local kr_path - if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" - else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi - - mkdir -p "$(dirname "${kr_path}")" - - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \ - | gpg --import --no-default-keyring --keyring "${kr_path}" - - if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" - else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi } function configure_dkms_certs() { @@ -581,19 +469,35 @@ function restart_knox() { systemctl start knox } +function is_complete() { + phase="$1" + test -f "${workdir}/complete/${phase}" +} + +function mark_complete() { + phase="$1" + touch "${workdir}/complete/${phase}" +} + +function mark_incomplete() { + phase="$1" + rm -f "${workdir}/complete/${phase}" +} + function install_dependencies() { - test -f "${workdir}/complete/install-dependencies" && return 0 + is_complete install-dependencies && return 0 + pkg_list="screen" if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi - touch "${workdir}/complete/install-dependencies" + mark_complete install-dependencies } function prepare_pip_env() { # Clear pip cache # TODO: make this conditional on which OSs have pip without cache purge - test -d "${tmpdir}/python-venv" || python3 -m venv "${tmpdir}/python-venv" - source "${tmpdir}/python-venv/bin/activate" + test -d "${workdir}/python-venv" || python3 -m venv "${workdir}/python-venv" + source "${workdir}/python-venv/bin/activate" pip cache purge || echo "unable to purge pip cache" if is_ramdisk ; then @@ -603,44 +507,42 @@ function prepare_pip_env() { fi } +function prepare_conda_env() { + CONDA=/opt/conda/miniconda3/bin/conda + touch ~/.condarc + cp ~/.condarc ~/.condarc.default + if is_ramdisk ; 
then
+    # Download conda packages to tmpfs
+    mkdir -p "${tmpdir}/conda_cache"
+    ${CONDA} config --add pkgs_dirs "${tmpdir}/conda_cache"
+  fi
+}
 
 function prepare_common_env() {
-  define_os_comparison_functions
-
   # Verify OS compatability and Secure boot state
   check_os
   check_secure_boot
 
-  readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
-
-  # Dataproc configurations
-  readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
-  readonly HIVE_CONF_DIR='/etc/hive/conf'
-  readonly SPARK_CONF_DIR='/etc/spark/conf'
-
+  # read-only configuration variables
+  _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"
+  HADOOP_CONF_DIR='/etc/hadoop/conf'
+  HIVE_CONF_DIR='/etc/hive/conf'
   OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
-  readonly OS_NAME
-
-  # node role
   ROLE="$(get_metadata_attribute dataproc-role)"
-  readonly ROLE
-
-  # master node
   MASTER="$(get_metadata_attribute dataproc-master)"
-  readonly MASTER
-
   workdir=/opt/install-dpgce
-  tmpdir=/tmp/
   temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
-  readonly temp_bucket
-  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
+  pkg_bucket="gs://${temp_bucket}/dpgce-packages"
   uname_r=$(uname -r)
-  readonly uname_r
-  readonly bdcfg="/usr/local/bin/bdconfig"
-  export DEBIAN_FRONTEND=noninteractive
+  bdcfg="/usr/local/bin/bdconfig"
+  KNOX_HOME=/usr/lib/knox
 
-  # Knox config
-  readonly KNOX_HOME=/usr/lib/knox
+  readonly HADOOP_CONF_DIR HIVE_CONF_DIR OS_NAME ROLE MASTER workdir
+  readonly temp_bucket pkg_bucket uname_r bdcfg KNOX_HOME
+
+  tmpdir=/tmp/
+
+  export DEBIAN_FRONTEND=noninteractive
 
   mkdir -p "${workdir}/complete"
   set_proxy
@@ -648,7 +550,7 @@
 
   readonly install_log="${tmpdir}/install.log"
 
-  if test -f "${workdir}/complete/prepare.common" ; then return ; fi
+  is_complete prepare.common && return
 
   repair_old_backports
 
@@ -666,23 +568,24 @@
     dnf clean all
   fi
 
-  # zero free disk space
-  if [[ -n "$(get_metadata_attribute 
creating-image)" ]]; then + # When creating a disk image: + if [[ -n "$(get_metadata_attribute creating-image "")" ]]; then + df / > "/run/disk-usage.log" - ( set +e + # zero free disk space + ( set +e time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero ) install_dependencies # Monitor disk usage in a screen session - df / > "/run/disk-usage.log" touch "/run/keep-running-df" screen -d -m -LUS keep-running-df \ bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" fi - touch "${workdir}/complete/prepare.common" + mark_complete prepare.common } function pip_exit_handler() { @@ -692,29 +595,22 @@ function pip_exit_handler() { fi } +function conda_exit_handler() { + mv ~/.condarc.default ~/.condarc +} + function common_exit_handler() { set +ex echo "Exit handler invoked" - # Restart YARN services if they are running already - for svc in resourcemanager nodemanager; do - if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then - systemctl stop "hadoop-yarn-${svc}.service" - systemctl start "hadoop-yarn-${svc}.service" - fi - done - # If system memory was sufficient to mount memory-backed filesystems - if [[ "${tmpdir}" == "/mnt/shm" ]] ; then + if is_ramdisk ; then # Clean up shared memory mounts for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp ; do if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then umount -f ${shmdir} fi done - - # restart services stopped during preparation stage - # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? 
((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' fi if is_debuntu ; then @@ -787,6 +683,63 @@ print( " samples-taken: ", scalar @siz, $/, echo "exit_handler has completed" } +define_os_comparison_functions + + +# +# Generate repo file under /etc/apt/sources.list.d/ +# +function apt_add_repo() { + local -r repo_name="$1" + local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN" + local -r include_src="${4:-yes}" + local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" + local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}" + + echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" + if [[ "${include_src}" == "yes" ]] ; then + echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" + fi + + apt-get update -qq +} + +# +# Generate repo file under /etc/yum.repos.d/ +# +function dnf_add_repo() { + local -r repo_name="$1" + local -r repo_url="$3" # "http(s)://host/path/filename.repo" + local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" + local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" + + curl -s -L "${repo_url}" \ + | dd of="${repo_path}" status=progress +# | perl -p -e "s{^gpgkey=.*$}{gpgkey=file://${kr_path}}" \ +} + +# +# Keyrings default to +# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or +# /etc/pki/rpm-gpg/${repo_name}.gpg (rocky/RHEL) +# +function os_add_repo() { + local -r repo_name="$1" + local -r signing_key_url="$2" + local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. 
argumentN" + local kr_path + if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}" + else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi + + mkdir -p "$(dirname "${kr_path}")" + + curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${signing_key_url}" \ + | gpg --import --no-default-keyring --keyring "${kr_path}" + + if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" + else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi +} + function set_support_matrix() { # CUDA version and Driver version @@ -839,8 +792,6 @@ function set_support_matrix() { ) } -set_support_matrix - function set_cuda_version() { case "${DATAPROC_IMAGE_VERSION}" in "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) @@ -930,50 +881,133 @@ function set_driver_version() { fi } -function set_cudnn_version() { - readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" - readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" - - # Parameters for NVIDIA-provided cuDNN library - readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} - CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") - # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} - if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then - CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}" - elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then - # cuDNN v8 is not distribution for ubuntu20+, debian12 - CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" - elif (le_ubuntu18 || le_debian11) && [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; then - # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 - CUDNN_VERSION="8.8.0.121" - fi - readonly CUDNN_VERSION -} +function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == 
"NVIDIA" ]] ; ) +function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) +function nvsmi() { + local nvsmi="/usr/bin/nvidia-smi" + if [[ "${nvsmi_works}" == "1" ]] ; then echo -n '' + elif [[ ! -f "${nvsmi}" ]] ; then echo "nvidia-smi not installed" >&2 ; return 0 + elif ! eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0 + else nvsmi_works="1" ; fi -function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) -function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) + if test -v 1 && [[ "$1" == "-L" ]] ; then + local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt" + if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}" + else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi -function set_cuda_repo_shortname() { -# Short name for urls -# https://developer.download.nvidia.com/compute/cuda/repos/${shortname} - if is_rocky ; then - shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" - else - shortname="$(os_id)$(os_vercat)" + return 0 fi -} - -function set_nv_urls() { - # Parameters for NVIDIA-provided package repositories - readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' - readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" - # Parameter for NVIDIA-provided Rocky Linux GPU driver - readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" + "${nvsmi}" $* } -function set_cuda_runfile_url() { +function clear_nvsmi_cache() { + if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then + rm "${nvsmi_query_xml}" + fi +} + +function query_nvsmi() { + if [[ "${nvsmi_works}" != "1" ]] ; then return ; fi + if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi + nvsmi -q -x --dtd > "${nvsmi_query_xml}" +} + +function prepare_gpu_env(){ + set_support_matrix + + set_cuda_version + set_driver_version + + set +e + gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | 
wc -l)"
+  set -e
+  echo "gpu_count=[${gpu_count}]"
+  nvsmi_works="0"
+  nvsmi_query_xml="${tmpdir}/nvsmi.xml"
+  xmllint="/opt/conda/miniconda3/bin/xmllint"
+  NVIDIA_SMI_PATH='/usr/bin'
+  MIG_MAJOR_CAPS=0
+  IS_MIG_ENABLED=0
+  CUDNN_PKG_NAME=""
+  CUDNN8_PKG_NAME=""
+  CUDA_LOCAL_REPO_INSTALLED="0"
+
+  if ! test -v DEFAULT_RAPIDS_RUNTIME ; then
+    readonly DEFAULT_RAPIDS_RUNTIME='SPARK'
+  fi
+
+  # Verify SPARK compatibility
+  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' "${DEFAULT_RAPIDS_RUNTIME}")
+  readonly RAPIDS_RUNTIME
+
+  # determine whether we have nvidia-smi installed and working
+  nvsmi
+}
+
+# Hold all NVIDIA-related packages from upgrading unintentionally or services like unattended-upgrades
+# Users should run apt-mark unhold before they wish to upgrade these packages
+function hold_nvidia_packages() {
+  if ! is_debuntu ; then return ; fi
+
+  apt-mark hold nvidia-*
+  apt-mark hold libnvidia-*
+  if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
+    apt-mark hold xserver-xorg-video-nvidia*
+  fi
+}
+
+function gpu_exit_handler() {
+  echo "no operations in gpu exit handler"
+}
+
+
+function set_cudnn_version() {
+  readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
+  readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"
+
+  # Parameters for NVIDIA-provided cuDNN library
+  DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
+  readonly DEFAULT_CUDNN_VERSION
+  CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
+  # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
+  if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
+    CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
+  elif (ge_ubuntu20 || ge_debian12) && [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; then
+    # cuDNN v8 is not distributed for ubuntu20+, debian12
+    CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
+  elif (le_ubuntu18 || le_debian11) && [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; then
+    # cuDNN v9 is not distributed for ubuntu18, debian10, 
debian11 ; fall back to 8 + CUDNN_VERSION="8.8.0.121" + fi + readonly CUDNN_VERSION +} + + +function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) +function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) + +function set_cuda_repo_shortname() { +# Short name for urls +# https://developer.download.nvidia.com/compute/cuda/repos/${shortname} + if is_rocky ; then + shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" + else + shortname="$(os_id)$(os_vercat)" + fi +} + +function set_nv_urls() { + # Parameters for NVIDIA-provided package repositories + readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' + readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" + + # Parameter for NVIDIA-provided Rocky Linux GPU driver + readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" +} + +function set_cuda_runfile_url() { local MAX_DRIVER_VERSION local MAX_CUDA_VERSION @@ -1095,7 +1129,7 @@ function uninstall_cuda_keyring_pkg() { } function install_local_cuda_repo() { - if test -f "${workdir}/complete/install-local-cuda-repo" ; then return ; fi + is_complete install-local-cuda-repo && return if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi CUDA_LOCAL_REPO_INSTALLED="1" @@ -1118,7 +1152,7 @@ function install_local_cuda_repo() { -o /etc/apt/preferences.d/cuda-repository-pin-600 fi - touch "${workdir}/complete/install-local-cuda-repo" + mark_complete install-local-cuda-repo } function uninstall_local_cuda_repo(){ apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" @@ -1126,7 +1160,8 @@ function uninstall_local_cuda_repo(){ } function install_local_cudnn_repo() { - if test -f "${workdir}/complete/install-local-cudnn-repo" ; then return ; fi + is_complete install-local-cudnn-repo && return + pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" CUDNN_PKG_NAME="${pkgname}" local_deb_fn="${pkgname}_1.0-1_amd64.deb" @@ -1142,7 +1177,7 @@ function 
install_local_cudnn_repo() { cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings - touch "${workdir}/complete/install-local-cudnn-repo" + mark_complete install-local-cudnn-repo } function uninstall_local_cudnn_repo() { @@ -1151,7 +1186,7 @@ function uninstall_local_cudnn_repo() { } function install_local_cudnn8_repo() { - if test -f "${workdir}/complete/install-local-cudnn8-repo" ; then return ; fi + is_complete install-local-cudnn8-repo && return if is_ubuntu ; then cudnn8_shortname="ubuntu2004" elif is_debian ; then cudnn8_shortname="debian11" @@ -1185,19 +1220,19 @@ function install_local_cudnn8_repo() { rm -f "${local_deb_fn}" cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings - touch "${workdir}/complete/install-local-cudnn8-repo" + mark_complete install-local-cudnn8-repo } function uninstall_local_cudnn8_repo() { apt-get purge -yq "${CUDNN8_PKG_NAME}" - rm -f "${workdir}/complete/install-local-cudnn8-repo" + mark_incomplete install-local-cudnn8-repo } function install_nvidia_nccl() { readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) - if test -f "${workdir}/complete/nccl" ; then return ; fi + is_complete nccl && return if is_cuda11 && is_debian12 ; then echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}" @@ -1288,14 +1323,12 @@ function install_nvidia_nccl() { fi popd - touch "${workdir}/complete/nccl" + mark_complete nccl } -function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; ) -function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) - function install_nvidia_cudnn() { - if test -f "${workdir}/complete/cudnn" ; then return ; fi + is_complete cudnn && return + local major_version major_version="${CUDNN_VERSION%%.*}" local cudnn_pkg_version @@ -1354,7 +1387,7 @@ function install_nvidia_cudnn() { ldconfig echo "NVIDIA cuDNN successfully installed for 
${_shortname}." - touch "${workdir}/complete/cudnn" + mark_complete cudnn } function add_nonfree_components() { @@ -1511,7 +1544,8 @@ function install_nvidia_userspace_runfile() { # # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it. - if test -f "${workdir}/complete/userspace" ; then return ; fi + is_complete userspace && return + local local_fn="${tmpdir}/userspace.run" cache_fetched_package "${USERSPACE_URL}" \ @@ -1569,7 +1603,7 @@ function install_nvidia_userspace_runfile() { depmod -a else clear_dkms_key - tar czvf "${local_tarball}" \ + tar czf "${local_tarball}" \ /var/log/nvidia-installer.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') gcloud storage cp "${local_tarball}" "${gcs_tarball}" @@ -1577,12 +1611,13 @@ function install_nvidia_userspace_runfile() { fi rm -f "${local_fn}" - touch "${workdir}/complete/userspace" + mark_complete userspace sync } function install_cuda_runfile() { - if test -f "${workdir}/complete/cuda" ; then return ; fi + is_complete cuda && return + local local_fn="${tmpdir}/cuda.run" cache_fetched_package "${NVIDIA_CUDA_URL}" \ @@ -1591,7 +1626,7 @@ function install_cuda_runfile() { execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" rm -f "${local_fn}" - touch "${workdir}/complete/cuda" + mark_complete cuda sync } @@ -1615,9 +1650,12 @@ function install_cuda_toolkit() { } function load_kernel_module() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi # for some use cases, the kernel module needs to be removed before first use of nvidia-smi for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do - rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" + ( set +e + rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" + ) done depmod -a @@ -1629,7 +1667,8 @@ 
function load_kernel_module() {
 }
 
 function install_cuda(){
-  if test -f "${workdir}/complete/cuda-repo" ; then return ; fi
+  is_complete cuda-repo && return
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
 
   if ( ge_debian12 && is_src_os ) ; then
     echo "installed with the driver on ${_shortname}"
@@ -1642,10 +1681,12 @@ function install_cuda(){
   # Includes CUDA packages
   add_repo_cuda
 
-  touch "${workdir}/complete/cuda-repo"
+  mark_complete cuda-repo
 }
 
 function install_nvidia_container_toolkit() {
+  is_complete install-nvtk && return
+
   local container_runtime_default
   if command -v docker ; then container_runtime_default='docker'
   elif command -v containerd ; then container_runtime_default='containerd'
@@ -1661,11 +1702,14 @@ function install_nvidia_container_toolkit() {
     execute_with_retries dnf install -y -q nvidia-container-toolkit ; fi
   nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
   systemctl restart "${CONTAINER_RUNTIME}"
+
+  mark_complete install-nvtk
 }
 
 # Install NVIDIA GPU driver provided by NVIDIA
 function install_nvidia_gpu_driver() {
-  if test -f "${workdir}/complete/gpu-driver" ; then return ; fi
+  is_complete gpu-driver && return
+  if [[ "${gpu_count}" == "0" ]] ; then return ; fi
 
   if ( ge_debian12 && is_src_os ) ; then
     add_nonfree_components
@@ -1687,11 +1731,11 @@ function install_nvidia_gpu_driver() {
     build_driver_from_github
 
   echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
-  touch "${workdir}/complete/gpu-driver"
+  mark_complete gpu-driver
 }
 
 function install_ops_agent(){
-  if test -f "${workdir}/complete/ops-agent" ; then return ; fi
+  is_complete ops-agent && return
 
   mkdir -p /opt/google
   cd /opt/google
@@ -1699,11 +1743,12 @@ function install_ops_agent(){
   curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
   execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install
 
-  touch "${workdir}/complete/ops-agent"
+  mark_complete ops-agent
 }
 
 # Collects 'gpu_utilization' and 
'gpu_memory_utilization' metrics function install_gpu_monitoring_agent() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi download_gpu_monitoring_agent install_gpu_monitoring_agent_dependency start_gpu_monitoring_agent_service @@ -1724,9 +1769,12 @@ function download_gpu_monitoring_agent(){ function install_gpu_monitoring_agent_dependency(){ cd /opt/google/compute-gpu-monitoring/linux - python3 -m venv venv - venv/bin/pip install wheel - venv/bin/pip install -Ur requirements.txt + /opt/conda/miniconda3/bin/python3 -m venv venv + ( + source venv/bin/activate + pip install wheel + pip install -Ur requirements.txt + ) } function start_gpu_monitoring_agent_service(){ @@ -1752,7 +1800,7 @@ function install_gpu_agent() { | sed -e 's/-u --format=/--format=/' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" local venv="${install_dir}/venv" - python3 -m venv "${venv}" + /opt/conda/miniconda3/bin/python3 -m venv "${venv}" ( source "${venv}/bin/activate" python3 -m pip install --upgrade pip @@ -1784,6 +1832,7 @@ EOF } function configure_gpu_exclusive_mode() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi # only run this function when spark < 3.0 if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi # include exclusive mode on GPU @@ -1791,52 +1840,139 @@ function configure_gpu_exclusive_mode() { clear_nvsmi_cache } -function fetch_mig_scripts() { - mkdir -p /usr/local/yarn-mig-scripts - chmod 755 /usr/local/yarn-mig-scripts - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi - wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh - chmod 755 /usr/local/yarn-mig-scripts/* +function install_build_dependencies() { + is_complete build-dependencies && return + + if is_debuntu ; then + if is_ubuntu22 && is_cuda12 ; then + # 
On ubuntu22, the default compiler does not build some kernel module versions + # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 + execute_with_retries apt-get install -y -qq gcc-12 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 + update-alternatives --set gcc /usr/bin/gcc-12 + fi + + elif is_rocky ; then + execute_with_retries dnf -y -q install gcc + + local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" + set +e + eval "${dnf_cmd}" > "${install_log}" 2>&1 + local retval="$?" + set -e + + if [[ "${retval}" == "0" ]] ; then return ; fi + + if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then + # this kernel-devel may have been migrated to the vault + local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')" + local vault="https://download.rockylinux.org/vault/rocky/${os_ver}" + dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \ + "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \ + "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" + )" + fi + + execute_with_retries "${dnf_cmd}" + fi + mark_complete build-dependencies } -function install_spark_rapids() { - # Update SPARK RAPIDS config - local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" - local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3 +function install_gpu_driver_and_cuda() { + install_nvidia_gpu_driver + install_cuda + load_kernel_module +} - # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu - local -r scala_ver="2.12" +function prepare_gpu_install_env() { + # Whether to install NVIDIA-provided or OS-provided GPU driver + 
GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') + readonly GPU_DRIVER_PROVIDER - if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then - local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 + # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver + INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') + readonly INSTALL_GPU_AGENT + + set_cuda_repo_shortname + set_nv_urls + set_cuda_runfile_url + set_cudnn_version + set_cudnn_tarball_url + + if is_cuda11 ; then gcc_ver="11" + elif is_cuda12 ; then gcc_ver="12" ; fi +} + +function gpu_install_exit_handler() { + if is_ramdisk ; then + for shmdir in /var/cudnn-local ; do + if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then + umount -f ${shmdir} + fi + done fi + hold_nvidia_packages +} - readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) - readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) - local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' - local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' - local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' +# This configuration should be applied only if GPU is attached to the node +function configure_yarn_nodemanager() { + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn' + set_hadoop_property 'yarn-site.xml' \ + 'yarn.nodemanager.container-executor.class' \ + 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor' + set_hadoop_property 'yarn-site.xml' 
'yarn.nodemanager.linux-container-executor.group' 'yarn' - local jar_basename + # Fix local dirs access permissions + local yarn_local_dirs=() - jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" - cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ - "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ - "/usr/lib/spark/jars/${jar_basename}" + readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ + --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ + --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') - jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" - cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ - "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ - "/usr/lib/spark/jars/${jar_basename}" + if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then + chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" + fi +} - jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" - cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ - "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ - "/usr/lib/spark/jars/${jar_basename}" +function yarn_exit_handler() { + # Restart YARN services if they are running already + for svc in resourcemanager nodemanager; do + if [[ "$(systemctl show hadoop-yarn-${svc}.service -p SubState --value)" == 'running' ]]; then + systemctl stop "hadoop-yarn-${svc}.service" + systemctl start "hadoop-yarn-${svc}.service" + fi + done + # restart services stopped during preparation stage + # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' +} + + +function configure_yarn_gpu_resources() { + if [[ ! 
-d "${HADOOP_CONF_DIR}" ]] ; then return 0 ; fi # pre-init scripts + if [[ ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then + printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" + fi + set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' + + set_hadoop_property 'capacity-scheduler.xml' \ + 'yarn.scheduler.capacity.resource-calculator' \ + 'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator' + + set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu' } function configure_gpu_script() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi # Download GPU discovery script local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu' mkdir -p ${spark_gpu_script_dir} @@ -1911,6 +2047,7 @@ EOF } function configure_yarn_nodemanager_gpu() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu' set_hadoop_property 'yarn-site.xml' \ 'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto' @@ -1920,6 +2057,7 @@ function configure_yarn_nodemanager_gpu() { } function configure_gpu_isolation() { + if [[ "${gpu_count}" == "0" ]] ; then return ; fi # enable GPU isolation sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg" if [[ $IS_MIG_ENABLED -ne 0 ]]; then @@ -1947,292 +2085,115 @@ EOF systemctl start dataproc-cgroup-device-permissions } -function nvsmi() { - local nvsmi="/usr/bin/nvidia-smi" - if [[ "${nvsmi_works}" == "1" ]] ; then echo -n '' - elif [[ ! -f "${nvsmi}" ]] ; then echo "nvidia-smi not installed" >&2 ; return 0 - elif ! 
eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0 - else nvsmi_works="1" ; fi - - if test -v 1 && [[ "$1" == "-L" ]] ; then - local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt" - if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}" - else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi +function setup_gpu_yarn() { + # This configuration should be run on all nodes + # regardless if they have attached GPUs + configure_yarn_gpu_resources + # When there is no GPU, but the installer is executing on a master node: + if [[ "${gpu_count}" == "0" ]] ; then + if [[ "${ROLE}" == "Master" ]]; then + configure_yarn_nodemanager + fi return 0 fi - "${nvsmi}" $* -} - -function clear_nvsmi_cache() { - if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then - rm "${nvsmi_query_xml}" - fi -} - -function query_nvsmi() { - if [[ "${nvsmi_works}" != "1" ]] ; then return ; fi - if ( test -v nvsmi_query_xml && test -f "${nvsmi_query_xml}" ) ; then return ; fi - nvsmi -q -x --dtd > "${nvsmi_query_xml}" -} - -function install_build_dependencies() { - if test -f "${workdir}/complete/build-dependencies" ; then return ; fi - - if is_debuntu ; then - if is_ubuntu22 && is_cuda12 ; then - # On ubuntu22, the default compiler does not build some kernel module versions - # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 - execute_with_retries apt-get install -y -qq gcc-12 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 - update-alternatives --set gcc /usr/bin/gcc-12 - fi - - elif is_rocky ; then - execute_with_retries dnf -y -q install gcc - - local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" - set +e - eval "${dnf_cmd}" > "${install_log}" 2>&1 - local retval="$?" 
- set -e - - if [[ "${retval}" == "0" ]] ; then return ; fi - - if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then - # this kernel-devel may have been migrated to the vault - local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')" - local vault="https://download.rockylinux.org/vault/rocky/${os_ver}" - dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \ - "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \ - "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm" - )" - fi - - execute_with_retries "${dnf_cmd}" + install_nvidia_container_toolkit + configure_yarn_nodemanager_gpu + if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then + configure_gpu_script fi - touch "${workdir}/complete/build-dependencies" -} - -function prepare_gpu_env(){ - set +e - gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)" - set -e - echo "gpu_count=[${gpu_count}]" - nvsmi_works="0" - nvsmi_query_xml="${tmpdir}/nvsmi.xml" - xmllint="/opt/conda/miniconda3/bin/xmllint" - NVIDIA_SMI_PATH='/usr/bin' - MIG_MAJOR_CAPS=0 - IS_MIG_ENABLED=0 - CUDNN_PKG_NAME="" - CUDNN8_PKG_NAME="" - CUDA_LOCAL_REPO_INSTALLED="0" - - # Whether to install NVIDIA-provided or OS-provided GPU driver - GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') - readonly GPU_DRIVER_PROVIDER - - # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver - INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') - readonly INSTALL_GPU_AGENT - - # Verify SPARK compatability - RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') - readonly RAPIDS_RUNTIME - - # determine whether we have nvidia-smi installed and working - nvsmi - - set_cuda_version 
- set_driver_version - set_cuda_repo_shortname - set_nv_urls - set_cuda_runfile_url - set_cudnn_version - set_cudnn_tarball_url - - if is_cuda11 ; then gcc_ver="11" - elif is_cuda12 ; then gcc_ver="12" ; fi + configure_gpu_isolation } -# Hold all NVIDIA-related packages from upgrading unintenionally or services like unattended-upgrades -# Users should run apt-mark unhold before they wish to upgrade these packages -function hold_nvidia_packages() { - if ! is_debuntu ; then return ; fi - apt-mark hold nvidia-* - apt-mark hold libnvidia-* - if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then - apt-mark hold xserver-xorg-video-nvidia* - fi +function download_spark_jar() { + local -r url=$1 + local -r jar_name=${url##*/} + curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ + "${url}" -o "${SPARK_JARS_DIR}/${jar_name}" } -function delete_mig_instances() ( - # delete all instances - set +e - nvidia-smi mig -dci - - case "${?}" in - "0" ) echo "compute instances deleted" ;; - "2" ) echo "invalid argument" ;; - "6" ) echo "No compute instances found to delete" ;; - * ) echo "unrecognized return code" ;; - esac +function install_spark_rapids() { + # Update SPARK RAPIDS config + local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1" + local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3 - nvidia-smi mig -dgi - case "${?}" in - "0" ) echo "compute instances deleted" ;; - "2" ) echo "invalid argument" ;; - "6" ) echo "No GPU instances found to delete" ;; - * ) echo "unrecognized return code" ;; - esac -) + # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu + local -r scala_ver="2.12" -# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html#configuring-mig-profiles -function configure_mig_cgi() { - delete_mig_instances - META_MIG_CGI_VALUE="$(get_metadata_attribute 'MIG_CGI')" - if test -n "${META_MIG_CGI_VALUE}"; then - nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C - else - # https://pci-ids.ucw.cz/v2.2/pci.ids - local 
pci_id_list="$(grep -iH PCI_ID=10DE /sys/bus/pci/devices/*/uevent)" - if echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:23' ; then - # run the following command to list placement profiles - # nvidia-smi mig -lgipp - # - # This is the result when using H100 instances on 20241220 - # GPU 0 Profile ID 19 Placements: {0,1,2,3,4,5,6}:1 - # GPU 0 Profile ID 20 Placements: {0,1,2,3,4,5,6}:1 - # GPU 0 Profile ID 15 Placements: {0,2,4,6}:2 - # GPU 0 Profile ID 14 Placements: {0,2,4}:2 - # GPU 0 Profile ID 9 Placements: {0,4}:4 - # GPU 0 Profile ID 5 Placement : {0}:4 - # GPU 0 Profile ID 0 Placement : {0}:8 - - # For H100 3D controllers, consider profile 19, 7x1G instances - nvidia-smi mig -cgi 9,9 -C - elif echo "${pci_id_list}" | grep -q -i 'PCI_ID=10DE:20' ; then - # Dataproc only supports H100s right now ; split in 2 if not specified - # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#creating-gpu-instances - nvidia-smi mig -cgi 9,9 -C - else - echo "unrecognized 3D controller" - fi + if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then + local DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3 fi - clear_nvsmi_cache -} - -function enable_mig() { - if test -f "${workdir}/complete/enable-mig" ; then return ; fi - # Start persistenced if it's not already running - if ! 
( ps auwx | grep -i nvidia\\-persistenced ) ; then ( nvidia-persistenced & ) ; fi - for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do - # Write an ascii zero to the numa node indicator - echo "0" | dd of="${f}" status=none - done - time nvsmi --gpu-reset # 30s - nvsmi -mig 1 - clear_nvsmi_cache - - touch "${workdir}/complete/enable-mig" -} + readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION}) + readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION}) -function enable_and_configure_mig() { - # default MIG to on when this script is used - META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1") + local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids' + local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia' + local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc' - if [[ ${META_MIG_VALUE} -eq 0 ]]; then echo "Not enabling MIG" ; return ; fi + local jar_basename - enable_mig - query_nvsmi - local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' - mig_mode_current="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}")" + jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" + cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" - if [[ "$(echo "${mig_mode_current}" | uniq | wc -l)" -ne "1" ]] ; then echo "MIG is NOT enabled on all on GPUs. Failing" ; exit 1 ; fi - if ! (echo "${mig_mode_current}" | grep Enabled) ; then echo "MIG is configured but NOT enabled. 
Failing" ; exit 1 ; fi + jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar" + cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" - echo "MIG is fully enabled" - configure_mig_cgi + jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar" + cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ + "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \ + "/usr/lib/spark/jars/${jar_basename}" } -function setup_gpu_yarn() { - # This configuration should be run on all nodes - # regardless if they have attached GPUs - configure_yarn_resources +function prepare_spark_env() { + SPARK_NLP_VERSION="3.2.1" # Must include subminor version here + SPARK_JARS_DIR=/usr/lib/spark/jars + SPARK_CONF_DIR='/etc/spark/conf' + SPARK_BIGQUERY_VERSION="$(get_metadata_attribute spark-bigquery-connector-version "${DEFAULT_SPARK_BIGQUERY_VERSION:-0.22.0}")" + SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" - # When there is no GPU, but the installer is executing on a master node: - if [[ "${gpu_count}" == "0" ]] ; then - if [[ "${ROLE}" == "Master" ]]; then - configure_yarn_nodemanager - fi - return 0 - fi + readonly SPARK_VERSION SPARK_BIGQUERY_VERSION SPARK_CONF_DIR SPARK_JARS_DIR SPARK_NLP_VERSION - if [[ "${nvsmi_works}" == "1" ]] ; then - # if this is called without the MIG script then the drivers are not installed - query_nvsmi - local xpath='//nvidia_smi_log/*/mig_mode/current_mig/text()' - set +e - migquery_result="$("${xmllint}" --xpath "${xpath}" "${nvsmi_query_xml}" | grep -v 'N/A')" - set -e - NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" - - if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then - if [[ "${NUM_MIG_GPUS}" -eq 
"1" ]]; then - if (echo "${migquery_result}" | grep Enabled); then - IS_MIG_ENABLED=1 - NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' - MIG_MAJOR_CAPS=`grep nvidia-caps /proc/devices | cut -d ' ' -f 1` - fetch_mig_scripts - fi - fi - fi + if version_lt "${SPARK_VERSION}" "3.1" || \ + version_ge "${SPARK_VERSION}" "4.0" ; then + echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions." + exit 1 fi - # if mig is enabled drivers would have already been installed - if [[ $IS_MIG_ENABLED -eq 0 ]]; then - install_nvidia_gpu_driver - install_cuda - load_kernel_module - - #Install GPU metrics collection in Stackdriver if needed - if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then - install_gpu_agent -# install_gpu_monitoring_agent - echo 'GPU metrics agent successfully deployed.' + # Detect dataproc image version + if (! test -v DATAPROC_IMAGE_VERSION) ; then + if test -v DATAPROC_VERSION ; then + DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" else - echo 'GPU metrics agent has not been installed.' + if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" + elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" + elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" + else echo "Unknown dataproc image version" ; exit 1 ; fi fi - configure_gpu_exclusive_mode fi - install_nvidia_container_toolkit - configure_yarn_nodemanager_gpu - configure_gpu_script - configure_gpu_isolation -} - -function gpu_exit_handler() { - if [[ "${tmpdir}" == "/mnt/shm" ]] ; then - for shmdir in /var/cudnn-local ; do - if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! 
grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then - umount -f ${shmdir} - fi - done - fi - hold_nvidia_packages } function main() { + install_gpu_driver_and_cuda + + #Install GPU metrics collection in Stackdriver if needed + if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then + install_gpu_agent +# install_gpu_monitoring_agent + echo 'GPU metrics agent successfully deployed.' + else + echo 'GPU metrics agent has not been installed.' + fi + configure_gpu_exclusive_mode + setup_gpu_yarn echo "yarn setup complete" @@ -2258,8 +2219,11 @@ function main() { } function exit_handler() { + set +e + gpu_install_exit_handler gpu_exit_handler pip_exit_handler + yarn_exit_handler common_exit_handler return 0 } @@ -2268,6 +2232,7 @@ function prepare_to_install(){ prepare_common_env prepare_pip_env prepare_gpu_env + prepare_gpu_install_env trap exit_handler EXIT } diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index 1f3328eaa..412d16ddb 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -107,7 +107,6 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) self.skipTest("known to fail") metadata = None @@ -142,7 +141,6 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) self.skipTest("known to fail") if driver_provider is not None: @@ -173,7 +171,6 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, if configuration == 'KERBEROS' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('KERBEROS 
fails with image version <= 2.1') - unittest.expectedFailure(self) self.skipTest("known to fail") metadata = "install-gpu-agent=true" @@ -224,7 +221,6 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) self.skipTest("known to fail") @@ -301,7 +297,6 @@ def test_gpu_allocation(self, configuration, master_accelerator, and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) self.skipTest("known to fail") metadata = None @@ -344,7 +339,6 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf and self.getImageOs() == 'rocky' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') - unittest.expectedFailure(self) self.skipTest("known to fail") metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in index a7c4d353f..c2960e00e 100644 --- a/templates/gpu/install_gpu_driver.sh.in +++ b/templates/gpu/install_gpu_driver.sh.in @@ -68,6 +68,7 @@ function exit_handler() { } function prepare_to_install(){ + prepare_spark_env prepare_common_env prepare_pip_env prepare_gpu_env