From 0deb438a35132f8ad5ecf60ca032e7056508d120 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 11 Oct 2024 07:42:24 -0700 Subject: [PATCH] [gpu] updates as result of rocky tests and running script with custom image generator (#1242) * [gpu] clean-up of sources.list and keyring file assertion * allow main to access dkms certs ; remove full upgrade ; tested sources.list cleanup function * return cert cleanup to its normal location * merged from custom-images/examples/secure-boot/install_gpu_driver.sh * only unhold systemd on debian12 where the build breaks otherwise --- gpu/install_gpu_driver.sh | 248 +++++++++++++++++++------------------- 1 file changed, 124 insertions(+), 124 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index af99effb9..10b1aa061 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -30,9 +30,10 @@ function is_debian() { [[ "$(os_id)" == 'debian' ]] ; } function is_debian10() { is_debian && [[ "$(os_version)" == '10'* ]] ; } function is_debian11() { is_debian && [[ "$(os_version)" == '11'* ]] ; } function is_debian12() { is_debian && [[ "$(os_version)" == '12'* ]] ; } -function os_vercat() { if is_ubuntu ; then os_version | sed -e 's/[^0-9]//g' - elif is_rocky ; then os_version | sed -e 's/[^0-9].*$//g' - else os_version ; fi ; } +function os_vercat() { set +x + if is_ubuntu ; then os_version | sed -e 's/[^0-9]//g' + elif is_rocky ; then os_version | sed -e 's/[^0-9].*$//g' + else os_version ; fi ; set -x ; } function remove_old_backports { if is_debian12 ; then return ; fi @@ -55,18 +56,55 @@ function remove_old_backports { done } -function compare_versions_lte { - [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] -} +function compare_versions_lte { [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; } function compare_versions_lt() { [ "$1" = "$2" ] && return 1 || compare_versions_lte $1 $2 } +function print_metadata_value() { + local readonly tmpfile=$(mktemp) + http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \ + -s -o ${tmpfile} 2>/dev/null) + local readonly return_code=$? + # If the command completed successfully, print the metadata value to stdout. + if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then + cat ${tmpfile} + fi + rm -f ${tmpfile} + return ${return_code} +} + +function print_metadata_value_if_exists() { + local return_code=1 + local readonly url=$1 + print_metadata_value ${url} + return_code=$? + return ${return_code} +} + +function get_metadata_value() { + set +x + local readonly varname=$1 + local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1 + # Print the instance metadata value. + print_metadata_value_if_exists ${MDS_PREFIX}/instance/${varname} + return_code=$? + # If the instance doesn't have the value, try the project. + if [[ ${return_code} != 0 ]]; then + print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname} + return_code=$? + fi + set -x + return ${return_code} +} + function get_metadata_attribute() { - local -r attribute_name=$1 + set +x + local -r attribute_name="$1" local -r default_value="${2:-}" - /usr/share/google/get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" + get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" + set -x } OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]') @@ -98,7 +136,8 @@ readonly -A CUDA_SUBVER=( RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK') readonly DEFAULT_CUDA_VERSION='12.4' -readonly CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") +CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") +readonly CUDA_VERSION readonly CUDA_FULL_VERSION="${CUDA_SUBVER["${CUDA_VERSION}"]}" function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } @@ -218,11 +257,13 @@ MIG_MAJOR_CAPS=0 IS_MIG_ENABLED=0 function execute_with_retries() { + set +x local -r cmd="$*" for ((i = 0; i < 3; i++)); do - if eval "$cmd"; then return 0 ; fi + if eval "$cmd"; then set -x ; return 0 ; fi sleep 5 done + set -x return 1 } @@ -336,8 +377,8 @@ function install_nvidia_nccl() { if is_rocky ; then time execute_with_retries \ - "dnf -y -q install" \ - "libnccl-${nccl_version} libnccl-devel-${nccl_version} libnccl-static-${nccl_version}" + dnf -y -q install \ + "libnccl-${nccl_version}" "libnccl-devel-${nccl_version}" "libnccl-static-${nccl_version}" elif is_ubuntu ; then install_cuda_keyring_pkg @@ -345,12 +386,12 @@ function install_nvidia_nccl() { if is_ubuntu18 ; then time execute_with_retries \ - "apt-get install -q -y " \ - "libnccl2 libnccl-dev" + apt-get install -q -y \ + libnccl2 libnccl-dev else time execute_with_retries \ - "apt-get install -q -y " \ - "libnccl2=${nccl_version} libnccl-dev=${nccl_version}" + apt-get install -q -y \ + "libnccl2=${nccl_version}" "libnccl-dev=${nccl_version}" fi else echo "Unsupported OS: '${OS_NAME}'" @@ -594,14 +635,14 @@ function build_driver_from_github() { 2> /var/log/open-gpu-kernel-modules-build_error.log if [[ -n "${PSN}" ]]; then - configure_dkms_certs + #configure_dkms_certs for module in $(find kernel-open -name '*.ko'); do "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \ "${mok_key}" \ "${mok_der}" \ "${module}" done - clear_dkms_key + #clear_dkms_key fi make modules_install \ @@ -629,14 +670,18 @@ function build_driver_from_packages() { add_contrib_component apt-get update -qq execute_with_retries "apt-get install -y -qq --no-install-recommends dkms" - configure_dkms_certs + #configure_dkms_certs time execute_with_retries "apt-get install -y -qq --no-install-recommends ${pkglist[@]}" elif is_rocky ; then - configure_dkms_certs - time execute_with_retries "dnf -y -q module install nvidia-driver:${DRIVER}-open" + #configure_dkms_certs + if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then + echo "nvidia-driver:${DRIVER}-dkms installed successfully" + else + time execute_with_retries dnf -y -q module install 'nvidia-driver:latest' + fi fi - clear_dkms_key + #clear_dkms_key } function install_nvidia_userspace_runfile() { @@ -676,6 +721,7 @@ function install_cuda_toolkit() { } function install_drivers_aliases() { + if is_rocky ; then return ; fi if ! (is_debian12 || is_debian11) ; then return ; fi if (is_debian12 && is_cuda11) && is_src_nvidia ; then return ; fi # don't install on debian 12 / cuda11 with drivers from nvidia # Add a modprobe alias to prefer the open kernel modules @@ -692,7 +738,6 @@ function install_drivers_aliases() { } function load_kernel_module() { - modprobe -r nvidia || echo "unable to unload the nvidia module" # for some use cases, the kernel module needs to be removed before first use of nvidia-smi for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" @@ -709,7 +754,7 @@ function install_nvidia_gpu_driver() { add_nonfree_components add_repo_nvidia_container_toolkit apt-get update -qq - configure_dkms_certs + #configure_dkms_certs apt-get -yq install \ nvidia-container-toolkit \ dkms \ @@ -718,7 +763,7 @@ function install_nvidia_gpu_driver() { nvidia-smi \ libglvnd0 \ libcuda1 - clear_dkms_key + #clear_dkms_key load_kernel_module elif is_ubuntu18 || is_debian10 || (is_debian12 && is_cuda11) ; then @@ -804,7 +849,7 @@ function set_hadoop_property() { } function configure_yarn() { - if [[ ! -f ${HADOOP_CONF_DIR}/resource-types.xml ]]; then + if [[ -d "${HADOOP_CONF_DIR}" && ! -f "${HADOOP_CONF_DIR}/resource-types.xml" ]]; then printf '\n' >"${HADOOP_CONF_DIR}/resource-types.xml" fi set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu' @@ -836,10 +881,14 @@ function configure_yarn_nodemanager() { # Fix local dirs access permissions local yarn_local_dirs=() + readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \ --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \ --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n') - chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" + + if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then + chown yarn:yarn -R "${yarn_local_dirs[@]/,/}" + fi } function configure_gpu_exclusive_mode() { @@ -1074,22 +1123,35 @@ function main() { } function clean_up_sources_lists() { - local OS_CODENAME="$(. /etc/os-release; echo "${VERSION_CODENAME}")" - # # bigtop (primary) # - local dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list" - local -r bigtop_repo_url="https://storage.googleapis.com/goog-dataproc-bigtop-repo-us-west4/2_2_deb12_20240606_230238-RC01" - local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg" - rm -f "${bigtop_kr_path}" - curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \ - "${bigtop_repo_url}/archive.key" | gpg --dearmor -o "${bigtop_kr_path}" - - cat >"${dataproc_repo_file}" < /etc/apt/sources.list.d/adoptium.list @@ -1114,16 +1176,21 @@ EOF rm -f "${docker_kr_path}" curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \ | gpg --dearmor -o "${docker_kr_path}" - echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) ${OS_CODENAME} stable" \ + echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \ > ${docker_repo_file} # - # google cloud + # google cloud + logging/monitoring # - if [[ -f /etc/apt/sources.list.d/google-cloud.list ]]; then + if ls /etc/apt/sources.list.d/google-cloud*.list ; then rm -f /usr/share/keyrings/cloud.google.gpg curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg - sed -i -e 's:deb https:deb [signed_by=/usr/share/keyrings/cloud.google.gpg] https:g' /etc/apt/sources.list.d/google-cloud.list + for list in google-cloud google-cloud-logging google-cloud-monitoring ; do + list_file="/etc/apt/sources.list.d/${list}.list" + if [[ -f "${list_file}" ]]; then + sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}" + fi + done fi # @@ -1133,21 +1200,7 @@ EOF rm -f /usr/share/keyrings/cran-r.gpg curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7' | \ gpg --dearmor -o /usr/share/keyrings/cran-r.gpg - sed -i -e 's:deb http:deb [signed_by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list - fi - - # - # google cloud + logging/monitoring - # - if ls /etc/apt/sources.list.d/google-cloud*.list ; then - rm -f /usr/share/keyrings/cloud.google.gpg - curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg - for list in google-cloud google-cloud-logging google-cloud-monitoring ; do - list_file="/etc/apt/sources.list.d/${list}.list" - if [[ -f "${list_file}" ]]; then - sed -i -e 's:deb https:deb [signed_by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}" - fi - done + sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list fi # @@ -1155,79 +1208,26 @@ EOF # if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then rm -f /usr/share/keyrings/mysql.gpg - mysql_pgp_key_armored="-----BEGIN PGP PUBLIC KEY BLOCK----- -Version: SKS 1.1.6 -Comment: Hostname: pgp.mit.edu - -mQINBGU2rNoBEACSi5t0nL6/Hj3d0PwsbdnbY+SqLUIZ3uWZQm6tsNhvTnahvPPZBGdl99iW -YTt2KmXp0KeN2s9pmLKkGAbacQP1RqzMFnoHawSMf0qTUVjAvhnI4+qzMDjTNSBq9fa3nHmO -YxownnrRkpiQUM/yD7/JmVENgwWb6akZeGYrXch9jd4XV3t8OD6TGzTedTki0TDNr6YZYhC7 -jUm9fK9Zs299pzOXSxRRNGd+3H9gbXizrBu4L/3lUrNf//rM7OvV9Ho7u9YYyAQ3L3+OABK9 -FKHNhrpi8Q0cbhvWkD4oCKJ+YZ54XrOG0YTg/YUAs5/3//FATI1sWdtLjJ5pSb0onV3LIbar -RTN8lC4Le/5kd3lcot9J8b3EMXL5p9OGW7wBfmNVRSUI74Vmwt+v9gyp0Hd0keRCUn8lo/1V -0YD9i92KsE+/IqoYTjnya/5kX41jB8vr1ebkHFuJ404+G6ETd0owwxq64jLIcsp/GBZHGU0R -KKAo9DRLH7rpQ7PVlnw8TDNlOtWt5EJlBXFcPL+NgWbqkADAyA/XSNeWlqonvPlYfmasnAHA -pMd9NhPQhC7hJTjCiAwG8UyWpV8Dj07DHFQ5xBbkTnKH2OrJtguPqSNYtTASbsWz09S8ujoT -DXFT17NbFM2dMIiq0a4VQB3SzH13H2io9Cbg/TzJrJGmwgoXgwARAQABtDZNeVNRTCBSZWxl -YXNlIEVuZ2luZWVyaW5nIDxteXNxbC1idWlsZEBvc3Mub3JhY2xlLmNvbT6JAlQEEwEIAD4W -IQS8pDQXw7SF3RKOxtS3s7eIqNN4XAUCZTas2gIbAwUJA8JnAAULCQgHAgYVCgkICwIEFgID -AQIeAQIXgAAKCRC3s7eIqNN4XLzoD/9PlpWtfHlI8eQTHwGsGIwFA+fgipyDElapHw3MO+K9 -VOEYRZCZSuBXHJe9kjGEVCGUDrfImvgTuNuqYmVUV+wyhP+w46W/cWVkqZKAW0hNp0TTvu3e -Dwap7gdk80VF24Y2Wo0bbiGkpPiPmB59oybGKaJ756JlKXIL4hTtK3/hjIPFnb64Ewe4YLZy -oJu0fQOyA8gXuBoalHhUQTbRpXI0XI3tpZiQemNbfBfJqXo6LP3/LgChAuOfHIQ8alvnhCwx -hNUSYGIRqx+BEbJw1X99Az8XvGcZ36VOQAZztkW7mEfH9NDPz7MXwoEvduc61xwlMvEsUIaS -fn6SGLFzWPClA98UMSJgF6sKb+JNoNbzKaZ8V5w13msLb/pq7hab72HH99XJbyKNliYj3+KA -3q0YLf+Hgt4Y4EhIJ8x2+g690Np7zJF4KXNFbi1BGloLGm78akY1rQlzpndKSpZq5KWw8FY/ -1PEXORezg/BPD3Etp0AVKff4YdrDlOkNB7zoHRfFHAvEuuqti8aMBrbRnRSG0xunMUOEhbYS -/wOOTl0g3bF9NpAkfU1Fun57N96Us2T9gKo9AiOY5DxMe+IrBg4zaydEOovgqNi2wbU0MOBQ -b23Puhj7ZCIXcpILvcx9ygjkONr75w+XQrFDNeux4Znzay3ibXtAPqEykPMZHsZ2sbkCDQRl -NqzaARAAsdvBo8WRqZ5WVVk6lReD8b6Zx83eJUkV254YX9zn5t8KDRjYOySwS75mJIaZLsv0 -YQjJk+5rt10tejyCrJIFo9CMvCmjUKtVbgmhfS5+fUDRrYCEZBBSa0Dvn68EBLiHugr+SPXF -6o1hXEUqdMCpB6oVp6X45JVQroCKIH5vsCtw2jU8S2/IjjV0V+E/zitGCiZaoZ1f6NG7ozyF -ep1CSAReZu/sssk0pCLlfCebRd9Rz3QjSrQhWYuJa+eJmiF4oahnpUGktxMD632I9aG+IMfj -tNJNtX32MbO+Se+cCtVc3cxSa/pR+89a3cb9IBA5tFF2Qoekhqo/1mmLi93Xn6uDUhl5tVxT -nB217dBT27tw+p0hjd9hXZRQbrIZUTyh3+8EMfmAjNSIeR+th86xRd9XFRr9EOqrydnALOUr -9cT7TfXWGEkFvn6ljQX7f4RvjJOTbc4jJgVFyu8K+VU6u1NnFJgDiNGsWvnYxAf7gDDbUSXE -uC2anhWvxPvpLGmsspngge4yl+3nv+UqZ9sm6LCebR/7UZ67tYz3p6xzAOVgYsYcxoIUuEZX -jHQtsYfTZZhrjUWBJ09jrMvlKUHLnS437SLbgoXVYZmcqwAWpVNOLZf+fFm4IE5aGBG5Dho2 -CZ6ujngW9Zkn98T1d4N0MEwwXa2V6T1ijzcqD7GApZUAEQEAAYkCPAQYAQgAJhYhBLykNBfD -tIXdEo7G1Lezt4io03hcBQJlNqzaAhsMBQkDwmcAAAoJELezt4io03hcXqMP/01aPT3A3Sg7 -oTQoHdCxj04ELkzrezNWGM+YwbSKrR2LoXR8zf2tBFzc2/Tl98V0+68f/eCvkvqCuOtq4392 -Ps23j9W3r5XG+GDOwDsx0gl0E+Qkw07pwdJctA6efsmnRkjF2YVO0N9MiJA1tc8NbNXpEEHJ -Z7F8Ri5cpQrGUz/AY0eae2b7QefyP4rpUELpMZPjc8Px39Fe1DzRbT+5E19TZbrpbwlSYs1i -CzS5YGFmpCRyZcLKXo3zS6N22+82cnRBSPPipiO6WaQawcVMlQO1SX0giB+3/DryfN9VuIYd -1EWCGQa3O0MVu6o5KVHwPgl9R1P6xPZhurkDpAd0b1s4fFxin+MdxwmG7RslZA9CXRPpzo7/ -fCMW8sYOH15DP+YfUckoEreBt+zezBxbIX2CGGWEV9v3UBXadRtwxYQ6sN9bqW4jm1b41vNA -17b6CVH6sVgtU3eN+5Y9an1e5jLD6kFYx+OIeqIIId/TEqwS61csY9aav4j4KLOZFCGNU0FV -ji7NQewSpepTcJwfJDOzmtiDP4vol1ApJGLRwZZZ9PB6wsOgDOoP6sr0YrDI/NNX2RyXXbgl -nQ1yJZVSH3/3eo6knG2qTthUKHCRDNKdy9Qqc1x4WWWtSRjh+zX8AvJK2q1rVLH2/3ilxe9w -cAZUlaj3id3TxquAlud4lWDz -=h5nH ------END PGP PUBLIC KEY BLOCK-----" -# curl 'https://dev.mysql.com/doc/refman/8.0/en/checking-gpg-signature.html' | \ -# perl -e '$c = join("",); $c =~ s/.+?(-----BEGIN PGP .* PUBLIC KEY BLOCK-----).+$/$1/ms; print $c, $/' | \ - echo "${mysql_pgp_key_armored}" | gpg --dearmor -o /usr/share/keyrings/mysql.gpg + curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \ + gpg --dearmor -o /usr/share/keyrings/mysql.gpg sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list fi - mv /etc/apt/trusted.gpg /etc/apt/untrusted.gpg + if -f /etc/apt/trusted.gpg ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi } if is_debian ; then clean_up_sources_lists apt-get update -fi -if is_debian12 ; then - export DEBIAN_FRONTEND="noninteractive" - echo "Begin full upgrade" - date - apt-get --yes -qq -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" full-upgrade - - date - echo "End full upgrade" - pkgs="$(apt-get -y full-upgrade 2>&1 | grep -A9 'The following packages have been kept back:' | grep '^ ')" - apt-get install -y --allow-change-held-packages -qq ${pkgs} + if is_debian12 ; then + apt-mark unhold systemd libsystemd0 ; fi fi +configure_dkms_certs + main + +clear_dkms_key + +df -h