From 808938917e43b8782a0b9c44f95c2e68568052af Mon Sep 17 00:00:00 2001
From: liyuan <84758614+nvliyuan@users.noreply.github.com>
Date: Wed, 25 Dec 2024 12:57:47 +0800
Subject: [PATCH] update rapids version for 24.10 release (#1248)

* update v2410 rapids release

Signed-off-by: liyuan

* update the readme doc

Signed-off-by: liyuan

* update the readme doc

Signed-off-by: liyuan

* update v2412 version

Signed-off-by: liyuan

* do not recreate git clone on second pass

* gather timing data for some long-running sections of the installer

* clean up over-commitment of disk space

---------

Signed-off-by: liyuan
Co-authored-by: C.J. Collier
---
 spark-rapids/README.md            | 19 +++----------------
 spark-rapids/spark-rapids.sh      | 14 ++++++++------
 spark-rapids/test_spark_rapids.py |  6 +++---
 3 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/spark-rapids/README.md b/spark-rapids/README.md
index 2e5863988..92f55fea3 100644
--- a/spark-rapids/README.md
+++ b/spark-rapids/README.md
@@ -17,18 +17,8 @@ RAPIDS Accelerator For Apache Spark is supported on Dataproc 2.0+ (Spark 3.0+).
 ## RAPIDS Accelerator For Apache Spark
 
 ### Prerequisites
-
-To use RAPIDS Accelerator For Apache Spark and XGBoost4j with Spark 3:
-
-* Apache Spark 3.0+
-* Hardware Requirements
-  * NVIDIA Pascal™ GPU architecture or better (V100, P100, T4 and later)
-  * Multi-node clusters with homogeneous GPU configuration
-* Software Requirements
-  * NVIDIA GPU driver 440.33+
-  * CUDA v11.5/v11.0/v10.2/v10.1
-  * NCCL 2.11.4+
-  * Ubuntu 18.04, Ubuntu 20.04 or Rocky Linux 7, Rocky Linux 8, Debian 10, Debian 11
+Please refer to the official [RAPIDS Accelerator For Apache Spark](https://nvidia.github.io/spark-rapids/)
+documentation for the hardware and software [requirements](https://nvidia.github.io/spark-rapids/docs/download.html).
 
 This section describes how to create
 [Google Cloud Dataproc](https://cloud.google.com/dataproc) cluster with
@@ -59,11 +49,10 @@ export GCS_BUCKET=
 export REGION=
 export NUM_GPUS=1
 export NUM_WORKERS=2
-export CUDA_VER=11.5
 
 gcloud dataproc clusters create $CLUSTER_NAME \
     --region $REGION \
-    --image-version=2.0-ubuntu18 \
+    --image-version=2.2-ubuntu22 \
     --master-machine-type n1-standard-4 \
     --master-boot-disk-size 200 \
     --num-workers $NUM_WORKERS \
@@ -71,8 +60,6 @@ gcloud dataproc clusters create $CLUSTER_NAME \
     --worker-machine-type n1-standard-8 \
    --num-worker-local-ssds 1 \
    --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh \
-   --optional-components=JUPYTER,ZEPPELIN \
-   --metadata gpu-driver-provider="NVIDIA",rapids-runtime="SPARK",cuda-version="$CUDA_VER" \
    --bucket $GCS_BUCKET \
    --subnet=default \
    --enable-component-gateway
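The cluster-creation example above drops the --optional-components and --metadata flags along with CUDA_VER. As a quick sanity check once such a cluster is up, one could submit a trivial job with the RAPIDS SQL plugin forced on. This is a sketch, not part of the patch: the examples jar path is the usual Dataproc location rather than something this repo guarantees, and CLUSTER_NAME/REGION are the variables exported in the README snippet.

# Hedged smoke test: run SparkPi with the RAPIDS plugin enabled.
gcloud dataproc jobs submit spark \
  --cluster "${CLUSTER_NAME}" \
  --region "${REGION}" \
  --class org.apache.spark.examples.SparkPi \
  --jars file:///usr/lib/spark/examples/jars/spark-examples.jar \
  --properties spark.rapids.sql.enabled=true \
  -- 1000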
diff --git a/spark-rapids/spark-rapids.sh b/spark-rapids/spark-rapids.sh
index 492848340..0b4aabd57 100644
--- a/spark-rapids/spark-rapids.sh
+++ b/spark-rapids/spark-rapids.sh
@@ -216,7 +216,7 @@ else
 fi
 
 # Update SPARK RAPIDS config
-readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+readonly DEFAULT_SPARK_RAPIDS_VERSION="24.12.0"
 readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
 readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})
 
@@ -261,7 +261,7 @@ IS_MIG_ENABLED=0
 function execute_with_retries() {
   local -r cmd=$1
   for ((i = 0; i < 10; i++)); do
-    if eval "$cmd"; then
+    if time eval "$cmd"; then
      return 0
    fi
    sleep 5
@@ -418,8 +418,9 @@ function install_nvidia_gpu_driver() {
   mkdir -p "${WORKDIR}"
   pushd $_
   # Fetch open source kernel module with corresponding tag
-  git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
-    --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
+  test -d open-gpu-kernel-modules || \
+    git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
+      --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
   cd ${WORKDIR}/open-gpu-kernel-modules
   #
   # build kernel modules
@@ -451,7 +452,7 @@ function install_nvidia_gpu_driver() {
   curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
     "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${cuda_runfile}" \
     -o cuda.run
-  bash cuda.run --silent --toolkit --no-opengl-libs
+  time bash cuda.run --silent --toolkit --no-opengl-libs
   rm cuda.run
 else
   # Install from repo provided by NV
@@ -525,7 +526,8 @@ function download_agent(){
   mkdir -p /opt/google
   chmod 777 /opt/google
   cd /opt/google
-  execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
+  test -d compute-gpu-monitoring || \
+    execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
 }
 
 function install_agent_dependency(){
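Two patterns recur in the installer changes above: guard each git clone with a directory test so that a second pass of the installer does not fail on an already-present checkout, and wrap long-running steps in `time` so their durations land in the init-action log. A minimal standalone sketch of both, assuming the same repository and its documented `make modules` build step; the snippet is illustrative, not an excerpt from the script:

#!/usr/bin/env bash
set -euo pipefail

# Clone only when the checkout is absent, so re-running is a no-op.
test -d open-gpu-kernel-modules || \
  git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
    --branch "${NVIDIA_DRIVER_VERSION}" --single-branch

# `time` prints real/user/sys durations to stderr, which the
# installer's log captures; handy for finding the slowest steps.
time make -C open-gpu-kernel-modules modules -j"$(nproc)"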
diff --git a/spark-rapids/test_spark_rapids.py b/spark-rapids/test_spark_rapids.py
index 7af8e3154..6e03f2d62 100644
--- a/spark-rapids/test_spark_rapids.py
+++ b/spark-rapids/test_spark_rapids.py
@@ -75,7 +75,7 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="1024GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
@@ -105,7 +105,7 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="1024GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
@@ -134,7 +134,7 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes,
         machine_type="n1-standard-4",
         master_accelerator=accelerator if configuration == "SINGLE" else None,
         worker_accelerator=accelerator,
-        boot_disk_size="1024GB",
+        boot_disk_size="50GB",
         timeout_in_minutes=30)
 
     for machine_suffix in machine_suffixes:
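The test changes shrink each test cluster's boot disks from 1024GB to 50GB, the over-commitment noted in the commit message. If a future dependency bump pushes past that budget, one way to measure actual usage on a live test node is a quick SSH probe; the cluster and zone names here are placeholders, not values taken from the test harness:

# Check root-filesystem headroom on a worker after the init action runs.
gcloud compute ssh "${CLUSTER_NAME}-w-0" --zone "${ZONE}" \
  --command 'df -h / && du -sh /usr/local/cuda* 2>/dev/null'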