update rapids version for 24.10 release (#1248)
* update to the v24.10 RAPIDS release

Signed-off-by: liyuan <[email protected]>

* update the README doc

Signed-off-by: liyuan <[email protected]>

* update the README doc

Signed-off-by: liyuan <[email protected]>

* update to the v24.12 RAPIDS plugin version

Signed-off-by: liyuan <[email protected]>

* do not recreate the git clone on a second pass

* gather timing data for some long-running sections of the installer

* clean up over-commitment of disk space in the tests


---------

Signed-off-by: liyuan <[email protected]>
Co-authored-by: C.J. Collier <[email protected]>
nvliyuan and cjac authored Dec 25, 2024
1 parent 169e98e commit 8089389
Showing 3 changed files with 14 additions and 25 deletions.
19 changes: 3 additions & 16 deletions spark-rapids/README.md
@@ -17,18 +17,8 @@ RAPIDS Accelerator For Apache Spark is supported on Dataproc 2.0+ (Spark 3.0+).
## RAPIDS Accelerator For Apache Spark

### Prerequisites
-
-To use RAPIDS Accelerator For Apache Spark, XGBoost4j with Spark 3
-
-* Apache Spark 3.0+
-* Hardware Requirements
-  * NVIDIA Pascal™ GPU architecture or better (V100, P100, T4 and later)
-  * Multi-node clusters with homogeneous GPU configuration
-* Software Requirements
-  * NVIDIA GPU driver 440.33+
-  * CUDA v11.5/v11.0/v10.2/v10.1
-  * NCCL 2.11.4+
-  * Ubuntu 18.04, Ubuntu 20.04, Rocky Linux 7, Rocky Linux 8, Debian 10, or Debian 11
+Please see the [RAPIDS Accelerator For Apache Spark](https://nvidia.github.io/spark-rapids/)
+official documentation for the hardware and software [requirements](https://nvidia.github.io/spark-rapids/docs/download.html).

This section describes how to create a
[Google Cloud Dataproc](https://cloud.google.com/dataproc) cluster with
@@ -59,20 +49,17 @@ export GCS_BUCKET=<your bucket for the logs and notebooks>
export REGION=<region>
export NUM_GPUS=1
export NUM_WORKERS=2
-export CUDA_VER=11.5

gcloud dataproc clusters create $CLUSTER_NAME \
    --region $REGION \
-    --image-version=2.0-ubuntu18 \
+    --image-version=2.2-ubuntu22 \
    --master-machine-type n1-standard-4 \
    --master-boot-disk-size 200 \
    --num-workers $NUM_WORKERS \
    --worker-accelerator type=nvidia-tesla-t4,count=$NUM_GPUS \
    --worker-machine-type n1-standard-8 \
    --num-worker-local-ssds 1 \
    --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh \
    --optional-components=JUPYTER,ZEPPELIN \
-    --metadata gpu-driver-provider="NVIDIA",rapids-runtime="SPARK",cuda-version="$CUDA_VER" \
    --bucket $GCS_BUCKET \
    --subnet=default \
    --enable-component-gateway
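
The init action also honors an optional spark-rapids-version metadata key (read via
get_metadata_attribute in spark-rapids.sh below), so a specific plugin release can be pinned at
cluster-creation time instead of relying on the script default. A minimal sketch, assuming that
metadata key is still supported and using the 24.12.0 default introduced by this commit:

# Hedged sketch: pin the RAPIDS plugin release via cluster metadata.
# spark-rapids-version is read by spark-rapids.sh; 24.12.0 matches the
# DEFAULT_SPARK_RAPIDS_VERSION set in this commit.
gcloud dataproc clusters create $CLUSTER_NAME \
    --region $REGION \
    --image-version=2.2-ubuntu22 \
    --worker-accelerator type=nvidia-tesla-t4,count=$NUM_GPUS \
    --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/spark-rapids/spark-rapids.sh \
    --metadata rapids-runtime="SPARK",spark-rapids-version="24.12.0" \
    --bucket $GCS_BUCKET \
    --enable-component-gateway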
14 changes: 8 additions & 6 deletions spark-rapids/spark-rapids.sh
@@ -216,7 +216,7 @@ else
fi

# Update SPARK RAPIDS config
-readonly DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
+readonly DEFAULT_SPARK_RAPIDS_VERSION="24.12.0"
readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' ${DEFAULT_SPARK_RAPIDS_VERSION})
readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' ${DEFAULT_XGBOOST_VERSION})

@@ -261,7 +261,7 @@ IS_MIG_ENABLED=0
function execute_with_retries() {
  local -r cmd=$1
  for ((i = 0; i < 10; i++)); do
-    if eval "$cmd"; then
+    if time eval "$cmd"; then
      return 0
    fi
    sleep 5
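
For reference, this is how the retry helper reads with the timing change applied; the loop close,
failure return, and closing brace are assumed here, since they fall outside the hunk:

# Sketch of execute_with_retries after this change (trailing lines assumed).
# The bash `time` keyword prints real/user/sys durations for every attempt,
# which is the timing data mentioned in the commit message.
function execute_with_retries() {
  local -r cmd=$1
  for ((i = 0; i < 10; i++)); do
    if time eval "$cmd"; then
      return 0
    fi
    sleep 5
  done
  return 1  # assumed: signal failure after exhausting all retries
}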
@@ -418,8 +418,9 @@ function install_nvidia_gpu_driver() {
  mkdir -p "${WORKDIR}"
  pushd $_
  # Fetch open source kernel module with corresponding tag
-  git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
-    --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
+  test -d open-gpu-kernel-modules || \
+    git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
+      --branch "${NVIDIA_DRIVER_VERSION}" --single-branch
  cd ${WORKDIR}/open-gpu-kernel-modules
  #
  # build kernel modules
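
The test -d guard makes the clone idempotent, so a second pass of the installer reuses the existing
checkout instead of failing or re-downloading it. A minimal sketch of the same pattern written as an
explicit branch; the reuse step for an existing checkout is an illustrative assumption, not part of
this commit:

# Hedged sketch of the idempotent-clone pattern; repo_dir and branch are
# illustrative local names rather than variables from spark-rapids.sh.
repo_dir="open-gpu-kernel-modules"
branch="${NVIDIA_DRIVER_VERSION}"
if [[ -d "${repo_dir}" ]]; then
  # Second pass: the checkout already exists, so just switch to the wanted tag.
  git -C "${repo_dir}" checkout "${branch}"
else
  git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git \
    --branch "${branch}" --single-branch
fi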
@@ -451,7 +452,7 @@ function install_nvidia_gpu_driver() {
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
  "https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/local_installers/${cuda_runfile}" \
  -o cuda.run
-bash cuda.run --silent --toolkit --no-opengl-libs
+time bash cuda.run --silent --toolkit --no-opengl-libs
rm cuda.run
else
  # Install from repo provided by NV
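
Wrapping long-running steps with the shell time keyword (the eval in execute_with_retries and the
CUDA runfile install above) prints real/user/sys durations to the init action's log. A hedged sketch
of one way such timings could be labeled; the log_timed helper is hypothetical and not part of this
script:

# Hypothetical helper: run a command under `time` with a label so the
# durations of long-running installer sections are easy to spot in the log.
function log_timed() {
  local -r label="$1" ; shift
  echo ">>> ${label} started at $(date -u '+%Y-%m-%dT%H:%M:%SZ')"
  time "$@"
  echo "<<< ${label} finished at $(date -u '+%Y-%m-%dT%H:%M:%SZ')"
}

# Example usage, mirroring the runfile install step above:
log_timed "cuda-runfile-install" bash cuda.run --silent --toolkit --no-opengl-libs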
@@ -525,7 +526,8 @@ function download_agent(){
mkdir -p /opt/google
chmod 777 /opt/google
cd /opt/google
execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
test -d compute-gpu-monitoring || \
execute_with_retries "git clone https://github.com/GoogleCloudPlatform/compute-gpu-monitoring.git"
}

function install_agent_dependency(){
6 changes: 3 additions & 3 deletions spark-rapids/test_spark_rapids.py
@@ -75,7 +75,7 @@ def test_spark_rapids(self, configuration, machine_suffixes, accelerator):
machine_type="n1-standard-4",
master_accelerator=accelerator if configuration == "SINGLE" else None,
worker_accelerator=accelerator,
boot_disk_size="1024GB",
boot_disk_size="50GB",
timeout_in_minutes=30)

for machine_suffix in machine_suffixes:
@@ -105,7 +105,7 @@ def test_spark_rapids_sql(self, configuration, machine_suffixes, accelerator):
machine_type="n1-standard-4",
master_accelerator=accelerator if configuration == "SINGLE" else None,
worker_accelerator=accelerator,
boot_disk_size="1024GB",
boot_disk_size="50GB",
timeout_in_minutes=30)

for machine_suffix in machine_suffixes:
@@ -134,7 +134,7 @@ def test_non_default_cuda_versions(self, configuration, machine_suffixes,
machine_type="n1-standard-4",
master_accelerator=accelerator if configuration == "SINGLE" else None,
worker_accelerator=accelerator,
boot_disk_size="1024GB",
boot_disk_size="50GB",
timeout_in_minutes=30)

for machine_suffix in machine_suffixes:
