From 6bb948b8a5614c57da41c6297a7597fc716caa62 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 9 Jan 2025 20:20:26 -0800 Subject: [PATCH] [gpu] Exercise new template-generated GPU driver installer gpu/test_gpu.py: * using tests from #1275 gpu/verify_pyspark.py: * new test file ; will probably be moved to mlvm templates/gpu/install_gpu_driver.sh.in: * this action template includes only the code unique to this action --- gpu/BUILD | 6 +- gpu/test_gpu.py | 296 +++++++++++++++++-------- gpu/verify_pyspark.py | 45 ++++ templates/gpu/install_gpu_driver.sh.in | 81 +++++++ 4 files changed, 334 insertions(+), 94 deletions(-) create mode 100644 gpu/verify_pyspark.py create mode 100644 templates/gpu/install_gpu_driver.sh.in diff --git a/gpu/BUILD b/gpu/BUILD index b481c5b33..bd5500ccb 100644 --- a/gpu/BUILD +++ b/gpu/BUILD @@ -6,7 +6,11 @@ py_test( name = "test_gpu", size = "enormous", srcs = ["test_gpu.py"], - data = ["install_gpu_driver.sh", "mig.sh"], + data = [ + "install_gpu_driver.sh", + "verify_pyspark.py", + "mig.sh" + ], local = True, shard_count = 15, deps = [ diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index f8438915f..412d16ddb 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -4,27 +4,27 @@ from absl.testing import absltest from absl.testing import parameterized +import unittest + from integration_tests.dataproc_test_case import DataprocTestCase +DEFAULT_TIMEOUT = 15 # minutes +DEFAULT_CUDA_VERSION = "12.4" class NvidiaGpuDriverTestCase(DataprocTestCase): COMPONENT = "gpu" INIT_ACTIONS = ["gpu/install_gpu_driver.sh"] GPU_L4 = "type=nvidia-l4" GPU_T4 = "type=nvidia-tesla-t4" - GPU_V100 = "type=nvidia-tesla-v100" # not available in us-central1-a - GPU_A100 = "type=nvidia-tesla-a100" GPU_H100 = "type=nvidia-h100-80gb,count=8" def verify_instance(self, name): # Verify that nvidia-smi works - time.sleep(3) # Many failed nvidia-smi attempts have been caused by impatience + import random + # Many failed nvidia-smi attempts have been caused by impatience 
and temporal collisions + time.sleep( 3 + random.randint(1, 30) ) self.assert_instance_command(name, "nvidia-smi", 1) - def verify_pyspark(self, name): - # Verify that pyspark works - self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1) - def verify_mig_instance(self, name): self.assert_instance_command(name, "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'") @@ -41,6 +41,27 @@ def verify_instance_nvcc(self, name, cuda_version): self.assert_instance_command( name, "/usr/local/cuda-{}/bin/nvcc --version | grep 'release {}'".format(cuda_version,cuda_version) ) + def verify_instance_pyspark(self, name): + # Verify that pyspark works + self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1) + + def verify_instance_cuda_version(self, name, cuda_version): + self.assert_instance_command( + name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/cuda_version/text()' - | grep {}".format(cuda_version) ) + + def verify_instance_driver_version(self, name, driver_version): + self.assert_instance_command( + name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/driver_version/text()' - | grep {}".format(driver_version) ) + + def verify_pyspark(self): + self.assert_dataproc_job( + self.getClusterName(), + "pyspark", + """--properties="spark.executor.resource.gpu.amount=1" \ + --properties="spark.task.resource.gpu.amount=0.01" \ + '{}/gpu/verify_pyspark.py'""".format(self.INIT_ACTIONS_REPO) + ) + def verify_instance_spark(self): self.assert_dataproc_job( self.getClusterName(), @@ -56,6 +77,22 @@ def verify_instance_spark(self): + 
"spark.yarn.unmanagedAM.enabled=false" ) + def verify_driver_signature(self, name): + cert_path='/var/lib/dkms/mok.pub' + if self.getImageOs() == 'ubuntu': + cert_path='/var/lib/shim-signed/mok/MOK.der' + + cert_verification_cmd = """ +perl -Mv5.10 -e ' +my ($cert) = ( qx{{openssl x509 -inform DER -in {} -text}} + =~ /Serial Number:.*? +(.+?)\s*$/ms ); +my ($kmod) = ( qx{{modinfo nvidia}} + =~ /^sig_key:\s+(\S+)/ms ); +exit 1 unless $cert eq lc $kmod +' +""" + self.assert_instance_command( name, cert_verification_cmd.format(cert_path) ) + @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, None), # ("STANDARD", ["m"], GPU_T4, None, None), @@ -64,8 +101,13 @@ def verify_instance_spark(self): def test_install_gpu_default_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") + self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere") + + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + self.skipTest("known to fail") metadata = None if driver_provider is not None: @@ -73,17 +115,18 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-highmem-8", + machine_type="n1-standard-32", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=90, - boot_disk_size="50GB") + timeout_in_minutes=90, # This cluster is sized and timed appropriately to build the kernel driver and nccl + boot_disk_size="60GB") for 
machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) - if ( self.getImageOs() != 'rocky' ) or ( configuration != 'SINGLE' ) or ( configuration == 'SINGLE' and self.getImageOs() == 'rocky' and self.getImageVersion() > pkg_resources.parse_version("2.1") ): - self.verify_pyspark(machine_name) + self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION) + self.verify_instance_pyspark(machine_name) + self.verify_pyspark() @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, None), @@ -91,13 +134,15 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, def test_install_gpu_without_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): - self.skipTest("No need to regularly test not installing the agent") - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - metadata = "install-gpu-agent=false" + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + self.skipTest("known to fail") + if driver_provider is not None: metadata += ",gpu-driver-provider={}".format(driver_provider) self.createCluster( @@ -107,22 +152,26 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB") for machine_suffix in machine_suffixes: - self.verify_instance("{}-{}".format(self.getClusterName(), - machine_suffix)) - + machine_name="{}-{}".format(self.getClusterName(),machine_suffix) + 
self.verify_instance(machine_name) + self.verify_pyspark() @parameterized.parameters( - ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None), + ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None), # ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "NVIDIA"), # ("STANDARD", ["m"], GPU_T4, None, "NVIDIA"), ) def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") + self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere") + + if configuration == 'KERBEROS' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('KERBEROS fails with image version <= 2.1') + self.skipTest("known to fail") metadata = "install-gpu-agent=true" if driver_provider is not None: @@ -134,40 +183,46 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB", scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: - self.verify_instance("{}-{}".format(self.getClusterName(), - machine_suffix)) - self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(), - machine_suffix)) + machine_name="{}-{}".format(self.getClusterName(),machine_suffix) + self.verify_instance(machine_name) + self.verify_instance_gpu_agent(machine_name) + self.verify_pyspark() @parameterized.parameters( -# ("SINGLE", ["m"], GPU_T4, None, "12.0"), - ("SINGLE", ["m"], GPU_T4, None, "11.8"), + ("SINGLE", ["m"], GPU_T4, None, "12.4"), +# ("SINGLE", ["m"], GPU_T4, None, "11.8"), ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), -# ("STANDARD", ["w-0", 
"w-1"], None, GPU_T4, "11.8"), + ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"), ) def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): - self.skipTest("CUDA == 12.0 not supported on debian 12") + if configuration == 'KERBEROS' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('KERBEROS fails with image version <= 2.1') + unittest.expectedFailure(self) + self.skipTest("known to fail") - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") + self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") - if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ + if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") + self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) + + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and 
self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + self.skipTest("known to fail") + metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( @@ -177,40 +232,41 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB") + for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) self.verify_instance_nvcc(machine_name, cuda_version) + self.verify_instance_pyspark(machine_name) + self.verify_pyspark() @parameterized.parameters( - ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.8"), -# ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.0"), - ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.4"), + ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "11.8"), +# ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "12.0"), + ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "12.4"), ) def test_install_gpu_with_mig(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider, cuda_version): - - self.skipTest("Test is known to fail. 
Skipping so that we can exercise others") - - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - - if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): - self.skipTest("CUDA == 12.0 not supported on debian 12") - - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ + # Operation [projects/.../regions/.../operations/...] failed: + # Invalid value for field 'resource.machineType': \ + # 'https://www.googleapis.com/compute/v1/projects/.../zones/.../' \ + # 'machineTypes/a3-highgpu-8g'. \ + # NetworkInterface NicType can only be set to GVNIC on instances with GVNIC GuestOsFeature.. + # ('This use case not thoroughly tested') + unittest.expectedFailure(self) + self.skipTest("known to fail") + + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") + self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") - if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ + if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") + self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) metadata = 
"gpu-driver-provider={},cuda-version={}".format(driver_provider, cuda_version) @@ -218,11 +274,11 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, configuration, self.INIT_ACTIONS, master_machine_type="a3-highgpu-8g", - worker_machine_type="a2-highgpu-2g", + worker_machine_type="a3-highgpu-8g", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB", startup_script="gpu/mig.sh") @@ -236,12 +292,12 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, ) def test_gpu_allocation(self, configuration, master_accelerator, worker_accelerator, driver_provider): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \ - and configuration == 'SINGLE': - self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty") + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + self.skipTest("known to fail") metadata = None if driver_provider is not None: @@ -255,9 +311,9 @@ def test_gpu_allocation(self, configuration, master_accelerator, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, boot_disk_size="50GB", - timeout_in_minutes=30) + timeout_in_minutes=90) - self.verify_instance_spark() + self.verify_pyspark() @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, "11.8"), @@ -270,26 +326,20 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf 
master_accelerator, worker_accelerator, cuda_version): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \ - and configuration == 'SINGLE': - self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests fail with errors about nodes_include being empty") - - if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): - self.skipTest("CUDA == 12.0 not supported on debian 12") - - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") + self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") - if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ + if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") + self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) + + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE 
configuration with errors about nodes_include being empty') + self.skipTest("known to fail") metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( @@ -299,16 +349,76 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB", scopes="https://www.googleapis.com/auth/monitoring.write") + for machine_suffix in machine_suffixes: - self.verify_instance("{}-{}".format(self.getClusterName(), - machine_suffix)) - self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(), - machine_suffix)) + machine_name="{}-{}".format(self.getClusterName(),machine_suffix) + self.verify_instance(machine_name) + self.verify_instance_gpu_agent(machine_name) + self.verify_pyspark() + + @parameterized.parameters( +# ("SINGLE", ["m"], GPU_T4, GPU_T4, "11.8", ''), +# ("STANDARD", ["m"], GPU_T4, None, "12.0"), + ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.1.1", 'rocky', '2.0'), +# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4", 'rocky', '2.1'), +# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.0", 'rocky', '2.2'), +# ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.6", 'rocky', '2.2'), +# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), +# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"), + ) + def tests_driver_signing(self, configuration, machine_suffixes, + master_accelerator, worker_accelerator, + cuda_version, image_os, image_version): + + if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ + and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) + + if configuration == 'KERBEROS' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # 
('KERBEROS fails with image version <= 2.1') + unittest.expectedFailure(self) + self.skipTest("known to fail") + + kvp_array=[] + import os + + if "private_secret_name" in os.environ: + for env_var in ['public_secret_name', 'private_secret_name', 'secret_project', 'secret_version', 'modulus_md5sum']: + kvp_array.append( "{}={}".format( env_var, os.environ[env_var] ) ) + + if not kvp_array or kvp_array[0] == "public_secret_name=": + self.skipTest("This test only runs when signing environment has been configured in presubmit.sh") + else: + self.skipTest("This test only runs when signing environment has been configured in presubmit.sh") + + metadata = ",".join( kvp_array ) + + if self.getImageOs() != image_os: + self.skipTest("This test is only run on os {}".format(image_os)) + if self.getImageVersion() != pkg_resources.parse_version(image_version): + self.skipTest("This test is only run on Dataproc Image Version {}".format(image_version)) + + self.createCluster( + configuration, + self.INIT_ACTIONS, + machine_type="n1-highmem-8", + master_accelerator=master_accelerator, + worker_accelerator=worker_accelerator, + metadata=metadata, + timeout_in_minutes=90, + boot_disk_size="50GB", + scopes="https://www.googleapis.com/auth/monitoring.write") + for machine_suffix in machine_suffixes: + hostname="{}-{}".format(self.getClusterName(),machine_suffix) + self.verify_instance(hostname) + self.verify_instance_gpu_agent(hostname) + self.verify_driver_signature(hostname) - self.verify_instance_spark() + self.verify_pyspark() if __name__ == "__main__": absltest.main() diff --git a/gpu/verify_pyspark.py b/gpu/verify_pyspark.py new file mode 100644 index 000000000..9f2b18683 --- /dev/null +++ b/gpu/verify_pyspark.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +# +# Copyright 2025 Google LLC and contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import matplotlib.pyplot as plt +import numpy as np + +from pyspark import SparkContext +from pyspark.sql import SparkSession +from pyspark import SparkConf, StorageLevel +from tqdm import tqdm +from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover +import pyspark.sql.functions as f + +spark = SparkSession.builder.appName("spark-rapids").getOrCreate() + +#from utils import SimpleTimer, ResultsLogger, visualize_data + +conf = (SparkConf().setMaster("local[*]") + .setAppName("SparkVectorizer") + .set('spark.driver.memory', '300G') + .set('spark.driver.maxResultSize', '20G') + .set('spark.network.timeout', '7200s') + ) + +sc = SparkContext.getOrCreate(conf=conf) +sc.setLogLevel("FATAL") +spark = SparkSession(sc) +print(sc._conf.getAll()) # check context settings + +x = np.linspace(0, 3*np.pi, 500) +plt.plot(x, np.sin(x**2)) +plt.title('A simple chirp'); diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in new file mode 100644 index 000000000..c2960e00e --- /dev/null +++ b/templates/gpu/install_gpu_driver.sh.in @@ -0,0 +1,81 @@ +#!/bin/bash +# +[% INSERT legal/license_header %] +# +[% PROCESS common/template_disclaimer %] +# +# This script installs NVIDIA GPU drivers and collects GPU utilization metrics. 
+ +set -euxo pipefail + +[% INSERT common/util_functions %] + +[% INSERT common/install_functions %] + +[% INSERT gpu/util_functions %] + +[% INSERT gpu/install_functions %] + +[% INCLUDE gpu/yarn_functions %] + +[% INSERT gpu/spark_functions %] + +function main() { + install_gpu_driver_and_cuda + + #Install GPU metrics collection in Stackdriver if needed + if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then + install_gpu_agent +# install_gpu_monitoring_agent + echo 'GPU metrics agent successfully deployed.' + else + echo 'GPU metrics agent has not been installed.' + fi + configure_gpu_exclusive_mode + + setup_gpu_yarn + + echo "yarn setup complete" + + if ( test -v CUDNN_VERSION && [[ -n "${CUDNN_VERSION}" ]] ) ; then + install_nvidia_nccl + install_nvidia_cudnn + fi + + if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then + install_spark_rapids + configure_gpu_script + echo "RAPIDS initialized with Spark runtime" + elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then + # we are not currently tooled for installing dask in this action. + echo "RAPIDS recognizes DASK runtime - currently supported using dask/dask.sh or rapids/rapids.sh" + else + echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}" + fi + + echo "main complete" + return 0 +} + +function exit_handler() { + set +e + gpu_install_exit_handler + gpu_exit_handler + pip_exit_handler + yarn_exit_handler + common_exit_handler + return 0 +} + +function prepare_to_install(){ + prepare_spark_env + prepare_common_env + prepare_pip_env + prepare_gpu_env + prepare_gpu_install_env + trap exit_handler EXIT +} + +prepare_to_install + +main