From 6bb948b8a5614c57da41c6297a7597fc716caa62 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 9 Jan 2025 20:20:26 -0800 Subject: [PATCH] [gpu] Exercise new template-generated GPU driver installer gpu/test_gpu.py: * using tests from #1275 gpu/verify_pyspark.py: * new test file ; will probably be moved to mlvm templates/gpu/install_gpu_driver.sh.in: * this action template includes only the code unique to this action --- gpu/BUILD | 6 +- gpu/test_gpu.py | 296 +++++++++++++++++-------- gpu/verify_pyspark.py | 45 ++++ templates/gpu/install_gpu_driver.sh.in | 81 +++++++ 4 files changed, 334 insertions(+), 94 deletions(-) create mode 100644 gpu/verify_pyspark.py create mode 100644 templates/gpu/install_gpu_driver.sh.in diff --git a/gpu/BUILD b/gpu/BUILD index b481c5b33..bd5500ccb 100644 --- a/gpu/BUILD +++ b/gpu/BUILD @@ -6,7 +6,11 @@ py_test( name = "test_gpu", size = "enormous", srcs = ["test_gpu.py"], - data = ["install_gpu_driver.sh", "mig.sh"], + data = [ + "install_gpu_driver.sh", + "verify_pyspark.py", + "mig.sh" + ], local = True, shard_count = 15, deps = [ diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index f8438915f..412d16ddb 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -4,27 +4,27 @@ from absl.testing import absltest from absl.testing import parameterized +import unittest + from integration_tests.dataproc_test_case import DataprocTestCase +DEFAULT_TIMEOUT = 15 # minutes +DEFAULT_CUDA_VERSION = "12.4" class NvidiaGpuDriverTestCase(DataprocTestCase): COMPONENT = "gpu" INIT_ACTIONS = ["gpu/install_gpu_driver.sh"] GPU_L4 = "type=nvidia-l4" GPU_T4 = "type=nvidia-tesla-t4" - GPU_V100 = "type=nvidia-tesla-v100" # not available in us-central1-a - GPU_A100 = "type=nvidia-tesla-a100" GPU_H100 = "type=nvidia-h100-80gb,count=8" def verify_instance(self, name): # Verify that nvidia-smi works - time.sleep(3) # Many failed nvidia-smi attempts have been caused by impatience + import random + # Many failed nvidia-smi attempts have been caused by impatience 
and temporal collisions + time.sleep( 3 + random.randint(1, 30) ) self.assert_instance_command(name, "nvidia-smi", 1) - def verify_pyspark(self, name): - # Verify that pyspark works - self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1) - def verify_mig_instance(self, name): self.assert_instance_command(name, "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'") @@ -41,6 +41,27 @@ def verify_instance_nvcc(self, name, cuda_version): self.assert_instance_command( name, "/usr/local/cuda-{}/bin/nvcc --version | grep 'release {}'".format(cuda_version,cuda_version) ) + def verify_instance_pyspark(self, name): + # Verify that pyspark works + self.assert_instance_command(name, "echo 'from pyspark.sql import SparkSession ; SparkSession.builder.getOrCreate()' | pyspark -c spark.executor.resource.gpu.amount=1 -c spark.task.resource.gpu.amount=0.01", 1) + + def verify_instance_cuda_version(self, name, cuda_version): + self.assert_instance_command( + name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/cuda_version/text()' - | grep {}".format(cuda_version) ) + + def verify_instance_driver_version(self, name, driver_version): + self.assert_instance_command( + name, "nvidia-smi -q -x | /opt/conda/default/bin/xmllint --xpath '//nvidia_smi_log/driver_version/text()' - | grep {}".format(driver_version) ) + + def verify_pyspark(self): + self.assert_dataproc_job( + self.getClusterName(), + "pyspark", + """--properties="spark.executor.resource.gpu.amount=1" \ + --properties="spark.task.resource.gpu.amount=0.01" \ + '{}/gpu/verify_pyspark.py'""".format(self.INIT_ACTIONS_REPO) + ) + def verify_instance_spark(self): self.assert_dataproc_job( self.getClusterName(), @@ -56,6 +77,22 @@ def verify_instance_spark(self): + 
"spark.yarn.unmanagedAM.enabled=false" ) + def verify_driver_signature(self, name): + cert_path='/var/lib/dkms/mok.pub' + if self.getImageOs() == 'ubuntu': + cert_path='/var/lib/shim-signed/mok/MOK.der' + + cert_verification_cmd = """ +perl -Mv5.10 -e ' +my ($cert) = ( qx{{openssl x509 -inform DER -in {} -text}} + =~ /Serial Number:.*? +(.+?)\s*$/ms ); +my ($kmod) = ( qx{{modinfo nvidia}} + =~ /^sig_key:\s+(\S+)/ms ); +exit 1 unless $cert eq lc $kmod +' +""" + self.assert_instance_command( name, cert_verification_cmd.format(cert_path) ) + @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, None), # ("STANDARD", ["m"], GPU_T4, None, None), @@ -64,8 +101,13 @@ def verify_instance_spark(self): def test_install_gpu_default_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") + self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere") + + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + self.skipTest("known to fail") metadata = None if driver_provider is not None: @@ -73,17 +115,18 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, self.createCluster( configuration, self.INIT_ACTIONS, - machine_type="n1-highmem-8", + machine_type="n1-standard-32", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=90, - boot_disk_size="50GB") + timeout_in_minutes=90, # This cluster is sized and timed appropriately to build the kernel driver and nccl + boot_disk_size="60GB") for 
machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) - if ( self.getImageOs() != 'rocky' ) or ( configuration != 'SINGLE' ) or ( configuration == 'SINGLE' and self.getImageOs() == 'rocky' and self.getImageVersion() > pkg_resources.parse_version("2.1") ): - self.verify_pyspark(machine_name) + self.verify_instance_nvcc(machine_name, DEFAULT_CUDA_VERSION) + self.verify_instance_pyspark(machine_name) + self.verify_pyspark() @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, None), @@ -91,13 +134,15 @@ def test_install_gpu_default_agent(self, configuration, machine_suffixes, def test_install_gpu_without_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): - self.skipTest("No need to regularly test not installing the agent") - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - metadata = "install-gpu-agent=false" + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + self.skipTest("known to fail") + if driver_provider is not None: metadata += ",gpu-driver-provider={}".format(driver_provider) self.createCluster( @@ -107,22 +152,26 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB") for machine_suffix in machine_suffixes: - self.verify_instance("{}-{}".format(self.getClusterName(), - machine_suffix)) - + machine_name="{}-{}".format(self.getClusterName(),machine_suffix) + 
self.verify_instance(machine_name) + self.verify_pyspark() @parameterized.parameters( - ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None), + ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, None), # ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "NVIDIA"), # ("STANDARD", ["m"], GPU_T4, None, "NVIDIA"), ) def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") + self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere") + + if configuration == 'KERBEROS' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('KERBEROS fails with image version <= 2.1') + self.skipTest("known to fail") metadata = "install-gpu-agent=true" if driver_provider is not None: @@ -134,40 +183,46 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB", scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: - self.verify_instance("{}-{}".format(self.getClusterName(), - machine_suffix)) - self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(), - machine_suffix)) + machine_name="{}-{}".format(self.getClusterName(),machine_suffix) + self.verify_instance(machine_name) + self.verify_instance_gpu_agent(machine_name) + self.verify_pyspark() @parameterized.parameters( -# ("SINGLE", ["m"], GPU_T4, None, "12.0"), - ("SINGLE", ["m"], GPU_T4, None, "11.8"), + ("SINGLE", ["m"], GPU_T4, None, "12.4"), +# ("SINGLE", ["m"], GPU_T4, None, "11.8"), ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4"), -# ("STANDARD", ["w-0", 
"w-1"], None, GPU_T4, "11.8"), + ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "11.8"), ) def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): - self.skipTest("CUDA == 12.0 not supported on debian 12") + if configuration == 'KERBEROS' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('KERBEROS fails with image version <= 2.1') + unittest.expectedFailure(self) + self.skipTest("known to fail") - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") + self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") - if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ + if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") + self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) + + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and 
self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + self.skipTest("known to fail") + metadata = "gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( @@ -177,40 +232,41 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB") + for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) self.verify_instance_nvcc(machine_name, cuda_version) + self.verify_instance_pyspark(machine_name) + self.verify_pyspark() @parameterized.parameters( - ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "11.8"), -# ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.0"), - ("STANDARD", ["m"], GPU_H100, GPU_A100, "NVIDIA", "12.4"), + ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "11.8"), +# ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "12.0"), + ("STANDARD", ["m"], GPU_H100, GPU_H100, "NVIDIA", "12.4"), ) def test_install_gpu_with_mig(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider, cuda_version): - - self.skipTest("Test is known to fail. 
Skipping so that we can exercise others") - - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - - if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): - self.skipTest("CUDA == 12.0 not supported on debian 12") - - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ + # Operation [projects/.../regions/.../operations/...] failed: + # Invalid value for field 'resource.machineType': \ + # 'https://www.googleapis.com/compute/v1/projects/.../zones/.../' \ + # 'machineTypes/a3-highgpu-8g'. \ + # NetworkInterface NicType can only be set to GVNIC on instances with GVNIC GuestOsFeature.. + # ('This use case not thoroughly tested') + unittest.expectedFailure(self) + self.skipTest("known to fail") + + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") + self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") - if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ + if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") + self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) metadata = 
"gpu-driver-provider={},cuda-version={}".format(driver_provider, cuda_version) @@ -218,11 +274,11 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, configuration, self.INIT_ACTIONS, master_machine_type="a3-highgpu-8g", - worker_machine_type="a2-highgpu-2g", + worker_machine_type="a3-highgpu-8g", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB", startup_script="gpu/mig.sh") @@ -236,12 +292,12 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, ) def test_gpu_allocation(self, configuration, master_accelerator, worker_accelerator, driver_provider): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \ - and configuration == 'SINGLE': - self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests are known to fail with errors about nodes_include being empty") + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') + self.skipTest("known to fail") metadata = None if driver_provider is not None: @@ -255,9 +311,9 @@ def test_gpu_allocation(self, configuration, master_accelerator, master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, boot_disk_size="50GB", - timeout_in_minutes=30) + timeout_in_minutes=90) - self.verify_instance_spark() + self.verify_pyspark() @parameterized.parameters( ("SINGLE", ["m"], GPU_T4, None, "11.8"), @@ -270,26 +326,20 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf 
master_accelerator, worker_accelerator, cuda_version): - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("GPU drivers are currently FTBFS on Rocky 9 ; base dataproc image out of date") - - if ( self.getImageOs() == 'rocky' ) and self.getImageVersion() <= pkg_resources.parse_version("2.1") \ - and configuration == 'SINGLE': - self.skipTest("2.1-rocky8 and 2.0-rocky8 single instance tests fail with errors about nodes_include being empty") - - if pkg_resources.parse_version(cuda_version) == pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' and self.getImageVersion() >= pkg_resources.parse_version("2.2") ): - self.skipTest("CUDA == 12.0 not supported on debian 12") - - if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.0") \ + if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ ( self.getImageOs() == 'debian' and self.getImageVersion() <= pkg_resources.parse_version("2.1") ) ): - self.skipTest("CUDA > 12.0 not supported on older debian/ubuntu releases") + self.skipTest("CUDA > 12.4 not supported on older debian/ubuntu releases") - if pkg_resources.parse_version(cuda_version) < pkg_resources.parse_version("12.0") \ - and ( self.getImageOs() == 'debian' or self.getImageOs() == 'rocky' ) \ + if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ and self.getImageVersion() >= pkg_resources.parse_version("2.2"): - self.skipTest("CUDA < 12 not supported on Debian >= 12, Rocky >= 9") + self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) + + if configuration == 'SINGLE' \ + and self.getImageOs() == 'rocky' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE 
configuration with errors about nodes_include being empty') + self.skipTest("known to fail") metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) self.createCluster( @@ -299,16 +349,76 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, metadata=metadata, - timeout_in_minutes=30, + timeout_in_minutes=90, boot_disk_size="50GB", scopes="https://www.googleapis.com/auth/monitoring.write") + for machine_suffix in machine_suffixes: - self.verify_instance("{}-{}".format(self.getClusterName(), - machine_suffix)) - self.verify_instance_gpu_agent("{}-{}".format(self.getClusterName(), - machine_suffix)) + machine_name="{}-{}".format(self.getClusterName(),machine_suffix) + self.verify_instance(machine_name) + self.verify_instance_gpu_agent(machine_name) + self.verify_pyspark() + + @parameterized.parameters( +# ("SINGLE", ["m"], GPU_T4, GPU_T4, "11.8", ''), +# ("STANDARD", ["m"], GPU_T4, None, "12.0"), + ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.1.1", 'rocky', '2.0'), +# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.4", 'rocky', '2.1'), +# ("STANDARD", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.0", 'rocky', '2.2'), +# ("KERBEROS", ["m", "w-0", "w-1"], GPU_T4, GPU_T4, "12.6", 'rocky', '2.2'), +# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "11.8"), +# ("STANDARD", ["w-0", "w-1"], None, GPU_T4, "12.0"), + ) + def tests_driver_signing(self, configuration, machine_suffixes, + master_accelerator, worker_accelerator, + cuda_version, image_os, image_version): + + if pkg_resources.parse_version(cuda_version) <= pkg_resources.parse_version("12.0") \ + and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest( "Kernel driver FTBFS with older CUDA versions on image version >= 2.2" ) + + if configuration == 'KERBEROS' \ + and self.getImageVersion() <= pkg_resources.parse_version("2.1"): + # 
('KERBEROS fails with image version <= 2.1') + unittest.expectedFailure(self) + self.skipTest("known to fail") + + kvp_array=[] + import os + + if "private_secret_name" in os.environ: + for env_var in ['public_secret_name', 'private_secret_name', 'secret_project', 'secret_version', 'modulus_md5sum']: + kvp_array.append( "{}={}".format( env_var, os.environ[env_var] ) ) + + if not kvp_array or kvp_array[0] == "public_secret_name=": + self.skipTest("This test only runs when signing environment has been configured in presubmit.sh") + else: + self.skipTest("This test only runs when signing environment has been configured in presubmit.sh") + + metadata = ",".join( kvp_array ) + + if self.getImageOs() != image_os: + self.skipTest("This test is only run on os {}".format(image_os)) + if self.getImageVersion() != pkg_resources.parse_version(image_version): + self.skipTest("This test is only run on Dataproc Image Version {}".format(image_version)) + + self.createCluster( + configuration, + self.INIT_ACTIONS, + machine_type="n1-highmem-8", + master_accelerator=master_accelerator, + worker_accelerator=worker_accelerator, + metadata=metadata, + timeout_in_minutes=90, + boot_disk_size="50GB", + scopes="https://www.googleapis.com/auth/monitoring.write") + for machine_suffix in machine_suffixes: + hostname="{}-{}".format(self.getClusterName(),machine_suffix) + self.verify_instance(hostname) + self.verify_instance_gpu_agent(hostname) + self.verify_driver_signature(hostname) - self.verify_instance_spark() + self.verify_pyspark() if __name__ == "__main__": absltest.main() diff --git a/gpu/verify_pyspark.py b/gpu/verify_pyspark.py new file mode 100644 index 000000000..9f2b18683 --- /dev/null +++ b/gpu/verify_pyspark.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +# +# Copyright 2025 Google LLC and contributors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS-IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import matplotlib.pyplot as plt +import numpy as np + +from pyspark import SparkContext +from pyspark.sql import SparkSession +from pyspark import SparkConf, StorageLevel +from tqdm import tqdm +from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover +import pyspark.sql.functions as f + +spark = SparkSession.builder.appName("spark-rapids").getOrCreate() + +#from utils import SimpleTimer, ResultsLogger, visualize_data + +conf = (SparkConf().setMaster("local[*]") + .setAppName("SparkVectorizer") + .set('spark.driver.memory', '300G') + .set('spark.driver.maxResultSize', '20G') + .set('spark.network.timeout', '7200s') + ) + +sc = SparkContext.getOrCreate(conf=conf) +sc.setLogLevel("FATAL") +spark = SparkSession(sc) +print(sc._conf.getAll()) # check context settings + +x = np.linspace(0, 3*np.pi, 500) +plt.plot(x, np.sin(x**2)) +plt.title('A simple chirp'); diff --git a/templates/gpu/install_gpu_driver.sh.in b/templates/gpu/install_gpu_driver.sh.in new file mode 100644 index 000000000..c2960e00e --- /dev/null +++ b/templates/gpu/install_gpu_driver.sh.in @@ -0,0 +1,81 @@ +#!/bin/bash +# +[% INSERT legal/license_header %] +# +[% PROCESS common/template_disclaimer %] +# +# This script installs NVIDIA GPU drivers and collects GPU utilization metrics. 
+ +set -euxo pipefail + +[% INSERT common/util_functions %] + +[% INSERT common/install_functions %] + +[% INSERT gpu/util_functions %] + +[% INSERT gpu/install_functions %] + +[% INCLUDE gpu/yarn_functions %] + +[% INSERT gpu/spark_functions %] + +function main() { + install_gpu_driver_and_cuda + + #Install GPU metrics collection in Stackdriver if needed + if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then + install_gpu_agent +# install_gpu_monitoring_agent + echo 'GPU metrics agent successfully deployed.' + else + echo 'GPU metrics agent has not been installed.' + fi + configure_gpu_exclusive_mode + + setup_gpu_yarn + + echo "yarn setup complete" + + if ( test -v CUDNN_VERSION && [[ -n "${CUDNN_VERSION}" ]] ) ; then + install_nvidia_nccl + install_nvidia_cudnn + fi + + if [[ "${RAPIDS_RUNTIME}" == "SPARK" ]]; then + install_spark_rapids + configure_gpu_script + echo "RAPIDS initialized with Spark runtime" + elif [[ "${RAPIDS_RUNTIME}" == "DASK" ]]; then + # we are not currently tooled for installing dask in this action. + echo "RAPIDS recognizes DASK runtime - currently supported using dask/dask.sh or rapids/rapids.sh" + else + echo "Unrecognized RAPIDS Runtime: ${RAPIDS_RUNTIME}" + fi + + echo "main complete" + return 0 +} + +function exit_handler() { + set +e + gpu_install_exit_handler + gpu_exit_handler + pip_exit_handler + yarn_exit_handler + common_exit_handler + return 0 +} + +function prepare_to_install(){ + prepare_spark_env + prepare_common_env + prepare_pip_env + prepare_gpu_env + prepare_gpu_install_env + trap exit_handler EXIT +} + +prepare_to_install + +main