populate descriptions and add default spark values
Signed-off-by: Ahmed Hussein (amahussein) <[email protected]>
amahussein committed Jan 30, 2025
1 parent cda52a2 commit 4b095d5
Showing 4 changed files with 50 additions and 109 deletions.
68 changes: 40 additions & 28 deletions core/src/main/resources/bootstrap/tuningTable.yaml
@@ -13,33 +13,38 @@
 # limitations under the License.
 
 tuningDefinitions:
+  - label: spark.databricks.adaptive.autoOptimizeShuffle.enabled
+    description: 'Auto-optimized shuffle. It is recommended to turn it off so that (spark.sql.shuffle.partitions) can be set manually.'
+    enabled: true
+    level: job
+    category: tuning
   - label: spark.executor.cores
-    description: ''
+    description: 'The number of cores to use on each executor. It is recommended to be set to 16.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.executor.instances
-    description: ''
+    description: 'Controls the parallelism level. It is recommended to be set to (cpuCoresPerNode * numWorkers) / spark.executor.cores.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.executor.memory
-    description: ''
+    description: 'Amount of memory to use per executor process. This is tuned based on the available CPU memory on the worker node.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.executor.memoryOverhead
-    description: ''
+    description: 'Amount of additional memory to be allocated per executor process, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, and other native overheads. It tends to grow with the executor size.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.executor.memoryOverheadFactor
-    description: ''
+    description: 'Fraction of executor memory to be allocated as additional non-heap memory per executor process. This is memory that accounts for things like VM overheads, interned strings, and other native overheads. It tends to grow with the container size.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.kubernetes.memoryOverheadFactor
-    description: ''
+    description: 'Specific to Kubernetes. Fraction of executor memory to be allocated as additional non-heap memory per executor process.'
     enabled: true
     level: cluster
     category: tuning
@@ -48,22 +53,24 @@ tuningDefinitions:
     enabled: true
     level: cluster
     category: tuning
-    defaultSpark: 3s
+    defaultSpark: "3s"
   - label: spark.rapids.filecache.enabled
-    description: ''
+    description: 'Enables the RAPIDS file cache. The file cache stores data locally in the same local directories that have been configured for the Spark executor.'
     enabled: true
     level: job
     category: tuning
   - label: spark.rapids.memory.pinnedPool.size
-    description: ''
+    description: 'The size of the pinned memory pool, in bytes unless otherwise specified. Use 0 to disable the pool.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.rapids.shuffle.multiThreaded.maxBytesInFlight
-    description: ''
+    description: 'Controls the number of bytes allowed in flight per Spark task. This typically applies on the reader side: when blocks are received from the network, they are queued onto these threads for decompression and decode.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.rapids.shuffle.multiThreaded.reader.threads
-    description: ''
+    description: 'The shuffle reader is a single implementation irrespective of the number of partitions. Set the value to zero to turn off the multi-threaded reader entirely.'
     enabled: true
     level: cluster
     category: tuning
@@ -73,86 +80,91 @@ tuningDefinitions:
     level: cluster
     category: tuning
   - label: spark.rapids.sql.batchSizeBytes
-    description: ''
+    description: 'Set the target number of bytes for a GPU batch. Split sizes for input data are covered by separate configs.'
     enabled: true
     level: job
     category: tuning
   - label: spark.rapids.sql.concurrentGpuTasks
-    description: ''
+    description: 'Set the number of tasks that can execute concurrently per GPU. Tasks may temporarily block when the number of concurrent tasks in the executor exceeds this amount. Allowing too many concurrent tasks on the same GPU may lead to GPU out-of-memory errors.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.rapids.sql.format.parquet.multithreaded.combine.waitTime
-    description: ''
+    description: 'When using the multithreaded parquet reader with combine mode, how long to wait, in milliseconds, for more files to finish if the size threshold has not been met. Note that this waits from when the last file became available, so the total wait time could be larger than this. DEPRECATED: use spark.rapids.sql.reader.multithreaded.combine.waitTime instead.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.rapids.sql.enabled
-    description: 'should be true to enable SQL operations on the GPU.'
+    description: 'Should be true to enable SQL operations on the GPU.'
     enabled: true
     level: cluster
     category: functionality
   - label: spark.rapids.sql.multiThreadedRead.numThreads
-    description: ''
+    description: 'The maximum number of threads on each executor to use for reading small files in parallel.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.rapids.sql.reader.multithreaded.combine.sizeBytes
-    description: ''
+    description: 'The target size in bytes to combine multiple small files together when using the MULTITHREADED parquet or orc reader. With combine disabled, the MULTITHREADED reader reads the files in parallel and sends individual files down to the GPU, but that can be inefficient for small files.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.shuffle.manager
-    description: ''
+    description: 'The RAPIDS Shuffle Manager is an implementation of the ShuffleManager interface in Apache Spark that allows custom mechanisms to exchange shuffle data. We currently expose two modes of operation: Multi Threaded and UCX.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.sql.adaptive.enabled
-    description: ''
+    description: 'When true, enable adaptive query execution, which re-optimizes the query plan in the middle of query execution, based on accurate runtime statistics.'
     enabled: true
     level: job
     category: tuning
+    defaultSpark: "true"
   - label: spark.sql.adaptive.advisoryPartitionSizeInBytes
-    description: ''
+    description: 'The advisory size in bytes of the shuffle partition during adaptive optimization (when spark.sql.adaptive.enabled is true). It takes effect when Spark coalesces small shuffle partitions or splits skewed shuffle partitions.'
     enabled: true
     level: job
     category: tuning
   - label: spark.sql.adaptive.coalescePartitions.initialPartitionNum
-    description: ''
+    description: 'The initial number of shuffle partitions before coalescing. If not set, it equals spark.sql.shuffle.partitions.'
     enabled: true
     level: job
     category: tuning
   - label: spark.sql.adaptive.coalescePartitions.minPartitionNum
-    description: ''
+    description: '(deprecated) The suggested (not guaranteed) minimum number of shuffle partitions after coalescing. If not set, the default value is the default parallelism of the Spark cluster.'
     enabled: true
     level: job
     category: tuning
   - label: spark.sql.adaptive.coalescePartitions.minPartitionSize
-    description: ''
+    description: 'The minimum size of shuffle partitions after coalescing. This is useful when the adaptively calculated target size is too small during partition coalescing.'
     enabled: true
     level: job
     category: tuning
+    defaultSpark: "1m"
   - label: spark.sql.adaptive.coalescePartitions.parallelismFirst
-    description: ''
+    description: 'When true, Spark does not respect the target size specified by (spark.sql.adaptive.advisoryPartitionSizeInBytes) (default 64MB) when coalescing contiguous shuffle partitions, but adaptively calculates the target size according to the default parallelism of the Spark cluster.'
     enabled: true
     level: job
     category: tuning
+    defaultSpark: "true"
   - label: spark.sql.adaptive.autoBroadcastJoinThreshold
-    description: ''
+    description: 'Configures the maximum size in bytes for a table that will be broadcast to all worker nodes when performing a join. Broadcasting can be disabled by setting this value to -1.'
     enabled: true
     level: job
     category: tuning
   - label: spark.sql.files.maxPartitionBytes
-    description: ''
+    description: 'The maximum number of bytes to pack into a single partition when reading files. This configuration is effective only when using file-based sources such as Parquet, JSON and ORC.'
     enabled: true
     level: job
     category: tuning
   - label: spark.sql.shuffle.partitions
-    description: ''
+    description: 'The default number of partitions to use when shuffling data for joins or aggregations. Note: for structured streaming, this configuration cannot be changed between query restarts from the same checkpoint location.'
     enabled: true
     level: job
     category: tuning
+    defaultSpark: "200"
   - label: spark.task.resource.gpu.amount
-    description: ''
+    description: 'The GPU resource amount per task when Apache Spark schedules GPU resources. For example, setting the value to 1 means that only one task will run concurrently per executor.'
     enabled: true
     level: cluster
     category: tuning
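
Note: the executor sizing entries above encode arithmetic rather than fixed values. As a worked example of the spark.executor.instances formula (cpuCoresPerNode * numWorkers) / spark.executor.cores, here is a sketch with hypothetical cluster numbers; the helper is illustrative and not code from this repository.

// Illustrative sketch of the executor-instances arithmetic described in
// tuningTable.yaml. The cluster figures are hypothetical examples.
object ExecutorSizingSketch {
  def recommendedInstances(cpuCoresPerNode: Int, numWorkers: Int, executorCores: Int): Int =
    (cpuCoresPerNode * numWorkers) / executorCores

  def main(args: Array[String]): Unit = {
    // A hypothetical 4-worker cluster with 32 cores per node, using the
    // recommended 16 cores per executor: (32 * 4) / 16 = 8 instances.
    println(recommendedInstances(cpuCoresPerNode = 32, numWorkers = 4, executorCores = 16))
  }
}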
@@ -719,6 +719,7 @@ class AutoTuner(
   }
 
   private def recommendAQEProperties(): Unit = {
+    // Spark configuration (AQE is enabled by default)
     val aqeEnabled = getPropertyValue("spark.sql.adaptive.enabled")
       .getOrElse("false").toLowerCase
     if (aqeEnabled == "false") {
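Note: the defaultSpark values added in tuningTable.yaml appear intended to record the Spark-side default that applies when an application leaves a property unset; the getOrElse("false") above inlines the same fallback idea. A minimal sketch of such a lookup follows, using hypothetical stand-in types rather than the AutoTuner's actual API.

// Minimal sketch of resolving a property against its Spark default.
// TuningDefinition and effectiveValue are hypothetical stand-ins, not the
// repository's actual types; they only illustrate the fallback pattern.
case class TuningDefinition(label: String, defaultSpark: Option[String])

object DefaultLookupSketch {
  def effectiveValue(appConf: Map[String, String], definition: TuningDefinition): Option[String] =
    // Prefer the application's explicit setting, then Spark's documented default.
    appConf.get(definition.label).orElse(definition.defaultSpark)

  def main(args: Array[String]): Unit = {
    val shufflePartitions = TuningDefinition("spark.sql.shuffle.partitions", Some("200"))
    // The app never set the property, so the Spark default "200" is returned.
    println(effectiveValue(Map("spark.executor.cores" -> "16"), shufflePartitions))
  }
}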
