populate descriptions and add default spark values
Signed-off-by: Ahmed Hussein (amahussein) <[email protected]>
amahussein committed Jan 30, 2025
1 parent cda52a2 commit 4b095d5
Showing 4 changed files with 50 additions and 109 deletions.
68 changes: 40 additions & 28 deletions core/src/main/resources/bootstrap/tuningTable.yaml
@@ -13,33 +13,38 @@
 # limitations under the License.
 
 tuningDefinitions:
+  - label: spark.databricks.adaptive.autoOptimizeShuffle.enabled
+    description: 'Auto-optimized shuffle. It is recommended to turn it off so that (spark.sql.shuffle.partitions) can be set manually.'
+    enabled: true
+    level: job
+    category: tuning
   - label: spark.executor.cores
-    description: ''
+    description: 'The number of cores to use on each executor. It is recommended to be set to 16.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.executor.instances
-    description: ''
+    description: 'Controls the parallelism level. It is recommended to be set to (cpuCoresPerNode * numWorkers) / spark.executor.cores.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.executor.memory
-    description: ''
+    description: 'Amount of memory to use per executor process. This is tuned based on the available CPU memory on the worker node.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.executor.memoryOverhead
-    description: ''
+    description: 'Amount of additional memory to be allocated per executor process, in MiB unless otherwise specified. This is memory that accounts for things like VM overheads, interned strings, and other native overheads. It tends to grow with the executor size.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.executor.memoryOverheadFactor
-    description: ''
+    description: 'Fraction of executor memory to be allocated as additional non-heap memory per executor process. This is memory that accounts for things like VM overheads, interned strings, and other native overheads. It tends to grow with the container size.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.kubernetes.memoryOverheadFactor
-    description: ''
+    description: 'Specific to Kubernetes. Fraction of executor memory to be allocated as additional non-heap memory per executor process.'
     enabled: true
     level: cluster
     category: tuning
@@ -48,22 +53,24 @@ tuningDefinitions:
     enabled: true
     level: cluster
     category: tuning
-    defaultSpark: 3s
+    defaultSpark: "3s"
   - label: spark.rapids.filecache.enabled
-    description: ''
+    description: 'Enables the RAPIDS file cache. The file cache stores data locally in the same local directories that have been configured for the Spark executor.'
     enabled: true
     level: job
     category: tuning
   - label: spark.rapids.memory.pinnedPool.size
-    description: ''
+    description: 'The size of the pinned memory pool, in bytes unless otherwise specified. Use 0 to disable the pool.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.rapids.shuffle.multiThreaded.maxBytesInFlight
-    description: ''
+    description: 'Controls the number of bytes allowed in flight per Spark task. This typically applies on the reader side: when blocks are received from the network, they are queued onto these threads for decompression and decode.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.rapids.shuffle.multiThreaded.reader.threads
-    description: ''
+    description: 'The shuffle reader is a single implementation irrespective of the number of partitions. Set the value to zero to turn off the multi-threaded reader entirely.'
     enabled: true
     level: cluster
     category: tuning
@@ -73,86 +80,91 @@ tuningDefinitions:
     level: cluster
     category: tuning
   - label: spark.rapids.sql.batchSizeBytes
-    description: ''
+    description: 'Set the target number of bytes for a GPU batch. Split sizes for input data are covered by separate configs.'
     enabled: true
     level: job
     category: tuning
   - label: spark.rapids.sql.concurrentGpuTasks
-    description: ''
+    description: 'Set the number of tasks that can execute concurrently per GPU. Tasks may temporarily block when the number of concurrent tasks in the executor exceeds this amount. Allowing too many concurrent tasks on the same GPU may lead to GPU out-of-memory errors.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.rapids.sql.format.parquet.multithreaded.combine.waitTime
-    description: ''
+    description: 'When using the multithreaded parquet reader with combine mode, how long to wait, in milliseconds, for more files to finish if the size threshold has not been met. Note that this waits from when the last file became available, so the total wait time could be larger than this. DEPRECATED: use spark.rapids.sql.reader.multithreaded.combine.waitTime instead.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.rapids.sql.enabled
-    description: 'should be true to enable SQL operations on the GPU.'
+    description: 'Should be true to enable SQL operations on the GPU.'
     enabled: true
     level: cluster
     category: functionality
   - label: spark.rapids.sql.multiThreadedRead.numThreads
-    description: ''
+    description: 'The maximum number of threads on each executor to use for reading small files in parallel.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.rapids.sql.reader.multithreaded.combine.sizeBytes
-    description: ''
+    description: 'The target size in bytes to combine multiple small files together when using the MULTITHREADED parquet or orc reader. With combine disabled, the MULTITHREADED reader reads the files in parallel and sends individual files down to the GPU, but that can be inefficient for small files.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.shuffle.manager
-    description: ''
+    description: 'The RAPIDS Shuffle Manager is an implementation of the ShuffleManager interface in Apache Spark that allows custom mechanisms to exchange shuffle data. We currently expose two modes of operation: Multi Threaded and UCX.'
     enabled: true
     level: cluster
     category: tuning
   - label: spark.sql.adaptive.enabled
-    description: ''
+    description: 'When true, enable adaptive query execution, which re-optimizes the query plan in the middle of query execution, based on accurate runtime statistics.'
     enabled: true
     level: job
     category: tuning
+    defaultSpark: "true"
   - label: spark.sql.adaptive.advisoryPartitionSizeInBytes
-    description: ''
+    description: 'The advisory size in bytes of the shuffle partition during adaptive optimization (when spark.sql.adaptive.enabled is true). It takes effect when Spark coalesces small shuffle partitions or splits skewed shuffle partitions.'
     enabled: true
     level: job
     category: tuning
   - label: spark.sql.adaptive.coalescePartitions.initialPartitionNum
-    description: ''
+    description: 'The initial number of shuffle partitions before coalescing. If not set, it equals spark.sql.shuffle.partitions.'
     enabled: true
     level: job
     category: tuning
   - label: spark.sql.adaptive.coalescePartitions.minPartitionNum
-    description: ''
+    description: '(deprecated) The suggested (not guaranteed) minimum number of shuffle partitions after coalescing. If not set, the default value is the default parallelism of the Spark cluster.'
     enabled: true
     level: job
     category: tuning
   - label: spark.sql.adaptive.coalescePartitions.minPartitionSize
-    description: ''
+    description: 'The minimum size of shuffle partitions after coalescing. This is useful when the adaptively calculated target size is too small during partition coalescing.'
     enabled: true
     level: job
     category: tuning
+    defaultSpark: "1m"
   - label: spark.sql.adaptive.coalescePartitions.parallelismFirst
-    description: ''
+    description: 'When true, Spark does not respect the target size specified by (spark.sql.adaptive.advisoryPartitionSizeInBytes) (default 64MB) when coalescing contiguous shuffle partitions, but adaptively calculates the target size according to the default parallelism of the Spark cluster.'
     enabled: true
     level: job
     category: tuning
+    defaultSpark: "true"
   - label: spark.sql.adaptive.autoBroadcastJoinThreshold
-    description: ''
+    description: 'Configures the maximum size in bytes for a table that will be broadcast to all worker nodes when performing a join. Broadcasting can be disabled by setting this value to -1.'
     enabled: true
     level: job
     category: tuning
   - label: spark.sql.files.maxPartitionBytes
-    description: ''
+    description: 'The maximum number of bytes to pack into a single partition when reading files. This configuration is effective only when using file-based sources such as Parquet, JSON and ORC.'
     enabled: true
     level: job
     category: tuning
   - label: spark.sql.shuffle.partitions
-    description: ''
+    description: 'The default number of partitions to use when shuffling data for joins or aggregations. Note: for structured streaming, this configuration cannot be changed between query restarts from the same checkpoint location.'
     enabled: true
     level: job
     category: tuning
+    defaultSpark: "200"
   - label: spark.task.resource.gpu.amount
-    description: ''
+    description: 'The GPU resource amount per task when Apache Spark schedules GPU resources. For example, setting the value to 1 means that only one task will run concurrently per executor.'
     enabled: true
     level: cluster
     category: tuning
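
Note: the executor sizing entries above encode arithmetic rather than fixed values. As a worked example of the spark.executor.instances formula (cpuCoresPerNode * numWorkers) / spark.executor.cores, here is a sketch with hypothetical cluster numbers; the helper is illustrative and not code from this repository.

// Illustrative sketch of the executor-instances arithmetic described in
// tuningTable.yaml. The cluster figures are hypothetical examples.
object ExecutorSizingSketch {
  def recommendedInstances(cpuCoresPerNode: Int, numWorkers: Int, executorCores: Int): Int =
    (cpuCoresPerNode * numWorkers) / executorCores

  def main(args: Array[String]): Unit = {
    // A hypothetical 4-worker cluster with 32 cores per node, using the
    // recommended 16 cores per executor: (32 * 4) / 16 = 8 instances.
    println(recommendedInstances(cpuCoresPerNode = 32, numWorkers = 4, executorCores = 16))
  }
}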
@@ -719,6 +719,7 @@ class AutoTuner(
   }
 
   private def recommendAQEProperties(): Unit = {
+    // Spark configuration (AQE is enabled by default)
     val aqeEnabled = getPropertyValue("spark.sql.adaptive.enabled")
       .getOrElse("false").toLowerCase
     if (aqeEnabled == "false") {
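Note: the defaultSpark values added in tuningTable.yaml appear intended to record the Spark-side default that applies when an application leaves a property unset; the getOrElse("false") above inlines the same fallback idea. A minimal sketch of such a lookup follows, using hypothetical stand-in types rather than the AutoTuner's actual API.

// Minimal sketch of resolving a property against its Spark default.
// TuningDefinition and effectiveValue are hypothetical stand-ins, not the
// repository's actual types; they only illustrate the fallback pattern.
case class TuningDefinition(label: String, defaultSpark: Option[String])

object DefaultLookupSketch {
  def effectiveValue(appConf: Map[String, String], definition: TuningDefinition): Option[String] =
    // Prefer the application's explicit setting, then Spark's documented default.
    appConf.get(definition.label).orElse(definition.defaultSpark)

  def main(args: Array[String]): Unit = {
    val shufflePartitions = TuningDefinition("spark.sql.shuffle.partitions", Some("200"))
    // The app never set the property, so the Spark default "200" is returned.
    println(effectiveValue(Map("spark.executor.cores" -> "16"), shufflePartitions))
  }
}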
